Libparserutils
|
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <parserutils/charset/mibenum.h>
#include "charset/codecs/codec_impl.h"
#include "charset/encodings/utf8impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
Go to the source code of this file.
Data Structures | |
struct | charset_utf8_codec |
UTF-8 charset codec. More... |
Macros | |
#define | INVAL_BUFSIZE (32) |
#define | READ_BUFSIZE (8) |
#define | WRITE_BUFSIZE (8) |
Typedefs | |
typedef struct charset_utf8_codec | charset_utf8_codec |
UTF-8 charset codec. |
Functions | |
static bool | charset_utf8_codec_handles_charset (const char *charset) |
Determine whether this codec handles a specific charset. | |
static parserutils_error | charset_utf8_codec_create (const char *charset, parserutils_charset_codec **codec) |
Create a UTF-8 codec. | |
static parserutils_error | charset_utf8_codec_destroy (parserutils_charset_codec *codec) |
Destroy a UTF-8 codec. | |
static parserutils_error | charset_utf8_codec_encode (parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen) |
Encode a chunk of UCS-4 (big endian) data into UTF-8. | |
static parserutils_error | charset_utf8_codec_decode (parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen) |
Decode a chunk of UTF-8 data into UCS-4 (big endian) | |
static parserutils_error | charset_utf8_codec_reset (parserutils_charset_codec *codec) |
Clear a UTF-8 codec's encoding state. | |
static parserutils_error | charset_utf8_codec_read_char (charset_utf8_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen) |
Read a character from UTF-8 input and convert it to UCS-4 (big endian) |
static parserutils_error | charset_utf8_codec_output_decoded_char (charset_utf8_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen) |
Output a UCS-4 character (big endian) |
Variables | |
const parserutils_charset_handler | charset_utf8_codec_handler |
#define INVAL_BUFSIZE (32) |
Definition at line 25 of file codec_utf8.c.
#define READ_BUFSIZE (8) |
Definition at line 31 of file codec_utf8.c.
#define WRITE_BUFSIZE (8) |
Definition at line 37 of file codec_utf8.c.
typedef struct charset_utf8_codec charset_utf8_codec |
UTF-8 charset codec.
|
static parserutils_error charset_utf8_codec_create (const char *charset, parserutils_charset_codec **codec)
Create a UTF-8 codec.
charset | The charset to read from / write to |
codec | Pointer to location to receive codec |
Definition at line 91 of file codec_utf8.c.
References charset_utf8_codec::base, charset_utf8_codec_decode(), charset_utf8_codec_destroy(), charset_utf8_codec_encode(), charset_utf8_codec_reset(), parserutils_charset_codec::decode, parserutils_charset_codec::destroy, parserutils_charset_codec::encode, parserutils_charset_codec::handler, charset_utf8_codec::inval_buf, charset_utf8_codec::inval_len, PARSERUTILS_NOMEM, PARSERUTILS_OK, charset_utf8_codec::read_buf, charset_utf8_codec::read_len, parserutils_charset_codec::reset, UNUSED, charset_utf8_codec::write_buf, and charset_utf8_codec::write_len.
|
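static parserutils_error charset_utf8_codec_decode (parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)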
Decode a chunk of UTF-8 data into UCS-4 (big endian)
codec | The codec to use |
source | Pointer to pointer to source data |
sourcelen | Pointer to length (in bytes) of source data |
dest | Pointer to pointer to output buffer |
destlen | Pointer to length (in bytes) of output buffer |
On exit, ::source will point immediately after the last input character read, if the result is _OK or _NOMEM. Any remaining output for the character will be buffered by the codec for writing on the next call.
In the case of the result being _INVALID, ::source will point at the last input character read; nothing will be written or buffered for the failed character. It is up to the client to fix the cause of the failure and retry the decoding process.
Note that, if failure occurs whilst attempting to write any output buffered by the last call, then ::source and ::sourcelen will remain unchanged (as nothing more has been read).
If STRICT error handling is configured and an illegal sequence is split over two calls, then _INVALID will be returned from the second call, but ::source will point mid-way through the invalid sequence (i.e. it will be unmodified over the second call). In addition, the internal incomplete-sequence buffer will be emptied, such that subsequent calls will progress, rather than re-evaluating the same invalid sequence.
::sourcelen will be reduced appropriately on exit.
::dest will point immediately after the last character written.
::destlen will be reduced appropriately on exit.
Call this with a source length of 0 to flush the output buffer.
Definition at line 278 of file codec_utf8.c.
References charset_utf8_codec_read_char(), endian_host_to_big(), charset_utf8_codec::inval_buf, INVAL_BUFSIZE, charset_utf8_codec::inval_len, max, min, PARSERUTILS_NOMEM, PARSERUTILS_OK, charset_utf8_codec::read_buf, and charset_utf8_codec::read_len.
Referenced by charset_utf8_codec_create().
|
static parserutils_error charset_utf8_codec_destroy (parserutils_charset_codec *codec)
Destroy a UTF-8 codec.
codec | The codec to destroy |
Definition at line 128 of file codec_utf8.c.
References PARSERUTILS_OK, and UNUSED.
Referenced by charset_utf8_codec_create().
|
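static parserutils_error charset_utf8_codec_encode (parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)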
Encode a chunk of UCS-4 (big endian) data into UTF-8.
codec | The codec to use |
source | Pointer to pointer to source data |
sourcelen | Pointer to length (in bytes) of source data |
dest | Pointer to pointer to output buffer |
destlen | Pointer to length (in bytes) of output buffer |
On exit, ::source will point immediately after the last input character read. Any remaining output for the character will be buffered by the codec for writing on the next call.
Note that, if failure occurs whilst attempting to write any output buffered by the last call, then ::source and ::sourcelen will remain unchanged (as nothing more has been read).
::sourcelen will be reduced appropriately on exit.
::dest will point immediately after the last character written.
::destlen will be reduced appropriately on exit.
Definition at line 162 of file codec_utf8.c.
References endian_big_to_host(), len, PARSERUTILS_NOMEM, PARSERUTILS_OK, UTF8_FROM_UCS4, charset_utf8_codec::write_buf, WRITE_BUFSIZE, and charset_utf8_codec::write_len.
Referenced by charset_utf8_codec_create().
|
static bool charset_utf8_codec_handles_charset (const char *charset)
Determine whether this codec handles a specific charset.
charset | Charset to test |
Definition at line 74 of file codec_utf8.c.
References parserutils_charset_mibenum_from_name(), and SLEN.
|
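inline static parserutils_error charset_utf8_codec_output_decoded_char (charset_utf8_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)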
Output a UCS-4 character (big endian)
c | Codec to use |
ucs4 | UCS-4 character (host endian) |
dest | Pointer to pointer to output buffer |
destlen | Pointer to output buffer length |
Definition at line 523 of file codec_utf8.c.
References endian_host_to_big(), PARSERUTILS_NOMEM, PARSERUTILS_OK, charset_utf8_codec::read_buf, and charset_utf8_codec::read_len.
Referenced by charset_utf8_codec_read_char().
|
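inline static parserutils_error charset_utf8_codec_read_char (charset_utf8_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)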
Read a character from UTF-8 input and convert it to UCS-4 (big endian)
c | The codec |
source | Pointer to pointer to source buffer (updated on exit) |
sourcelen | Pointer to length of source buffer (updated on exit) |
dest | Pointer to pointer to output buffer (updated on exit) |
destlen | Pointer to length of output buffer (updated on exit) |
On exit, ::source will point immediately after the last input character read, if the result is _OK or _NOMEM. Any remaining output for the character will be buffered by the codec for writing on the next call.
In the case of the result being _INVALID, ::source will point at the last input character read; nothing will be written or buffered for the failed character. It is up to the client to fix the cause of the failure and retry the decoding process.
::sourcelen will be reduced appropriately on exit.
::dest will point immediately after the last character written.
::destlen will be reduced appropriately on exit.
Definition at line 408 of file codec_utf8.c.
References charset_utf8_codec::base, charset_utf8_codec_output_decoded_char(), parserutils_charset_codec::errormode, charset_utf8_codec::inval_buf, INVAL_BUFSIZE, charset_utf8_codec::inval_len, PARSERUTILS_CHARSET_CODEC_ERROR_STRICT, PARSERUTILS_INVALID, PARSERUTILS_NEEDDATA, PARSERUTILS_NOMEM, PARSERUTILS_OK, UTF8_NEXT_PARANOID, and UTF8_TO_UCS4.
Referenced by charset_utf8_codec_decode().
|
static parserutils_error charset_utf8_codec_reset (parserutils_charset_codec *codec)
Clear a UTF-8 codec's encoding state.
codec | The codec to reset |
Definition at line 363 of file codec_utf8.c.
References charset_utf8_codec::inval_buf, charset_utf8_codec::inval_len, PARSERUTILS_OK, charset_utf8_codec::read_buf, charset_utf8_codec::read_len, charset_utf8_codec::write_buf, and charset_utf8_codec::write_len.
Referenced by charset_utf8_codec_create().
const parserutils_charset_handler charset_utf8_codec_handler |
Definition at line 542 of file codec_utf8.c.