Libparserutils
codec_ext8.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
8#include <assert.h>
9#include <stdlib.h>
10#include <string.h>
11
13
15#include "utils/endian.h"
16#include "utils/utils.h"
17
19
/**
 * Table of the charsets this codec knows about.
 *
 * Every entry is initialised with a MIB enum of 0; the values are assigned
 * at runtime by charset_ext8_codec_handles_charset() when it finds that
 * entry 0 is still 0.
 */
20static struct {
21 uint16_t mib; /* MIB enum; 0 in the initialiser, assigned at runtime */
22 const char *name; /* charset name */
23 size_t len; /* length of name, computed with SLEN() */
24 uint32_t *table; /* mapping table for this charset (one of the w125x[128] arrays) */
25} known_charsets[] = {
26 { 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
27 { 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
28 { 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
29 { 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
30 { 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
31 { 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
32 { 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
33 { 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
34 { 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
35};
36
/**
 * Windows charset codec.
 *
 * NOTE(review): this listing omits several source lines of the typedef.
 * The rest of the file uses c->base, c->read_buf[] and c->write_buf[], so
 * the members `base`, `read_buf[READ_BUFSIZE]` and `write_buf[WRITE_BUFSIZE]`
 * (and the closing line of the typedef) must be restored from the full source.
 */
40typedef struct charset_ext8_codec {
42
43 uint32_t *table; /* Mapping table for 0x80-0xFF */
44
/* Capacity, in characters, of the decode-side pending buffer */
45#define READ_BUFSIZE (8)
49 size_t read_len; /* Character length of read_buf */
50
/* Capacity, in characters, of the encode-side pending buffer */
51#define WRITE_BUFSIZE (8)
55 size_t write_len; /* Character length of write_buf */
56
58
/* Forward declarations for this file's functions.
 * NOTE(review): most declarator lines are absent from this listing, leaving
 * only parameter-list continuations; restore them from the full source. */
59static bool charset_ext8_codec_handles_charset(const char *charset);
60static parserutils_error charset_ext8_codec_create(const char *charset,
66 const uint8_t **source, size_t *sourcelen,
67 uint8_t **dest, size_t *destlen);
70 const uint8_t **source, size_t *sourcelen,
71 uint8_t **dest, size_t *destlen);
76 const uint8_t **source, size_t *sourcelen,
77 uint8_t **dest, size_t *destlen);
80 uint32_t ucs4, uint8_t **dest, size_t *destlen);
82 uint32_t ucs4, uint8_t **s, size_t *len);
84 const uint8_t *s, size_t len, uint32_t *ucs4);
85
/**
 * Determine whether this codec handles a specific charset.
 *
 * \param charset  Charset name; must be NUL-terminated (strlen() is applied)
 * \return true if the MIB enum looked up for charset equals the mib of any
 *         known_charsets entry, false otherwise
 */
92bool charset_ext8_codec_handles_charset(const char *charset)
93{
94 uint32_t i;
95 uint16_t match = parserutils_charset_mibenum_from_name(charset,
96 strlen(charset));
97
 /* The table's mib fields start out as 0; fill them in on first use.
  * Entry 0 doubles as the "already initialised" flag. */
98 if (known_charsets[0].mib == 0) {
99 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
 /* NOTE(review): source lines 101-103 (the right-hand side of this
  * assignment) are missing from this listing; presumably a MIB
  * lookup using the entry's name and len -- confirm. */
100 known_charsets[i].mib =
104 }
105 }
106
 /* NOTE(review): if the MIB lookup can yield 0 for an unrecognised name,
  * a match of 0 would compare equal to any entry whose mib is still 0
  * and report the charset as handled -- confirm the lookup's failure
  * value. */
107 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
108 if (known_charsets[i].mib == match)
109 return true;
110 }
111
112 return false;
113}
114
/**
 * Create an extended 8bit codec.
 *
 * Looks up the mapping table whose known_charsets entry matches the MIB enum
 * computed into `match`, allocates a codec with malloc(), clears its pending
 * read/write state and stores it through *codec.
 *
 * \return PARSERUTILS_OK on success, PARSERUTILS_NOMEM if malloc() fails
 *
 * NOTE(review): the signature line and source lines 128-129 (which declare
 * `c` and begin the initialiser of `match`) are missing from this listing.
 */
126{
127 uint32_t i;
130 charset, strlen(charset));
131 uint32_t *table = NULL;
132
133 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
134 if (known_charsets[i].mib == match) {
135 table = known_charsets[i].table;
136 break;
137 }
138 }
139
 /* An unmatched charset is only caught by this assert; when assertions
  * are compiled out (NDEBUG) a NULL table would be stored in the codec. */
140 assert(table != NULL);
141
142 c = malloc(sizeof(charset_ext8_codec));
143 if (c == NULL)
144 return PARSERUTILS_NOMEM;
145
146 c->table = table;
147
 /* Start with no buffered decode output ... */
148 c->read_buf[0] = 0;
149 c->read_len = 0;
150
 /* ... and no buffered encode input */
151 c->write_buf[0] = 0;
152 c->write_len = 0;
153
 /* NOTE(review): the vtable assignments (source lines 155-158) are
  * missing from this listing. */
154 /* Finally, populate vtable */
159
160 *codec = (parserutils_charset_codec *) c;
161
162 return PARSERUTILS_OK;
163}
164
177
/**
 * Encode a chunk of UCS-4 (big endian) data into extended 8bit.
 *
 * \param source     Pointer to pointer to input; advanced by 4 per character
 *                   consumed
 * \param sourcelen  Pointer to remaining input length in bytes; reduced by 4
 *                   per character consumed
 * \param dest       Pointer to pointer to output buffer; advanced on output
 * \param destlen    Pointer to remaining output space; reduced on output
 * \return PARSERUTILS_OK when all input was consumed,
 *         PARSERUTILS_NOMEM when output space ran out (the character that
 *         did not fit is saved in write_buf and counted as consumed),
 *         or any other error from charset_ext8_from_ucs4(), in which case
 *         the offending character is not consumed
 *
 * NOTE(review): the first signature line and source line 209 (which declares
 * `c`) are missing from this listing.
 */
206 const uint8_t **source, size_t *sourcelen,
207 uint8_t **dest, size_t *destlen)
208{
210 uint32_t ucs4;
211 uint32_t *towrite;
212 size_t towritelen;
213 parserutils_error error;
214
215 /* Process any outstanding characters from the previous call */
216 if (c->write_len > 0) {
217 uint32_t *pwrite = c->write_buf;
218
219 while (c->write_len > 0) {
220 error = charset_ext8_from_ucs4(c, pwrite[0],
221 dest, destlen);
222 if (error != PARSERUTILS_OK) {
223 uint32_t len;
 /* NOTE(review): charset_ext8_from_ucs4() checks for
  * output space before representability, so a buffered
  * character may still yield PARSERUTILS_INVALID in
  * strict mode and trip this assert -- confirm. */
224 assert(error == PARSERUTILS_NOMEM);
225
 /* Move the still-pending characters to the front of
  * write_buf for the next call */
226 for (len = 0; len < c->write_len; len++) {
227 c->write_buf[len] = pwrite[len];
228 }
229
230 return error;
231 }
232
233 pwrite++;
234 c->write_len--;
235 }
236 }
237
238 /* Now process the characters for this call */
 /* NOTE(review): each iteration reads 4 bytes through a uint32_t cast
  * and subtracts 4 from *sourcelen; this assumes *sourcelen is a
  * multiple of 4 (otherwise the unsigned subtraction wraps) and that
  * *source is suitably aligned for uint32_t -- confirm with callers. */
239 while (*sourcelen > 0) {
240 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
241 towrite = &ucs4;
242 towritelen = 1;
243
244 /* Output current characters */
245 while (towritelen > 0) {
246 error = charset_ext8_from_ucs4(c, towrite[0], dest,
247 destlen);
248 if (error != PARSERUTILS_OK) {
249 uint32_t len;
 /* Anything other than "out of space" is returned
  * without consuming the character */
250 if (error != PARSERUTILS_NOMEM) {
251 return error;
252 }
253
254 /* Insufficient output space */
255 assert(towritelen < WRITE_BUFSIZE);
256
257 c->write_len = towritelen;
258
259 /* Copy pending chars to save area, for
260 * processing next call. */
261 for (len = 0; len < towritelen; len++)
262 c->write_buf[len] = towrite[len];
263
264 /* Claim character we've just buffered,
265 * so it's not reprocessed */
266 *source += 4;
267 *sourcelen -= 4;
268
269 return PARSERUTILS_NOMEM;
270 }
271
272 towrite++;
273 towritelen--;
274 }
275
 /* Character fully written; consume it from the input */
276 *source += 4;
277 *sourcelen -= 4;
278 }
279
280 return PARSERUTILS_OK;
281}
282
/**
 * Decode a chunk of extended 8bit data into UCS-4 (big endian).
 *
 * Output left in read_buf by an earlier call is written first; the input is
 * then decoded one character at a time until it is exhausted or an error
 * is returned.
 *
 * \param source     Pointer to pointer to input
 * \param sourcelen  Pointer to remaining input length in bytes
 * \param dest       Pointer to pointer to output buffer; advanced by 4 per
 *                   character written
 * \param destlen    Pointer to remaining output space in bytes
 * \return PARSERUTILS_OK when all input was consumed,
 *         PARSERUTILS_NOMEM if buffered output does not fit in dest,
 *         or the first non-OK result of the per-character read
 *
 * NOTE(review): the first signature line and source line 328 (which declares
 * `c`) are missing from this listing.
 */
325 const uint8_t **source, size_t *sourcelen,
326 uint8_t **dest, size_t *destlen)
327{
329 parserutils_error error;
330
331 if (c->read_len > 0) {
332 /* Output left over from last decode */
333 uint32_t *pread = c->read_buf;
334
 /* Only write while everything still buffered fits in dest */
335 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
 /* read_buf holds host-endian values; convert on output */
336 *((uint32_t *) (void *) *dest) =
337 endian_host_to_big(pread[0]);
338
339 *dest += 4;
340 *destlen -= 4;
341
342 pread++;
343 c->read_len--;
344 }
345
346 if (*destlen < c->read_len * 4) {
347 /* Ran out of output buffer */
348 size_t i;
349
350 /* Shuffle remaining output down */
351 for (i = 0; i < c->read_len; i++)
352 c->read_buf[i] = pread[i];
353
354 return PARSERUTILS_NOMEM;
355 }
356 }
357
358 /* Finally, the "normal" case; process all outstanding characters */
359 while (*sourcelen > 0) {
 /* NOTE(review): source line 360 (the start of this call, assigning
  * `error`) is missing from this listing; presumably a call to
  * charset_ext8_codec_read_char(c, ...) -- confirm. */
361 source, sourcelen, dest, destlen);
362 if (error != PARSERUTILS_OK) {
363 return error;
364 }
365 }
366
367 return PARSERUTILS_OK;
368}
369
/**
 * Clear an extended 8bit codec's encoding state.
 *
 * Discards anything pending in the read and write buffers by zeroing
 * their lengths (and first elements).
 *
 * \return PARSERUTILS_OK always
 *
 * NOTE(review): the signature line and source line 378 (which declares `c`)
 * are missing from this listing.
 */
377{
379
380 c->read_buf[0] = 0;
381 c->read_len = 0;
382
383 c->write_buf[0] = 0;
384 c->write_len = 0;
385
386 return PARSERUTILS_OK;
387}
388
389
/**
 * Read a character from the extended 8bit to UCS-4 (big endian).
 *
 * Converts the byte at *source and emits it to dest. An invalid byte is
 * either reported (strict error mode) or replaced with U+FFFD.
 *
 * \param source     Pointer to pointer to input; advanced by 1 when the
 *                   byte is consumed
 * \param sourcelen  Pointer to remaining input length; reduced by 1 when
 *                   the byte is consumed
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to remaining output space
 * \return PARSERUTILS_OK on success, PARSERUTILS_NOMEM if the output did
 *         not fit, PARSERUTILS_NEEDDATA or PARSERUTILS_INVALID as reported
 *         by charset_ext8_to_ucs4()
 *
 * NOTE(review): the first signature line is missing from this listing.
 */
419 const uint8_t **source, size_t *sourcelen,
420 uint8_t **dest, size_t *destlen)
421{
422 uint32_t ucs4;
423 parserutils_error error;
424
425 /* Convert a single character */
426 error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
427 if (error == PARSERUTILS_OK) {
428 /* Read a character */
 /* NOTE(review): source line 429 (the start of this call) is missing
  * from this listing; presumably
  * charset_ext8_codec_output_decoded_char(c, ...) -- confirm. */
430 ucs4, dest, destlen);
 /* NOMEM also consumes the byte: charset_ext8_codec_output_decoded_char()
  * saves the character in read_buf when dest is full, so it must not
  * be decoded again. */
431 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
432 /* output succeeded; update source pointers */
433 *source += 1;
434 *sourcelen -= 1;
435 }
436
437 return error;
438 } else if (error == PARSERUTILS_NEEDDATA) {
439 /* Can only happen if sourcelen == 0 */
440 return error;
441 } else if (error == PARSERUTILS_INVALID) {
442 /* Illegal input sequence */
443
444 /* Strict errormode; simply flag invalid character */
 /* NOTE(review): source line 446 (the right-hand side of this
  * comparison and the opening brace) is missing from this listing;
  * presumably PARSERUTILS_CHARSET_CODEC_ERROR_STRICT -- confirm. */
445 if (c->base.errormode ==
447 return PARSERUTILS_INVALID;
448 }
449
450 /* output U+FFFD and continue processing. */
 /* NOTE(review): source line 451 (the start of this call) is missing
  * from this listing. */
452 0xFFFD, dest, destlen);
453 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
454 /* output succeeded; update source pointers */
455 *source += 1;
456 *sourcelen -= 1;
457 }
458
459 return error;
460 }
461
462 return PARSERUTILS_OK;
463}
464
/**
 * Output a UCS-4 character (big endian).
 *
 * \param ucs4     Character to output (host endian; converted on write)
 * \param dest     Pointer to pointer to output buffer; advanced by 4 on
 *                 success
 * \param destlen  Pointer to remaining output space; reduced by 4 on success
 * \return PARSERUTILS_OK on success, PARSERUTILS_NOMEM if fewer than 4 bytes
 *         of output space remain (the character is then saved in read_buf)
 *
 * NOTE(review): the first signature line is missing from this listing.
 */
476 uint32_t ucs4, uint8_t **dest, size_t *destlen)
477{
478 if (*destlen < 4) {
479 /* Run out of output buffer */
 /* Save the character so the next decode call can emit it first */
480 c->read_len = 1;
481 c->read_buf[0] = ucs4;
482
483 return PARSERUTILS_NOMEM;
484 }
485
 /* NOTE(review): storing through a uint32_t cast assumes *dest is
  * suitably aligned for uint32_t -- confirm with callers. */
486 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
487 *dest += 4;
488 *destlen -= 4;
489
490 return PARSERUTILS_OK;
491}
492
/**
 * Convert a UCS4 (host endian) character to extended 8bit.
 *
 * Values below 0x80 are emitted unchanged; anything else is searched for in
 * the codec's 128-entry mapping table and emitted as 0x80 + index.
 *
 * \param ucs4  Character to convert
 * \param s     Pointer to pointer to output buffer; advanced by 1 on success
 * \param len   Pointer to remaining output space; reduced by 1 on success
 * \return PARSERUTILS_OK on success, PARSERUTILS_NOMEM if no output space
 *         remains, PARSERUTILS_INVALID if the character is not in the table
 *         and the first branch of the errormode test below is taken
 *
 * NOTE(review): the first signature line is missing from this listing.
 */
510 uint32_t ucs4, uint8_t **s, size_t *len)
511{
512 uint8_t out = 0;
513
 /* Space is checked before representability, so NOMEM can be returned
  * for a character that would otherwise be reported as INVALID */
514 if (*len < 1)
515 return PARSERUTILS_NOMEM;
516
517 if (ucs4 < 0x80) {
518 /* ASCII */
519 out = ucs4;
520 } else {
521 uint32_t i;
522
 /* Linear reverse lookup over the 0x80-0xFF mapping table.
  * NOTE(review): charset_ext8_to_ucs4() treats a table value of
  * 0xFFFF as "unmapped"; an input of U+FFFF would match such a
  * slot here and be emitted as that byte -- confirm and guard. */
523 for (i = 0; i < 128; i++) {
524 if (ucs4 == c->table[i])
525 break;
526 }
527
528 if (i == 128) {
 /* Not representable: fail or substitute '?' depending on the
  * error mode.
  * NOTE(review): source line 530 (the right-hand side of this
  * comparison) is missing from this listing; presumably
  * PARSERUTILS_CHARSET_CODEC_ERROR_STRICT -- confirm. */
529 if (c->base.errormode ==
531 return PARSERUTILS_INVALID;
532 else
533 out = '?';
534 } else {
535 out = 0x80 + i;
536 }
537 }
538
539 *(*s) = out;
540 (*s)++;
541 (*len)--;
542
543 return PARSERUTILS_OK;
544}
545
/**
 * Convert an extended 8bit character to UCS4 (host endian).
 *
 * \param s     Input bytes; only the first byte is examined
 * \param len   Number of input bytes available
 * \param ucs4  Receives the converted character on success
 * \return PARSERUTILS_OK on success, PARSERUTILS_INVALID if the byte's
 *         table entry is 0xFFFF
 *
 * NOTE(review): the first signature line is missing from this listing.
 */
558 const uint8_t *s, size_t len, uint32_t *ucs4)
559{
560 uint32_t out;
561
 /* NOTE(review): source line 563 (the statement controlled by this `if`)
  * is missing from this listing; presumably it returns
  * PARSERUTILS_NEEDDATA, which the caller handles -- confirm. */
562 if (len < 1)
564
565 if (*s < 0x80) {
 /* Bytes below 0x80 map to themselves */
566 out = *s;
567 } else {
 /* 0xFFFF in the table marks a byte with no mapping */
568 if (c->table[*s - 0x80] == 0xFFFF)
569 return PARSERUTILS_INVALID;
570
571 out = c->table[*s - 0x80];
572 }
573
574 *ucs4 = out;
575
576 return PARSERUTILS_OK;
577}
578
583
parserutils_charset_handler charset_ext8_codec_handler
Definition codec_ext8.c:579
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition codec.h:64
uint32_t * table
Definition codec_8859.c:24
size_t len
Definition codec_8859.c:23
uint16_t mib
Definition codec_8859.c:21
const char * name
Definition codec_8859.c:22
static struct @253173311160310370314263334127070070016153225302 known_charsets[]
#define WRITE_BUFSIZE
Definition codec_8859.c:57
static parserutils_error charset_ext8_codec_destroy(parserutils_charset_codec *codec)
Destroy an extended 8bit codec.
Definition codec_ext8.c:171
static parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
Convert a UCS4 (host endian) character to extended 8bit.
Definition codec_ext8.c:509
static parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
Clear an extended 8bit codec's encoding state.
Definition codec_ext8.c:376
#define READ_BUFSIZE
Definition codec_ext8.c:45
static parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition codec_ext8.c:475
static parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from the extended 8bit to UCS-4 (big endian)
Definition codec_ext8.c:418
static parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of extended 8bit data into UCS-4 (big endian)
Definition codec_ext8.c:324
static parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into extended 8bit.
Definition codec_ext8.c:205
static parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
Convert an extended 8bit character to UCS4 (host endian)
Definition codec_ext8.c:557
#define WRITE_BUFSIZE
Definition codec_ext8.c:51
static bool charset_ext8_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition codec_ext8.c:92
static parserutils_error charset_ext8_codec_create(const char *charset, parserutils_charset_codec **codec)
Create an extended 8bit codec.
Definition codec_ext8.c:124
static uint32_t endian_host_to_big(uint32_t host)
Definition endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition endian.h:32
parserutils_error
Definition errors.h:18
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_NEEDDATA
Definition errors.h:25
@ PARSERUTILS_INVALID
Definition errors.h:23
@ PARSERUTILS_NOMEM
Definition errors.h:21
static uint32_t w1257[128]
static uint32_t w1254[128]
Definition ext8_tables.h:92
static uint32_t w1250[128]
Definition ext8_tables.h:16
static uint32_t w1252[128]
Definition ext8_tables.h:54
static uint32_t w1251[128]
Definition ext8_tables.h:35
static uint32_t w1258[128]
static uint32_t w1255[128]
static uint32_t w1253[128]
Definition ext8_tables.h:73
static uint32_t w1256[128]
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition aliases.c:107
Windows charset codec.
Definition codec_ext8.c:40
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition codec_ext8.c:46
uint32_t * table
Mapping table for 0x80-0xFF.
Definition codec_ext8.c:43
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition codec_ext8.c:52
parserutils_charset_codec base
Base class.
Definition codec_ext8.c:41
size_t write_len
Character length of write_buf.
Definition codec_ext8.c:55
size_t read_len
Character length of read_buf.
Definition codec_ext8.c:49
Core charset codec definition; implementations extend this.
Definition codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:29
struct parserutils_charset_codec::@271367034342366162232062053053007137175253257255 handler
Vtable for handler code.
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition codec_impl.h:32
Codec factory component definition.
Definition codec_impl.h:39
#define UNUSED(x)
Definition utils.h:25
#define SLEN(s)
Definition utils.h:21
#define N_ELEMENTS(s)
Definition utils.h:29