diff options
Diffstat (limited to 'newlib/libc/iconv/ces/utf-16.c')
-rw-r--r-- | newlib/libc/iconv/ces/utf-16.c | 364 |
1 files changed, 266 insertions, 98 deletions
diff --git a/newlib/libc/iconv/ces/utf-16.c b/newlib/libc/iconv/ces/utf-16.c index fae4ed5f0..4b2ff77a1 100644 --- a/newlib/libc/iconv/ces/utf-16.c +++ b/newlib/libc/iconv/ces/utf-16.c @@ -1,6 +1,6 @@ -/*- - * Copyright (c) 1999,2000 - * Konstantin Chuguev. All rights reserved. +/* + * Copyright (c) 2003-2004, Artem B. Bityuckiy + * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -22,117 +22,285 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * iconv (Charset Conversion Library) v2.0 */ -#include "../lib/deps.h" +#include "cesbi.h" + +#if defined (ICONV_TO_UCS_CES_UTF_16) \ + || defined (ICONV_FROM_UCS_CES_UTF_16) -#ifdef _ICONV_CONVERTER_UTF_16 +#include <_ansi.h> +#include <reent.h> +#include <sys/types.h> +#include <stdlib.h> #include <string.h> +#include <wchar.h> #include "../lib/local.h" +#include "../lib/ucsconv.h" +#include "../lib/endian.h" + +/* + * On input UTF-16 converter interpret BOM and uses Big Endian byte order if BOM + * is absent. UTF-16 converter outputs in System Endian and adds correspondent + * BOM as first code. UTF-16LE and UTF-16BE converters ignore BOM on input and + * don't output BOM. + */ + +#define UTF16_UNDEFINED 0x00 +#define UTF16_BIG_ENDIAN 0x01 +#define UTF16_LITTLE_ENDIAN 0x02 +#define UTF16_SYSTEM_ENDIAN 0x04 +#define UTF16_BOM_WRITTEN 0x08 + +#define UTF16_BOM 0xFEFF + +#define UTF_16 "utf_16" +#define UTF_16BE "utf_16be" +#define UTF_16LE "utf_16le" + +static size_t +_DEFUN(utf_16_close, (rptr, data), + struct _reent *rptr _AND + _VOID_PTR data) +{ + _free_r(rptr, data); + return 0; +} -static ssize_t -_DEFUN(convert_from_ucs, (ces, in, outbuf, outbytesleft), - struct iconv_ces *ces _AND - ucs_t in _AND - unsigned char **outbuf _AND - size_t *outbytesleft) +#if defined (ICONV_FROM_UCS_CES_UTF_16) +static _VOID_PTR +_DEFUN(utf_16_init_from_ucs, (rptr, encoding), + struct _reent *rptr _AND + _CONST char *encoding) { - unsigned char *cp; - int *state; - int bytes; - - if (in == UCS_CHAR_NONE) - return 1; /* No state reinitialization for table charsets */ - if (in > 0x10FFFF) - return -1; - bytes = *(state = (int *)(ces->data)) ? 2 : 4; - if (in > 0xFFFF) - bytes += 2; - if (*outbytesleft < bytes) - return 0; /* No space in the output buffer */ - cp = *outbuf; - if (*state == 0) { - *cp++ = 0xFE; - *cp++ = 0xFF; - *state = 1; + int *data; + + if ((data = (int *)_malloc_r (rptr, sizeof (int))) == NULL) + return (_VOID_PTR)NULL; + + if (strcmp (encoding, UTF_16LE) == 0) + *data = UTF16_LITTLE_ENDIAN; + else if (strcmp (encoding, UTF_16BE) == 0) + *data = UTF16_BIG_ENDIAN; + else + *data = UTF16_SYSTEM_ENDIAN; + + return (_VOID_PTR)data; +} + +static size_t +_DEFUN(utf_16_convert_from_ucs, (data, in, outbuf, outbytesleft), + _VOID_PTR data _AND + register ucs4_t in _AND + unsigned char **outbuf _AND + size_t *outbytesleft) +{ + register ucs2_t *cp; + register size_t bytes; + register int *state; + + if (in > 0x0010FFFF || (in >= 0x0000D800 && in <= 0x0000DFFF) + || in == 0x0000FFFF || in == 0x0000FFFE) + return (size_t)ICONV_CES_INVALID_CHARACTER; + + state = (int *)data; + bytes = (*state == UTF16_SYSTEM_ENDIAN) ? sizeof (ucs2_t) * 2 + : sizeof (ucs2_t); + + if (in > 0x0000FFFF) + bytes += sizeof (ucs2_t); + + if (*outbytesleft < bytes) + return (size_t)ICONV_CES_NOSPACE; + + cp = (ucs2_t *)*outbuf; + + if (*state == UTF16_SYSTEM_ENDIAN) + { + *cp++ = UTF16_BOM; + *state |= UTF16_BOM_WRITTEN; } - if (in > 0xFFFF) { - *cp++ = ((in -= 0x10000) >> 18) | 0xD8; - *cp++ = (in >> 10) & 0xFF; - *cp++ = ((in >> 8) & 3) | 0xDC; - } else - *cp++ = (in >> 8) & 0xFF; - *cp++ = in & 0xFF; - (*outbuf) += bytes; - *outbytesleft -= bytes; - return 1; + + if (in < 0x00010000) + { + switch (*state) + { + case UTF16_LITTLE_ENDIAN: + *cp = ICONV_HTOLES ((ucs2_t)in); + break; + case UTF16_BIG_ENDIAN: + *cp = ICONV_HTOBES ((ucs2_t)in); + break; + case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN): + *cp = (ucs2_t)in; + break; + } + } + else + { + ucs2_t w1, w2; + + /* Process surrogate pair */ + in -= 0x00010000; + w1 = ((ucs2_t)((in >> 10)) & 0x03FF) | 0xD800; + w2 = (ucs2_t)(in & 0x000003FF) | 0xDC00; + + switch (*state) + { + case UTF16_LITTLE_ENDIAN: + *cp++ = ICONV_HTOLES (w1); + *cp = ICONV_HTOLES (w2); + break; + case UTF16_BIG_ENDIAN: + *cp++ = ICONV_HTOBES (w1); + *cp = ICONV_HTOBES (w2); + break; + case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN): + *cp++ = w1; + *cp = w2; + break; + } + } + + *outbuf += bytes; + *outbytesleft -= bytes; + + return bytes; } +#endif /* ICONV_FROM_UCS_CES_UTF_16 */ -static __inline ucs_t -_DEFUN(msb, (buf), _CONST unsigned char *buf) +#if defined (ICONV_TO_UCS_CES_UTF_16) +static _VOID_PTR +_DEFUN(utf_16_init_to_ucs, (rptr, encoding), + struct _reent *rptr _AND + _CONST char *encoding) { - return (buf[0] << 8) | buf[1]; + int *data; + + if ((data = (int *)_malloc_r (rptr, sizeof (int))) == NULL) + return (_VOID_PTR)NULL; + + if (strcmp (encoding, UTF_16BE) == 0) + *data = UTF16_BIG_ENDIAN; + else if (strcmp (encoding, UTF_16LE) == 0) + *data = UTF16_LITTLE_ENDIAN; + else + *data = UTF16_UNDEFINED; + + return (_VOID_PTR)data; } -static ucs_t -_DEFUN(convert_to_ucs, (ces, inbuf, inbytesleft), - struct iconv_ces *ces _AND - _CONST unsigned char **inbuf _AND - size_t *inbytesleft) +static ucs4_t +_DEFUN(utf_16_convert_to_ucs, (data, inbuf, inbytesleft), + _VOID_PTR data _AND + _CONST unsigned char **inbuf _AND + size_t *inbytesleft) { - ucs_t res, res2; - int *state, mark; - - if (*inbytesleft < 2) - return UCS_CHAR_NONE; /* Not enough bytes in the input buffer */ - state = (int *)(ces->data); - res = msb(*inbuf); - switch (res) { - case UCS_CHAR_ZERO_WIDTH_NBSP: - if (*state == 0) - *state = 1; - mark = 1; - break; - case UCS_CHAR_INVALID: - if (*state == 0) - *state = 2; - mark = 1; - break; - default: - if (*state == 0) - *state = 1; - mark = 0; + register ucs2_t w1; + register ucs2_t w2; + register ucs2_t *cp; + int *state; + ucs4_t res; + int bytes = sizeof (ucs2_t); + + if (*inbytesleft < bytes) + return (ucs4_t)ICONV_CES_BAD_SEQUENCE; + + state = (int *)data; + cp = ((ucs2_t *)*inbuf); + + if (*state == UTF16_UNDEFINED) + { + if (*cp == ICONV_HTOLES(UTF16_BOM)) + *state = UTF16_LITTLE_ENDIAN; + else + *state = UTF16_BIG_ENDIAN; + + if ( *cp == ICONV_HTOBES (UTF16_BOM) + || *cp == ICONV_HTOLES (UTF16_BOM)) + { + if (*inbytesleft < (bytes += sizeof (ucs2_t))) + return (ucs4_t)ICONV_CES_BAD_SEQUENCE; + cp += 1; + } + } + + if (*state == UTF16_LITTLE_ENDIAN) + w1 = ICONV_LETOHS (*cp); + else + w1 = ICONV_BETOHS (*cp); + + if (w1 < 0xD800 || w1 > 0xDFFF) + { + if (w1 == 0xFFFF || w1 == 0xFFFE) + return (ucs4_t)ICONV_CES_INVALID_CHARACTER; + res = (ucs4_t)w1; } - if (mark) { - if (*inbytesleft < 4) - return UCS_CHAR_NONE; /* Not enough bytes in the input buffer */ - *inbytesleft -= 2; - res = msb((*inbuf) += 2); + else + { + /* Process surrogate pair */ + if (*inbytesleft < (bytes += 2)) + return (ucs4_t)ICONV_CES_BAD_SEQUENCE; + + if (w1 > 0xDBFF) + /* Broken surrogate character */ + return (ucs4_t)ICONV_CES_INVALID_CHARACTER; + + cp += 1; + + if (*state == UTF16_LITTLE_ENDIAN) + w2 = ICONV_LETOHS (*cp); + else + w2 = ICONV_BETOHS (*cp); + + if (w2 < 0xDC00 || w2 > 0xDFFF) + /* Broken surrogate character */ + return (ucs4_t)ICONV_CES_INVALID_CHARACTER; + + res = (ucs4_t)(w2 & 0x03FF) | ((ucs4_t)(w1 & 0x03FF) << 10); + res += 0x00010000; } - if (*state == 2) { /* LSB order */ - res = (*(*inbuf) ++); - res |= (*(*inbuf) ++) << 8; - } else - *inbuf += 2; - *inbytesleft -= 2; - if ((res & 0xFC00) != 0xD800) /* Non-surrogate character */ - return res; - if (*inbytesleft < 2) - return UCS_CHAR_NONE; /* Not enough bytes in the input buffer */ - if (*state == 2) { - res2 = (*inbuf)[0]; - res2 |= (*inbuf)[1] << 8; - } else - res2 = msb(*inbuf); - if ((res2 & 0xFC00) != 0xDC00) /* Broken surrogate pair */ - return -1; - (*inbuf) += 2; - (*inbytesleft) -= 2; - return (((res & 0x3FF) << 10) | (res2 & 0x3FF)) + 0x10000; + + *inbuf += bytes; + *inbytesleft -= bytes; + + return res; } +#endif /* ICONV_TO_UCS_CES_UTF_16 */ -ICONV_CES_STATEFUL_MODULE_DECL(utf_16); +static int +_DEFUN(utf_16_get_mb_cur_max, (data), + _VOID_PTR data) +{ + return 6; +} + +#if defined (ICONV_TO_UCS_CES_UTF_16) +_CONST iconv_to_ucs_ces_handlers_t +_iconv_to_ucs_ces_handlers_utf_16 = +{ + utf_16_init_to_ucs, + utf_16_close, + utf_16_get_mb_cur_max, + NULL, + NULL, + NULL, + utf_16_convert_to_ucs +}; +#endif + +#if defined (ICONV_FROM_UCS_CES_UTF_16) +_CONST iconv_from_ucs_ces_handlers_t +_iconv_from_ucs_ces_handlers_utf_16 = +{ + utf_16_init_from_ucs, + utf_16_close, + utf_16_get_mb_cur_max, + NULL, + NULL, + NULL, + utf_16_convert_from_ucs +}; +#endif -#endif /* #ifdef _ICONV_CONVERTER_UTF_16 */ +#endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */ |