diff options
Diffstat (limited to 'newlib/libc/iconv/ces/utf-8.c')
-rw-r--r-- | newlib/libc/iconv/ces/utf-8.c | 396 |
1 files changed, 143 insertions, 253 deletions
diff --git a/newlib/libc/iconv/ces/utf-8.c b/newlib/libc/iconv/ces/utf-8.c index fda1e6e3b..157afc089 100644 --- a/newlib/libc/iconv/ces/utf-8.c +++ b/newlib/libc/iconv/ces/utf-8.c @@ -1,6 +1,6 @@ -/* - * Copyright (c) 2003-2004, Artem B. Bityuckiy - * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved. +/*- + * Copyright (c) 1999,2000 + * Konstantin Chuguev. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -22,276 +22,166 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * iconv (Charset Conversion Library) v2.0 */ -#include "cesbi.h" - -#if defined (ICONV_TO_UCS_CES_UTF_8) \ - || defined (ICONV_FROM_UCS_CES_UTF_8) +#include "../lib/deps.h" -#include <_ansi.h> -#include <reent.h> -#include <sys/types.h> +#ifdef _ICONV_CONVERTER_UTF_8 #include "../lib/local.h" -#include "../lib/ucsconv.h" -#define UTF8_MB_CUR_MAX 6 - -/* - * UTF-8 CES converter doesn't interpret BOM. Reject overlong sequences, - * U'FFFF, U'FFFE codes, UTF-16 surrogate codes and all codes > 0x7FFFFFFF. - */ +#define cont_byte(b) (((b) & 0x3F) | 0x80) -#if defined (ICONV_FROM_UCS_CES_UTF_8) -static size_t -_DEFUN(convert_from_ucs, (data, in, outbuf, outbytesleft), - _VOID_PTR data _AND - register ucs4_t in _AND - unsigned char **outbuf _AND - size_t *outbytesleft) +static ssize_t +_DEFUN(convert_from_ucs, (ces, in, outbuf, outbytesleft), + struct iconv_ces *ces _AND + ucs_t in _AND + unsigned char **outbuf _AND + size_t *outbytesleft) { - register unsigned char *cp; - register size_t bytes; - - if ((in >= 0x0000D800 && in <= 0x0000DFFF) - || in > 0x7FFFFFFF || in == 0x0000FFFF || in == 0x0000FFFE) - return (size_t)ICONV_CES_INVALID_CHARACTER; - - if (in < 0x80) - bytes = 1; - else if (in < 0x800) - bytes = 2; - else if (in < 0x10000) - bytes = 3; - else if (in < 0x200000) - bytes = 4; - else if (in < 0x4000000) - bytes = 5; - else - bytes = 6; - - if (*outbytesleft < bytes) - return (size_t)ICONV_CES_NOSPACE; - - cp = *outbuf; - - switch (bytes) - { - case 1: + unsigned char *cp; + int n; + if (in == UCS_CHAR_NONE) + return 1; /* No state reinitialization for table charsets */ + if (in < 0x80) { + n = 1; + } else if (in < 0x800) { + n = 2; + } else if (in < 0x10000) { + n = 3; + } else if (in < 0x200000) { + n = 4; + } else if (in < 0x4000000) { + n = 5; + } else if (in <= 0x7FFFFFFF) { + n = 6; + } else + return -1; + if (*outbytesleft < n) + return 0; + cp = *outbuf; + switch (n) { + case 1: *cp = (unsigned char)in; break; - - case 2: - *cp++ = (unsigned char)((in >> 6) | 0x000000C0); - *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080); + case 2: + *cp++ = (unsigned char)((in >> 6) | 0xC0); + *cp++ = (unsigned char)cont_byte(in); break; - - case 3: - *cp++ = (unsigned char)((in >> 12) | 0x000000E0); - *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080); + case 3: + *cp++ = (unsigned char)((in >> 12) | 0xE0); + *cp++ = (unsigned char)cont_byte(in >> 6); + *cp++ = (unsigned char)cont_byte(in); break; - - case 4: - *cp++ = (unsigned char)((in >> 18) | 0x000000F0); - *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080); + case 4: + *cp++ = (unsigned char)((in >> 18) | 0xF0); + *cp++ = (unsigned char)cont_byte(in >> 12); + *cp++ = (unsigned char)cont_byte(in >> 6); + *cp++ = (unsigned char)cont_byte(in); break; - - case 5: - *cp++ = (unsigned char)((in >> 24) | 0x000000F8); - *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080); + case 5: + *cp++ = (unsigned char)((in >> 24) | 0xF8); + *cp++ = (unsigned char)cont_byte(in >> 18); + *cp++ = (unsigned char)cont_byte(in >> 12); + *cp++ = (unsigned char)cont_byte(in >> 6); + *cp++ = (unsigned char)cont_byte(in); break; - - case 6: - *cp++ = (unsigned char)((in >> 30) | 0x000000FC); - *cp++ = (unsigned char)(((in >> 24) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080); - *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080); + case 6: + *cp++ = (unsigned char)((in >> 30) | 0xFC); + *cp++ = (unsigned char)cont_byte(in >> 24); + *cp++ = (unsigned char)cont_byte(in >> 18); + *cp++ = (unsigned char)cont_byte(in >> 12); + *cp++ = (unsigned char)cont_byte(in >> 6); + *cp++ = (unsigned char)cont_byte(in); break; } - - *outbytesleft -= bytes; - *outbuf += bytes; - - return bytes; + (*outbytesleft) -= n; + (*outbuf) += n; + return 1; } -#endif /* ICONV_FROM_UCS_CES_UTF_8 */ -#if defined (ICONV_TO_UCS_CES_UTF_8) -static ucs4_t -_DEFUN(convert_to_ucs, (data, inbuf, inbytesleft), - _VOID_PTR data _AND +static ucs_t +_DEFUN(convert_to_ucs, (ces, inbuf, inbytesleft), + struct iconv_ces *ces _AND _CONST unsigned char **inbuf _AND size_t *inbytesleft) { - register _CONST unsigned char *in = *inbuf; - register size_t bytes; - ucs4_t res; - - if (in[0] >= 0xC0) - { - if (in[0] < 0xE0) - { - if (*inbytesleft < (bytes = 2)) - return (ucs4_t)ICONV_CES_BAD_SEQUENCE; - - if ( ((in[0] & ~0x1F) == 0xC0) - && ((in[1] & 0xC0) == 0x80)) - res = ((ucs4_t)(in[0] & 0x1F) << 6) - | ((ucs4_t)(in[1] & 0x3F)); - else - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - - if (res < 0x00000080) /* Overlong sequence */ - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - } - - else if (in[0] < 0xF0) - { - if (*inbytesleft < (bytes = 3)) - return (ucs4_t)ICONV_CES_BAD_SEQUENCE; - - if ( ((in[0] & ~0x0F) == 0xE0) - && ((in[1] & 0xC0) == 0x80) - && ((in[2] & 0xC0) == 0x80)) - res = ((ucs4_t)(in[0] & 0x0F) << 12) - | ((ucs4_t)(in[1] & 0x3F) << 6) - | ((ucs4_t)(in[2] & 0x3F)); - else - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - - if (res < 0x00000800) /* Overlong sequence */ - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - } - - else if (in[0] < 0xF8) - { - if (*inbytesleft < (bytes = 4)) - return (ucs4_t)ICONV_CES_BAD_SEQUENCE; - - if ( ((in[0] & ~0x07) == 0xF0) - && ((in[1] & 0xC0) == 0x80) - && ((in[2] & 0xC0) == 0x80) - && ((in[3] & 0xC0) == 0x80)) - res = ((ucs4_t)(in[0] & 0x07) << 18) - | ((ucs4_t)(in[1] & 0x3F) << 12) - | ((ucs4_t)(in[2] & 0x3F) << 6) - | ((ucs4_t)(in[3] & 0x3F)); - else - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - - if (res < 0x00010000) /* Overlong sequence */ - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - } - - else if (in[0] < 0xFC) - { - if (*inbytesleft < (bytes = 5)) - return (ucs4_t)ICONV_CES_BAD_SEQUENCE; - - if ( ((in[0] & ~0x03) == 0xF8) - && ((in[1] & 0xC0) == 0x80) - && ((in[2] & 0xC0) == 0x80) - && ((in[3] & 0xC0) == 0x80) - && ((in[4] & 0xC0) == 0x80)) - res = ((ucs4_t)(in[0] & 0x03) << 24) - | ((ucs4_t)(in[1] & 0x3F) << 18) - | ((ucs4_t)(in[2] & 0x3F) << 12) - | ((ucs4_t)(in[3] & 0x3F) << 6) - | ((ucs4_t)(in[4] & 0x3F)); - else - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - - if (res < 0x00200000) /* Overlong sequence */ - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - } - - else if (in[0] <= 0xFD) - { - if (*inbytesleft < (bytes = 6)) - return (ucs4_t)ICONV_CES_BAD_SEQUENCE; - - if ( ((in[0] & ~0x01) == 0xFC) - && ((in[1] & 0xC0) == 0x80) - && ((in[2] & 0xC0) == 0x80) - && ((in[3] & 0xC0) == 0x80) - && ((in[4] & 0xC0) == 0x80) - && ((in[5] & 0xC0) == 0x80)) - res = ((ucs4_t)(in[0] & 0x1) << 30) - | ((ucs4_t)(in[1] & 0x3F) << 24) - | ((ucs4_t)(in[2] & 0x3F) << 18) - | ((ucs4_t)(in[3] & 0x3F) << 12) - | ((ucs4_t)(in[4] & 0x3F) << 6) - | ((ucs4_t)(in[5] & 0x3F)); - else - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - - if (res < 0x04000000) /* Overlong sequence */ - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - } - - else - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - } - else if (in[0] & 0x80) - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - else - { - res = (ucs4_t)in[0]; - bytes = 1; - } - - if ( (res >= 0x0000D800 && res <= 0x0000DFFF) - || res > 0x7FFFFFFF || res == 0x0000FFFF || res == 0x0000FFFE) - return (ucs4_t)ICONV_CES_INVALID_CHARACTER; - - *inbytesleft -= bytes; - *inbuf += bytes; - - return res; -} -#endif /* ICONV_TO_UCS_CES_UTF_8 */ - -static int -_DEFUN(get_mb_cur_max, (data), - _VOID_PTR data) -{ - return UTF8_MB_CUR_MAX; + _CONST unsigned char *in = *inbuf; + unsigned char byte = *in++; + ucs_t res = byte; + + if (byte >= 0xC0) { + if (byte < 0xE0) { + if (*inbytesleft < 2) + return UCS_CHAR_NONE; + if (((byte & ~0x1F) == 0xC0) + && ((in[0] & 0xC0) == 0x80)) { + res = ((byte & 0x1F) << 6) | (*in++ & 0x3F); + } else + res = UCS_CHAR_INVALID; + } else if (byte < 0xF0) { + if (*inbytesleft < 3) + return UCS_CHAR_NONE; + if (((byte & ~0x0F) == 0xE0) + && ((in[0] & 0xC0) == 0x80) + && ((in[1] & 0xC0) == 0x80)) { + res = ((byte & 0x0F) << 12) | ((in[0] & 0x3F) << 6) + | (in[1] & 0x3F); + in += 2; + } else + res = UCS_CHAR_INVALID; + } else if (byte < 0xF8) { + if (*inbytesleft < 4) + return UCS_CHAR_NONE; + if (((byte & ~0x7) == 0xF0) + && ((in[0] & 0xC0) == 0x80) + && ((in[1] & 0xC0) == 0x80) + && ((in[2] & 0xC0) == 0x80)) { + res = ((byte & 0x7) << 18) | ((in[0] & 0x3F) << 12) + | ((in[1] & 0x3F) << 6) | (in[2] & 0x3F); + in += 3; + } else + res = UCS_CHAR_INVALID; + } else if (byte < 0xFC) { + if (*inbytesleft < 5) + return UCS_CHAR_NONE; + if (((byte & ~0x3) == 0xF8) + && ((in[0] & 0xC0) == 0x80) + && ((in[1] & 0xC0) == 0x80) + && ((in[2] & 0xC0) == 0x80) + && ((in[3] & 0xC0) == 0x80)) { + res = ((byte & 0x3) << 24) | ((in[0] & 0x3F) << 18) + | ((in[1] & 0x3F) << 12) | ((in[2] & 0x3F) << 8) + | (in[3] & 0x3F); + in += 4; + } else + res = UCS_CHAR_INVALID; + } else if (byte <= 0xFD) { + if (*inbytesleft < 6) + return UCS_CHAR_NONE; + if (((byte & ~0x1) == 0xFC) + && ((in[0] & 0xC0) == 0x80) + && ((in[1] & 0xC0) == 0x80) + && ((in[2] & 0xC0) == 0x80) + && ((in[3] & 0xC0) == 0x80) + && ((in[4] & 0xC0) == 0x80)) { + res = ((byte & 0x1) << 30) | ((in[0] & 0x3F) << 24) + | ((in[1] & 0x3F) << 18) | ((in[2] & 0x3F) << 12) + | ((in[3] & 0x3F) << 8) | (in[4] & 0x3F); + in += 5; + } else + res = UCS_CHAR_INVALID; + } else + res = UCS_CHAR_INVALID; + } else if (byte & 0x80) + res = UCS_CHAR_INVALID; + + (*inbytesleft) -= (in - *inbuf); + *inbuf = in; + return res; } -#if defined (ICONV_TO_UCS_CES_UTF_8) -_CONST iconv_to_ucs_ces_handlers_t -_iconv_to_ucs_ces_handlers_utf_8 = -{ - NULL, - NULL, - get_mb_cur_max, - NULL, - NULL, - NULL, - convert_to_ucs -}; -#endif - -#if defined (ICONV_FROM_UCS_CES_UTF_8) -_CONST iconv_from_ucs_ces_handlers_t -_iconv_from_ucs_ces_handlers_utf_8 = -{ - NULL, - NULL, - get_mb_cur_max, - NULL, - NULL, - NULL, - convert_from_ucs -}; -#endif +ICONV_CES_STATELESS_MODULE_DECL(utf_8); -#endif /* ICONV_TO_UCS_CES_UTF_8 || ICONV_FROM_UCS_CES_UTF_8 */ +#endif /* #ifdef _ICONV_CONVERTER_UTF_8 */ |