Welcome to mirror list, hosted at ThFree Co, Russian Federation.

cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'newlib/libc/iconv/ces/utf-16.c')
-rw-r--r--newlib/libc/iconv/ces/utf-16.c364
1 files changed, 266 insertions, 98 deletions
diff --git a/newlib/libc/iconv/ces/utf-16.c b/newlib/libc/iconv/ces/utf-16.c
index fae4ed5f0..4b2ff77a1 100644
--- a/newlib/libc/iconv/ces/utf-16.c
+++ b/newlib/libc/iconv/ces/utf-16.c
@@ -1,6 +1,6 @@
-/*-
- * Copyright (c) 1999,2000
- * Konstantin Chuguev. All rights reserved.
+/*
+ * Copyright (c) 2003-2004, Artem B. Bityuckiy
+ * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -22,117 +22,285 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * iconv (Charset Conversion Library) v2.0
*/
-#include "../lib/deps.h"
+#include "cesbi.h"
+
+#if defined (ICONV_TO_UCS_CES_UTF_16) \
+ || defined (ICONV_FROM_UCS_CES_UTF_16)
-#ifdef _ICONV_CONVERTER_UTF_16
+#include <_ansi.h>
+#include <reent.h>
+#include <sys/types.h>
+#include <stdlib.h>
#include <string.h>
+#include <wchar.h>
#include "../lib/local.h"
+#include "../lib/ucsconv.h"
+#include "../lib/endian.h"
+
+/*
+ * On input UTF-16 converter interpret BOM and uses Big Endian byte order if BOM
+ * is absent. UTF-16 converter outputs in System Endian and adds correspondent
+ * BOM as first code. UTF-16LE and UTF-16BE converters ignore BOM on input and
+ * don't output BOM.
+ */
+
+#define UTF16_UNDEFINED 0x00
+#define UTF16_BIG_ENDIAN 0x01
+#define UTF16_LITTLE_ENDIAN 0x02
+#define UTF16_SYSTEM_ENDIAN 0x04
+#define UTF16_BOM_WRITTEN 0x08
+
+#define UTF16_BOM 0xFEFF
+
+#define UTF_16 "utf_16"
+#define UTF_16BE "utf_16be"
+#define UTF_16LE "utf_16le"
+
+static size_t
+_DEFUN(utf_16_close, (rptr, data),
+ struct _reent *rptr _AND
+ _VOID_PTR data)
+{
+ _free_r(rptr, data);
+ return 0;
+}
-static ssize_t
-_DEFUN(convert_from_ucs, (ces, in, outbuf, outbytesleft),
- struct iconv_ces *ces _AND
- ucs_t in _AND
- unsigned char **outbuf _AND
- size_t *outbytesleft)
+#if defined (ICONV_FROM_UCS_CES_UTF_16)
+static _VOID_PTR
+_DEFUN(utf_16_init_from_ucs, (rptr, encoding),
+ struct _reent *rptr _AND
+ _CONST char *encoding)
{
- unsigned char *cp;
- int *state;
- int bytes;
-
- if (in == UCS_CHAR_NONE)
- return 1; /* No state reinitialization for table charsets */
- if (in > 0x10FFFF)
- return -1;
- bytes = *(state = (int *)(ces->data)) ? 2 : 4;
- if (in > 0xFFFF)
- bytes += 2;
- if (*outbytesleft < bytes)
- return 0; /* No space in the output buffer */
- cp = *outbuf;
- if (*state == 0) {
- *cp++ = 0xFE;
- *cp++ = 0xFF;
- *state = 1;
+ int *data;
+
+ if ((data = (int *)_malloc_r (rptr, sizeof (int))) == NULL)
+ return (_VOID_PTR)NULL;
+
+ if (strcmp (encoding, UTF_16LE) == 0)
+ *data = UTF16_LITTLE_ENDIAN;
+ else if (strcmp (encoding, UTF_16BE) == 0)
+ *data = UTF16_BIG_ENDIAN;
+ else
+ *data = UTF16_SYSTEM_ENDIAN;
+
+ return (_VOID_PTR)data;
+}
+
+static size_t
+_DEFUN(utf_16_convert_from_ucs, (data, in, outbuf, outbytesleft),
+ _VOID_PTR data _AND
+ register ucs4_t in _AND
+ unsigned char **outbuf _AND
+ size_t *outbytesleft)
+{
+ register ucs2_t *cp;
+ register size_t bytes;
+ register int *state;
+
+ if (in > 0x0010FFFF || (in >= 0x0000D800 && in <= 0x0000DFFF)
+ || in == 0x0000FFFF || in == 0x0000FFFE)
+ return (size_t)ICONV_CES_INVALID_CHARACTER;
+
+ state = (int *)data;
+ bytes = (*state == UTF16_SYSTEM_ENDIAN) ? sizeof (ucs2_t) * 2
+ : sizeof (ucs2_t);
+
+ if (in > 0x0000FFFF)
+ bytes += sizeof (ucs2_t);
+
+ if (*outbytesleft < bytes)
+ return (size_t)ICONV_CES_NOSPACE;
+
+ cp = (ucs2_t *)*outbuf;
+
+ if (*state == UTF16_SYSTEM_ENDIAN)
+ {
+ *cp++ = UTF16_BOM;
+ *state |= UTF16_BOM_WRITTEN;
}
- if (in > 0xFFFF) {
- *cp++ = ((in -= 0x10000) >> 18) | 0xD8;
- *cp++ = (in >> 10) & 0xFF;
- *cp++ = ((in >> 8) & 3) | 0xDC;
- } else
- *cp++ = (in >> 8) & 0xFF;
- *cp++ = in & 0xFF;
- (*outbuf) += bytes;
- *outbytesleft -= bytes;
- return 1;
+
+ if (in < 0x00010000)
+ {
+ switch (*state)
+ {
+ case UTF16_LITTLE_ENDIAN:
+ *cp = ICONV_HTOLES ((ucs2_t)in);
+ break;
+ case UTF16_BIG_ENDIAN:
+ *cp = ICONV_HTOBES ((ucs2_t)in);
+ break;
+ case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
+ *cp = (ucs2_t)in;
+ break;
+ }
+ }
+ else
+ {
+ ucs2_t w1, w2;
+
+ /* Process surrogate pair */
+ in -= 0x00010000;
+ w1 = ((ucs2_t)((in >> 10)) & 0x03FF) | 0xD800;
+ w2 = (ucs2_t)(in & 0x000003FF) | 0xDC00;
+
+ switch (*state)
+ {
+ case UTF16_LITTLE_ENDIAN:
+ *cp++ = ICONV_HTOLES (w1);
+ *cp = ICONV_HTOLES (w2);
+ break;
+ case UTF16_BIG_ENDIAN:
+ *cp++ = ICONV_HTOBES (w1);
+ *cp = ICONV_HTOBES (w2);
+ break;
+ case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
+ *cp++ = w1;
+ *cp = w2;
+ break;
+ }
+ }
+
+ *outbuf += bytes;
+ *outbytesleft -= bytes;
+
+ return bytes;
}
+#endif /* ICONV_FROM_UCS_CES_UTF_16 */
-static __inline ucs_t
-_DEFUN(msb, (buf), _CONST unsigned char *buf)
+#if defined (ICONV_TO_UCS_CES_UTF_16)
+static _VOID_PTR
+_DEFUN(utf_16_init_to_ucs, (rptr, encoding),
+ struct _reent *rptr _AND
+ _CONST char *encoding)
{
- return (buf[0] << 8) | buf[1];
+ int *data;
+
+ if ((data = (int *)_malloc_r (rptr, sizeof (int))) == NULL)
+ return (_VOID_PTR)NULL;
+
+ if (strcmp (encoding, UTF_16BE) == 0)
+ *data = UTF16_BIG_ENDIAN;
+ else if (strcmp (encoding, UTF_16LE) == 0)
+ *data = UTF16_LITTLE_ENDIAN;
+ else
+ *data = UTF16_UNDEFINED;
+
+ return (_VOID_PTR)data;
}
-static ucs_t
-_DEFUN(convert_to_ucs, (ces, inbuf, inbytesleft),
- struct iconv_ces *ces _AND
- _CONST unsigned char **inbuf _AND
- size_t *inbytesleft)
+static ucs4_t
+_DEFUN(utf_16_convert_to_ucs, (data, inbuf, inbytesleft),
+ _VOID_PTR data _AND
+ _CONST unsigned char **inbuf _AND
+ size_t *inbytesleft)
{
- ucs_t res, res2;
- int *state, mark;
-
- if (*inbytesleft < 2)
- return UCS_CHAR_NONE; /* Not enough bytes in the input buffer */
- state = (int *)(ces->data);
- res = msb(*inbuf);
- switch (res) {
- case UCS_CHAR_ZERO_WIDTH_NBSP:
- if (*state == 0)
- *state = 1;
- mark = 1;
- break;
- case UCS_CHAR_INVALID:
- if (*state == 0)
- *state = 2;
- mark = 1;
- break;
- default:
- if (*state == 0)
- *state = 1;
- mark = 0;
+ register ucs2_t w1;
+ register ucs2_t w2;
+ register ucs2_t *cp;
+ int *state;
+ ucs4_t res;
+ int bytes = sizeof (ucs2_t);
+
+ if (*inbytesleft < bytes)
+ return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
+
+ state = (int *)data;
+ cp = ((ucs2_t *)*inbuf);
+
+ if (*state == UTF16_UNDEFINED)
+ {
+ if (*cp == ICONV_HTOLES(UTF16_BOM))
+ *state = UTF16_LITTLE_ENDIAN;
+ else
+ *state = UTF16_BIG_ENDIAN;
+
+ if ( *cp == ICONV_HTOBES (UTF16_BOM)
+ || *cp == ICONV_HTOLES (UTF16_BOM))
+ {
+ if (*inbytesleft < (bytes += sizeof (ucs2_t)))
+ return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
+ cp += 1;
+ }
+ }
+
+ if (*state == UTF16_LITTLE_ENDIAN)
+ w1 = ICONV_LETOHS (*cp);
+ else
+ w1 = ICONV_BETOHS (*cp);
+
+ if (w1 < 0xD800 || w1 > 0xDFFF)
+ {
+ if (w1 == 0xFFFF || w1 == 0xFFFE)
+ return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
+ res = (ucs4_t)w1;
}
- if (mark) {
- if (*inbytesleft < 4)
- return UCS_CHAR_NONE; /* Not enough bytes in the input buffer */
- *inbytesleft -= 2;
- res = msb((*inbuf) += 2);
+ else
+ {
+ /* Process surrogate pair */
+ if (*inbytesleft < (bytes += 2))
+ return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
+
+ if (w1 > 0xDBFF)
+ /* Broken surrogate character */
+ return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
+
+ cp += 1;
+
+ if (*state == UTF16_LITTLE_ENDIAN)
+ w2 = ICONV_LETOHS (*cp);
+ else
+ w2 = ICONV_BETOHS (*cp);
+
+ if (w2 < 0xDC00 || w2 > 0xDFFF)
+ /* Broken surrogate character */
+ return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
+
+ res = (ucs4_t)(w2 & 0x03FF) | ((ucs4_t)(w1 & 0x03FF) << 10);
+ res += 0x00010000;
}
- if (*state == 2) { /* LSB order */
- res = (*(*inbuf) ++);
- res |= (*(*inbuf) ++) << 8;
- } else
- *inbuf += 2;
- *inbytesleft -= 2;
- if ((res & 0xFC00) != 0xD800) /* Non-surrogate character */
- return res;
- if (*inbytesleft < 2)
- return UCS_CHAR_NONE; /* Not enough bytes in the input buffer */
- if (*state == 2) {
- res2 = (*inbuf)[0];
- res2 |= (*inbuf)[1] << 8;
- } else
- res2 = msb(*inbuf);
- if ((res2 & 0xFC00) != 0xDC00) /* Broken surrogate pair */
- return -1;
- (*inbuf) += 2;
- (*inbytesleft) -= 2;
- return (((res & 0x3FF) << 10) | (res2 & 0x3FF)) + 0x10000;
+
+ *inbuf += bytes;
+ *inbytesleft -= bytes;
+
+ return res;
}
+#endif /* ICONV_TO_UCS_CES_UTF_16 */
-ICONV_CES_STATEFUL_MODULE_DECL(utf_16);
+static int
+_DEFUN(utf_16_get_mb_cur_max, (data),
+ _VOID_PTR data)
+{
+ return 6;
+}
+
+#if defined (ICONV_TO_UCS_CES_UTF_16)
+_CONST iconv_to_ucs_ces_handlers_t
+_iconv_to_ucs_ces_handlers_utf_16 =
+{
+ utf_16_init_to_ucs,
+ utf_16_close,
+ utf_16_get_mb_cur_max,
+ NULL,
+ NULL,
+ NULL,
+ utf_16_convert_to_ucs
+};
+#endif
+
+#if defined (ICONV_FROM_UCS_CES_UTF_16)
+_CONST iconv_from_ucs_ces_handlers_t
+_iconv_from_ucs_ces_handlers_utf_16 =
+{
+ utf_16_init_from_ucs,
+ utf_16_close,
+ utf_16_get_mb_cur_max,
+ NULL,
+ NULL,
+ NULL,
+ utf_16_convert_from_ucs
+};
+#endif
-#endif /* #ifdef _ICONV_CONVERTER_UTF_16 */
+#endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */