diff options
Diffstat (limited to 'intern/utfconv/utfconv.c')
-rw-r--r-- | intern/utfconv/utfconv.c | 443 |
1 files changed, 245 insertions, 198 deletions
diff --git a/intern/utfconv/utfconv.c b/intern/utfconv/utfconv.c index 767337b84b3..1dd868b8170 100644 --- a/intern/utfconv/utfconv.c +++ b/intern/utfconv/utfconv.c @@ -22,235 +22,282 @@ size_t count_utf_8_from_16(const wchar_t *string16) { - int i; - size_t count = 0; - wchar_t u = 0; - if (!string16) { - return 0; - } - - for (i = 0; (u = string16[i]); i++) { - if (u < 0x0080) { - count += 1; - } - else { - if (u < 0x0800) { - count += 2; - } - else { - if (u < 0xD800) { - count += 3; - } - else { - if (u < 0xDC00) { - i++; - if ((u = string16[i]) == 0) { - break; - } - if (u >= 0xDC00 && u < 0xE000) { - count += 4; - } - } - else { - if (u < 0xE000) { - /*illigal*/; - } - else { - count += 3; - } - } - } - } - } - } - - return ++count; + int i; + size_t count = 0; + wchar_t u = 0; + if (!string16) { + return 0; + } + + for (i = 0; (u = string16[i]); i++) { + if (u < 0x0080) { + count += 1; + } + else { + if (u < 0x0800) { + count += 2; + } + else { + if (u < 0xD800) { + count += 3; + } + else { + if (u < 0xDC00) { + i++; + if ((u = string16[i]) == 0) { + break; + } + if (u >= 0xDC00 && u < 0xE000) { + count += 4; + } + } + else { + if (u < 0xE000) { + /*illigal*/; + } + else { + count += 3; + } + } + } + } + } + } + + return ++count; } - size_t count_utf_16_from_8(const char *string8) { - size_t count = 0; - char u; - char type = 0; - unsigned int u32 = 0; - - if (!string8) return 0; - - for (; (u = *string8); string8++) { - if (type == 0) { - if ((u & 0x01 << 7) == 0) { count++; u32 = 0; continue; } //1 utf-8 char - if ((u & 0x07 << 5) == 0xC0) { type = 1; u32 = u & 0x1F; continue; } //2 utf-8 char - if ((u & 0x0F << 4) == 0xE0) { type = 2; u32 = u & 0x0F; continue; } //3 utf-8 char - if ((u & 0x1F << 3) == 0xF0) { type = 3; u32 = u & 0x07; continue; } //4 utf-8 char - continue; - } - else { - if ((u & 0xC0) == 0x80) { - u32 = (u32 << 6) | (u & 0x3F); - type--; - } - else { - u32 = 0; - type = 0; - } - } - - if (type == 0) { - if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) count++; - else if (0x10000 <= u32 && u32 < 0x110000) count += 2; - u32 = 0; - } - } - - return ++count; + size_t count = 0; + char u; + char type = 0; + unsigned int u32 = 0; + + if (!string8) + return 0; + + for (; (u = *string8); string8++) { + if (type == 0) { + if ((u & 0x01 << 7) == 0) { + count++; + u32 = 0; + continue; + } //1 utf-8 char + if ((u & 0x07 << 5) == 0xC0) { + type = 1; + u32 = u & 0x1F; + continue; + } //2 utf-8 char + if ((u & 0x0F << 4) == 0xE0) { + type = 2; + u32 = u & 0x0F; + continue; + } //3 utf-8 char + if ((u & 0x1F << 3) == 0xF0) { + type = 3; + u32 = u & 0x07; + continue; + } //4 utf-8 char + continue; + } + else { + if ((u & 0xC0) == 0x80) { + u32 = (u32 << 6) | (u & 0x3F); + type--; + } + else { + u32 = 0; + type = 0; + } + } + + if (type == 0) { + if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) + count++; + else if (0x10000 <= u32 && u32 < 0x110000) + count += 2; + u32 = 0; + } + } + + return ++count; } - int conv_utf_16_to_8(const wchar_t *in16, char *out8, size_t size8) { - char *out8end = out8 + size8; - wchar_t u = 0; - int err = 0; - if (!size8 || !in16 || !out8) return UTF_ERROR_NULL_IN; - out8end--; - - for (; out8 < out8end && (u = *in16); in16++, out8++) { - if (u < 0x0080) { - *out8 = u; - } - else if (u < 0x0800) { - if (out8 + 1 >= out8end) break; - *out8++ = (0x3 << 6) | (0x1F & (u >> 6)); - *out8 = (0x1 << 7) | (0x3F & (u)); - } - else if (u < 0xD800 || u >= 0xE000) { - if (out8 + 2 >= out8end) break; - *out8++ = (0x7 << 5) | (0xF & (u >> 12)); - *out8++ = (0x1 << 7) | (0x3F & (u >> 6)); - *out8 = (0x1 << 7) | (0x3F & (u)); - } - else if (u < 0xDC00) { - wchar_t u2 = *++in16; - - if (!u2) break; - if (u2 >= 0xDC00 && u2 < 0xE000) { - if (out8 + 3 >= out8end) break; else { - unsigned int uc = 0x10000 + (u2 - 0xDC00) + ((u - 0xD800) << 10); - - *out8++ = (0xF << 4) | (0x7 & (uc >> 18)); - *out8++ = (0x1 << 7) | (0x3F & (uc >> 12)); - *out8++ = (0x1 << 7) | (0x3F & (uc >> 6)); - *out8 = (0x1 << 7) | (0x3F & (uc)); - } - } - else { - out8--; err |= UTF_ERROR_ILLCHAR; - } - } - else if (u < 0xE000) { - out8--; err |= UTF_ERROR_ILLCHAR; - } - } - - *out8 = *out8end = 0; - - if (*in16) err |= UTF_ERROR_SMALL; - - return err; + char *out8end = out8 + size8; + wchar_t u = 0; + int err = 0; + if (!size8 || !in16 || !out8) + return UTF_ERROR_NULL_IN; + out8end--; + + for (; out8 < out8end && (u = *in16); in16++, out8++) { + if (u < 0x0080) { + *out8 = u; + } + else if (u < 0x0800) { + if (out8 + 1 >= out8end) + break; + *out8++ = (0x3 << 6) | (0x1F & (u >> 6)); + *out8 = (0x1 << 7) | (0x3F & (u)); + } + else if (u < 0xD800 || u >= 0xE000) { + if (out8 + 2 >= out8end) + break; + *out8++ = (0x7 << 5) | (0xF & (u >> 12)); + *out8++ = (0x1 << 7) | (0x3F & (u >> 6)); + *out8 = (0x1 << 7) | (0x3F & (u)); + } + else if (u < 0xDC00) { + wchar_t u2 = *++in16; + + if (!u2) + break; + if (u2 >= 0xDC00 && u2 < 0xE000) { + if (out8 + 3 >= out8end) + break; + else { + unsigned int uc = 0x10000 + (u2 - 0xDC00) + ((u - 0xD800) << 10); + + *out8++ = (0xF << 4) | (0x7 & (uc >> 18)); + *out8++ = (0x1 << 7) | (0x3F & (uc >> 12)); + *out8++ = (0x1 << 7) | (0x3F & (uc >> 6)); + *out8 = (0x1 << 7) | (0x3F & (uc)); + } + } + else { + out8--; + err |= UTF_ERROR_ILLCHAR; + } + } + else if (u < 0xE000) { + out8--; + err |= UTF_ERROR_ILLCHAR; + } + } + + *out8 = *out8end = 0; + + if (*in16) + err |= UTF_ERROR_SMALL; + + return err; } - int conv_utf_8_to_16(const char *in8, wchar_t *out16, size_t size16) { - char u; - char type = 0; - unsigned int u32 = 0; - wchar_t *out16end = out16 + size16; - int err = 0; - if (!size16 || !in8 || !out16) return UTF_ERROR_NULL_IN; - out16end--; - - for (; out16 < out16end && (u = *in8); in8++) { - if (type == 0) { - if ((u & 0x01 << 7) == 0) { *out16 = u; out16++; u32 = 0; continue; } //1 utf-8 char - if ((u & 0x07 << 5) == 0xC0) { type = 1; u32 = u & 0x1F; continue; } //2 utf-8 char - if ((u & 0x0F << 4) == 0xE0) { type = 2; u32 = u & 0x0F; continue; } //3 utf-8 char - if ((u & 0x1F << 3) == 0xF0) { type = 3; u32 = u & 0x07; continue; } //4 utf-8 char - err |= UTF_ERROR_ILLCHAR; - continue; - } - else { - if ((u & 0xC0) == 0x80) { - u32 = (u32 << 6) | (u & 0x3F); - type--; - } - else { - u32 = 0; type = 0; err |= UTF_ERROR_ILLSEQ; - } - } - if (type == 0) { - if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) { - *out16 = u32; - out16++; - } - else if (0x10000 <= u32 && u32 < 0x110000) { - if (out16 + 1 >= out16end) break; - u32 -= 0x10000; - *out16 = 0xD800 + (u32 >> 10); - out16++; - *out16 = 0xDC00 + (u32 & 0x3FF); - out16++; - } - u32 = 0; - } - - } - - *out16 = *out16end = 0; - - if (*in8) err |= UTF_ERROR_SMALL; - - return err; + char u; + char type = 0; + unsigned int u32 = 0; + wchar_t *out16end = out16 + size16; + int err = 0; + if (!size16 || !in8 || !out16) + return UTF_ERROR_NULL_IN; + out16end--; + + for (; out16 < out16end && (u = *in8); in8++) { + if (type == 0) { + if ((u & 0x01 << 7) == 0) { + *out16 = u; + out16++; + u32 = 0; + continue; + } //1 utf-8 char + if ((u & 0x07 << 5) == 0xC0) { + type = 1; + u32 = u & 0x1F; + continue; + } //2 utf-8 char + if ((u & 0x0F << 4) == 0xE0) { + type = 2; + u32 = u & 0x0F; + continue; + } //3 utf-8 char + if ((u & 0x1F << 3) == 0xF0) { + type = 3; + u32 = u & 0x07; + continue; + } //4 utf-8 char + err |= UTF_ERROR_ILLCHAR; + continue; + } + else { + if ((u & 0xC0) == 0x80) { + u32 = (u32 << 6) | (u & 0x3F); + type--; + } + else { + u32 = 0; + type = 0; + err |= UTF_ERROR_ILLSEQ; + } + } + if (type == 0) { + if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) { + *out16 = u32; + out16++; + } + else if (0x10000 <= u32 && u32 < 0x110000) { + if (out16 + 1 >= out16end) + break; + u32 -= 0x10000; + *out16 = 0xD800 + (u32 >> 10); + out16++; + *out16 = 0xDC00 + (u32 & 0x3FF); + out16++; + } + u32 = 0; + } + } + + *out16 = *out16end = 0; + + if (*in8) + err |= UTF_ERROR_SMALL; + + return err; } /* UNUSED FUNCTIONS */ #if 0 static int is_ascii(const char *in8) { - for (; *in8; in8++) - if (0x80 & *in8) return 0; + for (; *in8; in8++) + if (0x80 & *in8) return 0; - return 1; + return 1; } static void utf_8_cut_end(char *inout8, size_t maxcutpoint) { - char *cur = inout8 + maxcutpoint; - char cc; - if (!inout8) return; + char *cur = inout8 + maxcutpoint; + char cc; + if (!inout8) return; - cc = *cur; + cc = *cur; } #endif - char *alloc_utf_8_from_16(const wchar_t *in16, size_t add) { - size_t bsize = count_utf_8_from_16(in16); - char *out8 = NULL; - if (!bsize) return NULL; - out8 = (char *)malloc(sizeof(char) * (bsize + add)); - conv_utf_16_to_8(in16, out8, bsize); - return out8; + size_t bsize = count_utf_8_from_16(in16); + char *out8 = NULL; + if (!bsize) + return NULL; + out8 = (char *)malloc(sizeof(char) * (bsize + add)); + conv_utf_16_to_8(in16, out8, bsize); + return out8; } wchar_t *alloc_utf16_from_8(const char *in8, size_t add) { - size_t bsize = count_utf_16_from_8(in8); - wchar_t *out16 = NULL; - if (!bsize) return NULL; - out16 = (wchar_t *) malloc(sizeof(wchar_t) * (bsize + add)); - conv_utf_8_to_16(in8, out16, bsize); - return out16; + size_t bsize = count_utf_16_from_8(in8); + wchar_t *out16 = NULL; + if (!bsize) + return NULL; + out16 = (wchar_t *)malloc(sizeof(wchar_t) * (bsize + add)); + conv_utf_8_to_16(in8, out16, bsize); + return out16; } |