Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/lexborisov/Modest.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'source/myencoding/encoding.c')
-rw-r--r--source/myencoding/encoding.c1244
1 files changed, 1244 insertions, 0 deletions
diff --git a/source/myencoding/encoding.c b/source/myencoding/encoding.c
new file mode 100644
index 0000000..515e059
--- /dev/null
+++ b/source/myencoding/encoding.c
@@ -0,0 +1,1244 @@
+/*
+ Copyright (C) 2015-2017 Alexander Borisov
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ Author: lex.borisov@gmail.com (Alexander Borisov)
+*/
+
+#include "myencoding/encoding.h"
+#include "myencoding/resource.h"
+#include "mycore/utils/resources.h"
+
+myencoding_custom_f myencoding_get_function_by_id(myencoding_t idx)
+{
+ return myencoding_function_index[idx];
+}
+
+myencoding_status_t myencoding_decode_utf_8(unsigned const char data, myencoding_result_t *res)
+{
+ // res->first -- lower boundary
+ // res->second -- upper boundary
+ // res->result -- code point
+ // res->third -- bytes seen
+ // res->flag -- bytes needed
+
+ if(res->flag == 0)
+ {
+ if(data <= 0x7F) {
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+ else if(data >= 0xC2 && data <= 0xDF) {
+ res->flag = 1;
+ res->result = data - 0xC0;
+ }
+ else if(data >= 0xE0 && data <= 0xEF) {
+ if(data == 0xE0) {
+ res->first = 0xA0;
+ }
+ else if(data == 0xED){
+ res->second = 0x9F;
+ }
+
+ res->flag = 2;
+ res->result = data - 0xE0;
+ }
+ else if(data >= 0xF0 && data <= 0xF4) {
+ if(data == 0xF0) {
+ res->first = 0x90;
+ }
+ else if(data == 0xF4){
+ res->second = 0x8F;
+ }
+
+ res->flag = 2;
+ res->result = data - 0xF0;
+ }
+ else
+ return MyENCODING_STATUS_ERROR;
+
+ res->result = res->result << (6 * res->flag);
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ if(data < res->first && data > res->second)
+ {
+ res->result = 0x00;
+ res->flag = 0x00;
+ res->third = 0x00;
+ res->first = 0x80;
+ res->second = 0xBF;
+
+ return MyENCODING_STATUS_DONE|MyENCODING_STATUS_ERROR;
+ }
+
+ res->first = 0x80;
+ res->second = 0xBF;
+
+ res->third++;
+ res->result += (unsigned long)(data - 0x80) << (6 * (res->flag - res->third));
+
+ if(res->third != res->flag)
+ return MyENCODING_STATUS_CONTINUE;
+
+ res->flag = 0x00;
+ res->third = 0x00;
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_ibm866(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_ibm866[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_2(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_2[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_3(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_3[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_4(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_4[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_5(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_5[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_6(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_6[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_7(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_7[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_8(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_8[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_8_i(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_8[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_10(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_10[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_13(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_13[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_14(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_14[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_15(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_15[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_iso_8859_16(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_iso_8859_16[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_koi8_r(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_koi8_r[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_koi8_u(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_koi8_u[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_macintosh(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_macintosh[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_874(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_874[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1250(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1250[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1251(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1251[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1252(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1252[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1253(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1253[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1254(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1254[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1255(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1255[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1256(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1256[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1257(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1257[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_windows_1258(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_windows_1258[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_x_mac_cyrillic(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ (res->result = data);
+ else
+ (res->result = myencoding_map_x_mac_cyrillic[(data - 0x80)]);
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_gbk(unsigned const char data, myencoding_result_t *res)
+{
+ return myencoding_decode_gb18030(data, res);
+}
+
+unsigned long myencoding_index_gb18030_ranges_code_point(unsigned long pointer)
+{
+ // step 1
+ if((pointer > 39419 && pointer < 189000) || pointer > 1237575)
+ return 0;
+
+ // step 2
+ if(pointer == 7457)
+ return 0xe7c7;
+
+ // step 3
+ unsigned long offset = sizeof(myencoding_map_gb18030_ranges) / (sizeof(unsigned long) * 2);
+
+ unsigned long code_point_offset = 0;
+ while (offset) {
+ offset--;
+
+ if(myencoding_map_gb18030_ranges[offset][0] == pointer ||
+ myencoding_map_gb18030_ranges[offset][0] < pointer)
+ {
+ code_point_offset = myencoding_map_gb18030_ranges[offset][1];
+ break;
+ }
+ }
+
+ // step 4
+ return (code_point_offset + pointer - offset);
+}
+
+myencoding_status_t myencoding_decode_gb18030(unsigned const char data, myencoding_result_t *res)
+{
+ if(res->third) {
+ if(data >= 0x30 && data <= 0x39) {
+ res->result = myencoding_index_gb18030_ranges_code_point((((res->first - 0x81) * 10 + res->second - 0x30) *
+ 126 + res->third - 0x81) * 10 + data - 0x30);
+ }
+
+ res->first = 0;
+ res->second = 0;
+ res->third = 0;
+
+ if(res->result)
+ return MyENCODING_STATUS_OK;
+
+ return MyENCODING_STATUS_ERROR;
+ }
+ else if(res->second) {
+ if(data >= 0x81 && data <= 0xFE) {
+ res->third = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ res->first = 0;
+ res->second = 0;
+
+ return MyENCODING_STATUS_ERROR;
+ }
+ else if(res->first) {
+ if(data >= 0x30 && data <= 0x39) {
+ res->second = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ unsigned long lead = res->first, pointer = 0x00;
+ res->first = 0x00;
+
+ unsigned char offset;
+
+ if(data < 0x7F)
+ offset = 0x40;
+ else
+ offset = 0x41;
+
+ if((data >= 0x40 && data <= 0x7E) ||
+ (data >= 0x80 && data <= 0xFE))
+ {
+ pointer = (lead - 0x81) * 190 + (data - offset);
+ }
+ else {
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ res->result = myencoding_map_gb18030[pointer];
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data <= 0x7F) {
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data == 0x80) {
+ res->result = 0x20ac;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data >= 0x81 && data <= 0xFE) {
+ res->first = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ return MyENCODING_STATUS_ERROR;
+}
+
+myencoding_status_t myencoding_decode_big5(unsigned const char data, myencoding_result_t *res)
+{
+ if(res->first)
+ {
+ unsigned long lead = res->first;
+ unsigned long pointer = 0x00;
+ unsigned long offset;
+
+ res->first = 0x00;
+
+ if(data < 0x7F)
+ offset = 0x40;
+ else
+ offset = 0x62;
+
+ if((data >= 0x40 && data <= 0x7E) ||
+ (data >= 0xA1 && data <= 0xFE))
+ {
+ pointer = (lead - 0x81) * 157 + (data - offset);
+ }
+
+ switch (pointer) {
+ case 1133:
+ // U+00CA U+0304
+ res->result = 0x00ca;
+ res->result_aux = 0x0304;
+ return MyENCODING_STATUS_OK;
+ case 1135:
+ // U+00CA U+030C
+ res->result = 0x00ca;
+ res->result_aux = 0x030c;
+ return MyENCODING_STATUS_OK;
+ case 1164:
+ // U+00EA U+0304
+ res->result = 0x00ea;
+ res->result_aux = 0x0304;
+ return MyENCODING_STATUS_OK;
+ case 1166:
+ // U+00EA U+030C
+ res->result = 0x00ea;
+ res->result_aux = 0x030c;
+ return MyENCODING_STATUS_OK;
+ default:
+ break;
+ }
+
+ if(pointer == 0)
+ return MyENCODING_STATUS_ERROR;
+
+ res->result = myencoding_map_big5[pointer];
+ res->result_aux = 0;
+
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data <= 0x7F) {
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data >= 0x81 && data <= 0xFE) {
+ res->first = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ return MyENCODING_STATUS_ERROR;
+}
+
+myencoding_status_t myencoding_decode_euc_jp(unsigned const char data, myencoding_result_t *res)
+{
+ if(res->first == 0x8E && (data >= 0xA1 && data <= 0xDF)) {
+ res->first = 0x00;
+ res->result = 0xFF61 + data - 0xA1;
+
+ return MyENCODING_STATUS_OK;
+ }
+ else if(res->first == 0x8F && (data >= 0xA1 && data <= 0xFE)) {
+ res->flag = 1;
+ res->first = data;
+
+ return MyENCODING_STATUS_CONTINUE;
+ }
+ else if(res->first)
+ {
+ unsigned long lead = res->first;
+ res->first = 0x00;
+
+ if((lead >= 0xA1 && lead <= 0xFE) &&
+ (data >= 0xA1 && data <= 0xFE))
+ {
+ unsigned long idx = (lead - 0xA1) * 94 + data - 0xA1;
+ if(res->flag) {
+ res->result = myencoding_map_jis0212[idx];
+ }
+ else {
+ res->result = myencoding_map_jis0208[idx];
+ }
+
+ res->flag = 0;
+
+ if(res->result == 0)
+ return MyENCODING_STATUS_ERROR;
+
+ return MyENCODING_STATUS_OK;
+ }
+ }
+
+ if(data <= 0x7F) {
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if((data >= 0x8E && data <= 0x8F) ||
+ (data >= 0xA1 && data <= 0xFE))
+ {
+ res->first = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ return MyENCODING_STATUS_ERROR;
+}
+
+myencoding_status_t myencoding_decode_iso_2022_jp(unsigned const char data, myencoding_result_t *res)
+{
+ // res->first -- lead
+ // res->second -- state
+ // res->third -- output state
+ // res->flag -- output flag
+
+ switch (res->second) {
+ case 0: // ASCII
+ {
+ if(data == 0x1B) {
+ res->second = 6;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+ else if((data <= 0x7F) &&
+ data != 0x0E && data != 0x0F && data != 0x1B)
+ {
+ res->flag = 0;
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ res->flag = 0;
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ case 1: // Roman
+ {
+ if(data == 0x1B) {
+ res->second = 6;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+ else if(data == 0x5C) {
+ res->flag = 0;
+ res->result = 0x00A5;
+ return MyENCODING_STATUS_OK;
+ }
+ else if(data == 0x7E) {
+ res->flag = 0;
+ res->result = 0x203E;
+ return MyENCODING_STATUS_OK;
+ }
+ else if((data <= 0x7F) &&
+ data != 0x0E && data != 0x0F &&
+ data != 0x1B && data != 0x5C &&
+ data != 0x7E)
+ {
+ res->flag = 0;
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ res->flag = 0;
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ case 3: // Katakana
+ {
+ if(data == 0x1B) {
+ res->second = 6;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+ else if(data >= 0x21 && data <= 0x5F) {
+ res->flag = 0;
+ res->result = 0xFF61 + data - 0x21;
+
+ return MyENCODING_STATUS_OK;
+ }
+
+ res->flag = 0;
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ case 4: // Lead byte
+ {
+ if(data == 0x1B) {
+ res->second = 6;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+ else if(data >= 0x21 && data <= 0x7E) {
+ res->flag = 0;
+ res->first = data;
+ res->second = 5;
+
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ res->flag = 0;
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ case 5: // Trail byte
+ {
+ if(data == 0x1B) {
+ res->second = 6;
+ return MyENCODING_STATUS_CONTINUE|MyENCODING_STATUS_ERROR;
+ }
+ else if(data >= 0x21 && data <= 0x7E) {
+ res->second = 4;
+
+ unsigned long pointer = (res->first - 0x21) * 94 + data - 0x21;
+ res->result = myencoding_map_jis0208[pointer];
+
+ if(res->result == 0)
+ return MyENCODING_STATUS_ERROR;
+
+ return MyENCODING_STATUS_OK;
+ }
+
+ res->second = 4;
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ case 6: // Escape start
+ {
+ if(data == 0x24 || data == 0x28) {
+ res->first = data;
+ res->second = 7;
+
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ res->flag = 0;
+ res->second = res->third;
+
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ case 7: // Escape
+ {
+ unsigned long lead = res->first;
+ res->first = 0x00;
+
+ res->second = 0x00;
+
+ if(lead == 0x28 && data == 0x42) {
+ res->second = 0;
+ }
+ else if(lead == 0x28 && data == 0x4A) {
+ res->second = 1;
+ }
+ else if(lead == 0x28 && data == 0x49) {
+ res->second = 2;
+ }
+ else if(lead == 0x24 && (data == 0x40 || data == 0x42)) {
+ res->second = 4;
+ }
+
+ if(res->second)
+ {
+ res->third = res->second;
+
+ unsigned long output_flag = res->flag;
+ res->flag = 1;
+
+ if(output_flag)
+ return MyENCODING_STATUS_ERROR;
+
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ res->flag = 0;
+ res->second = res->third;
+
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+myencoding_status_t myencoding_decode_shift_jis(unsigned const char data, myencoding_result_t *res)
+{
+ // res->first -- lead
+
+ if(res->first)
+ {
+ unsigned long offset;
+ unsigned long lead_offset;
+
+ if(data < 0x7F)
+ offset = 0x40;
+ else
+ offset = 0x41;
+
+ if(res->first < 0xA0)
+ lead_offset = 0x81;
+ else
+ lead_offset = 0xC1;
+
+ unsigned long pointer = 0x00;
+ if((data >= 0x40 && data <= 0x7E) ||
+ (data >= 0x80 && data <= 0xFC))
+ {
+ pointer = (res->first - lead_offset) * 188 + data - offset;
+ }
+
+ if(pointer)
+ res->result = myencoding_map_jis0208[pointer];
+ else
+ res->result = 0x00;
+
+ if(res->result == 0x00 && (pointer >= 8836 && pointer <= 10528)) {
+ res->result = 0xE000 + pointer - 8836;
+ }
+
+ if(res->result)
+ return MyENCODING_STATUS_OK;
+
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ if((data <= 0x7F) || data == 0x80) {
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data >= 0xA1 && data <= 0xDF) {
+ res->result = 0xFF61 + data - 0xA1;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if((data >= 0x81 && data <= 0x9F) ||
+ (data >= 0xE0 && data <= 0xFC)) {
+ res->first = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ return MyENCODING_STATUS_ERROR;
+}
+
+myencoding_status_t myencoding_decode_euc_kr(unsigned const char data, myencoding_result_t *res)
+{
+ // res->first -- lead
+
+ if(res->first)
+ {
+ unsigned long lead = res->first, pointer = 0x00;
+ res->first = 0x00;
+
+ if(data >= 0x41 && data <= 0xFE) {
+ pointer = (lead - 0x81) * 190 + (data - 0x41);
+ }
+
+ if(pointer)
+ res->result = myencoding_map_euc_kr[pointer];
+
+ if(res->result)
+ return MyENCODING_STATUS_OK;
+
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ if(data <= 0x7F) {
+ res->result = data;
+ return MyENCODING_STATUS_OK;
+ }
+
+ if(data >= 0x81 && data <= 0xFE) {
+ res->first = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ return MyENCODING_STATUS_ERROR;
+}
+
+myencoding_status_t myencoding_decode_shared_utf_16(unsigned const char data, myencoding_result_t *res)
+{
+ // res->first -- lead
+ // res->second -- lead surrogate
+ // res->flag -- flag
+
+ if(res->first == 0x00) {
+ res->first = data;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ unsigned long code_unit;
+ if(res->flag)
+ (code_unit = (res->first << 8) + data);
+ else
+ (code_unit = (unsigned long)(data << 8) + res->first);
+
+ res->first = 0x00;
+
+ if(res->second) {
+ unsigned long lead_surrogate = res->second;
+ res->second = 0x00;
+
+ if(code_unit >= 0xDC00 && code_unit <= 0xDFFF) {
+ res->result = 0x10000 + ((lead_surrogate - 0xD800) << 10) + (code_unit - 0xDC00);
+ return MyENCODING_STATUS_OK;
+ }
+
+ unsigned char byte1 = (unsigned char)(code_unit >> 8);
+ unsigned char byte2 = (unsigned char)(code_unit & 0x00FF);
+
+ if(res->flag) {
+ res->result = byte1;
+ res->result_aux = byte2;
+ }
+ else {
+ res->result = byte2;
+ res->result_aux = byte1;
+ }
+
+ return MyENCODING_STATUS_DONE|MyENCODING_STATUS_ERROR;
+ }
+
+ if(code_unit >= 0xD800 && code_unit <= 0xDBFF) {
+ res->second = code_unit;
+ return MyENCODING_STATUS_CONTINUE;
+ }
+
+ if(code_unit >= 0xDC00 && code_unit <= 0xDFFF) {
+ return MyENCODING_STATUS_ERROR;
+ }
+
+ res->result = code_unit;
+
+ return MyENCODING_STATUS_OK;
+}
+
+myencoding_status_t myencoding_decode_utf_16be(unsigned const char data, myencoding_result_t *res)
+{
+ if(res->flag == 0)
+ res->flag = 1;
+
+ return myencoding_decode_shared_utf_16(data, res);
+}
+
+myencoding_status_t myencoding_decode_utf_16le(unsigned const char data, myencoding_result_t *res)
+{
+ if(res->flag)
+ res->flag = 0;
+
+ return myencoding_decode_shared_utf_16(data, res);
+}
+
+myencoding_status_t myencoding_decode_x_user_defined(unsigned const char data, myencoding_result_t *res)
+{
+ if(data <= 0x7F)
+ res->result = data;
+ else
+ res->result = 0xF780 + data - 0x80;
+
+ return MyENCODING_STATUS_OK;
+}
+
+void myencoding_result_clean(myencoding_result_t *res)
+{
+ memset(res, 0, sizeof(myencoding_result_t));
+}
+
/*
 * Number of bytes needed to write a code point in UTF-8 style encoding.
 * Lengths 5 and 6 are reported for values up to 0x7FFFFFFF even though they
 * are not used by Unicode. Returns 0 for anything larger.
 */
size_t myencoding_codepoint_ascii_length(size_t codepoint)
{
    /* Largest value representable with 1, 2, ... 6 bytes. */
    static const size_t upper_limit[] = {
        0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF
    };

    for (size_t n = 0; n < sizeof(upper_limit) / sizeof(upper_limit[0]); n++) {
        if (codepoint <= upper_limit[n])
            return n + 1;
    }

    return 0;
}
+
/*
 * Length of a UTF-8 style sequence, judged from its first byte alone.
 *
 *   0xxxxxxx -> 1      11110xxx -> 4
 *   110xxxxx -> 2      111110xx -> 5
 *   1110xxxx -> 3      1111110x -> 6
 *
 * Returns 0 for a byte that cannot start a sequence (a continuation byte
 * 10xxxxxx, 0xFE or 0xFF).
 */
size_t myencoding_ascii_utf_8_length(const unsigned char data)
{
    if (data < 0x80){
        return 1;
    }
    else if ((data & 0xe0) == 0xc0) {
        return 2;
    }
    else if ((data & 0xf0) == 0xe0) {
        return 3;
    }
    else if ((data & 0xf8) == 0xf0) {
        return 4;
    }
    else if ((data & 0xfc) == 0xf8) {
        return 5;
    }
    /* The lowest bit is payload (1111110x), so it is masked out: both 0xFC
       and 0xFD start a six-byte sequence. */
    else if ((data & 0xfe) == 0xfc) {
        return 6;
    }

    return 0;
}
+
/*
 * Write a code point to data as UTF-8 and return the number of bytes written.
 *
 *   up to 0x00007F -> 0xxxxxxx
 *   up to 0x0007FF -> 110xxxxx 10xxxxxx
 *   up to 0x00FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
 *   up to 0x1FFFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Larger values are not encoded: nothing is written and 0 is returned.
 * No terminator is appended; data must have room for up to 4 bytes.
 */
size_t myencoding_codepoint_to_ascii_utf_8(size_t codepoint, char *data)
{
    size_t length;
    unsigned char lead_marker;

    if (codepoint <= 0x0000007F) {
        data[0] = (char)codepoint;
        return 1;
    }

    if (codepoint <= 0x000007FF) {
        length      = 2;
        lead_marker = 0xC0;
    }
    else if (codepoint <= 0x0000FFFF) {
        length      = 3;
        lead_marker = 0xE0;
    }
    else if (codepoint <= 0x001FFFFF) {
        length      = 4;
        lead_marker = 0xF0;
    }
    else {
        return 0;
    }

    /* Fill the continuation bytes from the tail, six payload bits each. */
    for (size_t pos = length - 1; pos > 0; pos--) {
        data[pos] = (char)(0x80 | (codepoint & 0x3F));
        codepoint >>= 6;
    }

    /* Whatever is left belongs in the lead byte. */
    data[0] = (char)(lead_marker | codepoint);

    return length;
}
+
+size_t myencoding_codepoint_to_lowercase_ascii_utf_8(size_t codepoint, char *data)
+{
+ /* 0x80 -- 10xxxxxx */
+ /* 0xC0 -- 110xxxxx */
+ /* 0xE0 -- 1110xxxx */
+ /* 0xF0 -- 11110xxx */
+ /* 0xF8 -- 111110xx */
+ /* 0xFC -- 1111110x */
+
+ if (codepoint <= 0x0000007F) {
+ /* 0xxxxxxx */
+ data[0] = (char)mycore_string_chars_lowercase_map[ codepoint ];
+ return 1;
+ }
+ else if (codepoint <= 0x000007FF) {
+ /* 110xxxxx 10xxxxxx */
+ data[0] = (char)(0xC0 | (codepoint >> 6 ));
+ data[1] = (char)(0x80 | (codepoint & 0x3F));
+
+ return 2;
+ }
+ else if (codepoint <= 0x0000FFFF) {
+ /* 1110xxxx 10xxxxxx 10xxxxxx */
+ data[0] = (char)(0xE0 | ((codepoint >> 12)));
+ data[1] = (char)(0x80 | ((codepoint >> 6 ) & 0x3F));
+ data[2] = (char)(0x80 | ( codepoint & 0x3F));
+
+ return 3;
+ }
+ else if (codepoint <= 0x001FFFFF) {
+ /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ data[0] = (char)(0xF0 | ( codepoint >> 18));
+ data[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+ data[2] = (char)(0x80 | ((codepoint >> 6 ) & 0x3F));
+ data[3] = (char)(0x80 | ( codepoint & 0x3F));
+
+ return 4;
+ }
+ /* not uses in unicode */
+ // else if (codepoint <= 0x03FFFFFF) {
+ // /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ // data[0] = 0xF8 | ( codepoint >> 24);
+ // data[1] = 0x80 | ((codepoint >> 18) & 0x3F);
+ // data[2] = 0x80 | ((codepoint >> 12) & 0x3F);
+ // data[3] = 0x80 | ((codepoint >> 6 ) & 0x3F);
+ // data[4] = 0x80 | ( codepoint & 0x3F);
+ //
+ // return 5;
+ // }
+ // else if (codepoint <= 0x7FFFFFFF) {
+ // /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ // data[0] = 0xFC | ( codepoint >> 30);
+ // data[1] = 0x80 | ((codepoint >> 24) & 0x3F);
+ // data[2] = 0x80 | ((codepoint >> 18) & 0x3F);
+ // data[3] = 0x80 | ((codepoint >> 12) & 0x3F);
+ // data[4] = 0x80 | ((codepoint >> 6 ) & 0x3F);
+ // data[5] = 0x80 | ( codepoint & 0x3F);
+ //
+ // return 6;
+ // }
+
+ return 0;
+}
+
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes starting at data.
 *
 * The length is chosen from the first byte only. Following bytes are read
 * without checking that they exist or that they are continuation bytes;
 * only their top bit is cleared before they are merged.
 *
 * Returns the number of bytes consumed and stores the value in *codepoint,
 * or returns 0 (leaving *codepoint untouched) when the first byte does not
 * start a 1..4 byte sequence.
 */
size_t myencoding_ascii_utf_8_to_codepoint(const unsigned char* data, size_t* codepoint)
{
    const unsigned char lead = data[0];

    if (lead < 0x80) {
        /* 0xxxxxxx */
        *codepoint = (size_t)lead;
        return 1;
    }

    if ((lead & 0xe0) == 0xc0) {
        /* 110xxxxx 10xxxxxx */
        *codepoint = ((size_t)(lead & 0x1F) << 6)
                   |  (size_t)(data[1] & 0x7F);
        return 2;
    }

    if ((lead & 0xf0) == 0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *codepoint = ((size_t)(lead    & 0x0F) << 12)
                   | ((size_t)(data[1] & 0x7F) << 6)
                   |  (size_t)(data[2] & 0x7F);
        return 3;
    }

    if ((lead & 0xf8) == 0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *codepoint = ((size_t)(lead    & 0x07) << 18)
                   | ((size_t)(data[1] & 0x7F) << 12)
                   | ((size_t)(data[2] & 0x7F) << 6)
                   |  (size_t)(data[3] & 0x7F);
        return 4;
    }

    return 0;
}
+
/*
 * Write a code point to data as UTF-16 with the high byte of each code unit
 * first. Values that fit in 16 bits produce one code unit (2 bytes);
 * anything larger is written as a surrogate pair (4 bytes).
 * Returns the number of bytes written; no terminator is appended.
 */
size_t myencoding_codepoint_to_ascii_utf_16(size_t codepoint, char *data)
{
    if ((codepoint >> 16) == 0) {
        data[0] = (char)(codepoint >> 8);
        data[1] = (char)codepoint;

        return 2;
    }

    /* Split the value above 0x10000 into two 10-bit halves. */
    const size_t offset = codepoint - 0x10000;
    const size_t pair[2] = {
        0xD800 | (offset >> 10),
        0xDC00 | (offset & 0x3FF)
    };

    data[0] = (char)(pair[0] >> 8);
    data[1] = (char)pair[0];
    data[2] = (char)(pair[1] >> 8);
    data[3] = (char)pair[1];

    return 4;
}
+
+size_t myencoding_convert_to_ascii_utf_8(mycore_string_raw_t* raw_str, const char* buff, size_t length, myencoding_t encoding)
+{
+ if(raw_str->data == NULL) {
+ raw_str->size = length + 1;
+ raw_str->length = 0;
+ raw_str->data = mycore_malloc(sizeof(char) * raw_str->size);
+
+ if(raw_str->data == NULL)
+ return 0;
+ }
+
+ myencoding_result_t res = {0};
+
+ unsigned const char* u_buff = (unsigned const char*)buff;
+ const myencoding_custom_f func = myencoding_get_function_by_id(encoding);
+
+ size_t i;
+ for (i = 0; i < length; i++)
+ {
+ if(func(u_buff[i], &res) == MyENCODING_STATUS_OK) {
+ if((raw_str->length + 6) >= raw_str->size) {
+ size_t new_size = raw_str->length + 6 + (length / 2);
+ char *new_data = mycore_realloc(raw_str->data, sizeof(char) * new_size);
+
+ if(new_data == NULL) {
+ return 0;
+ }
+
+ raw_str->data = new_data;
+ raw_str->size = new_size;
+ }
+
+ raw_str->length += myencoding_codepoint_to_ascii_utf_8(res.result, &raw_str->data[raw_str->length]);
+ }
+ }
+
+ return i;
+}
+
+