diff options
Diffstat (limited to 'source/myhtml/encoding_detect.c')
-rw-r--r-- | source/myhtml/encoding_detect.c | 948 |
1 files changed, 0 insertions, 948 deletions
diff --git a/source/myhtml/encoding_detect.c b/source/myhtml/encoding_detect.c deleted file mode 100644 index 4dc0842..0000000 --- a/source/myhtml/encoding_detect.c +++ /dev/null @@ -1,948 +0,0 @@ -/* - Copyright (C) 2015-2016 Alexander Borisov - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - - Author: lex.borisov@gmail.com (Alexander Borisov) -*/ - -#include "myhtml/encoding.h" -#include "myhtml/encoding_detect_resource.h" -#include "myhtml/utils/resources.h" - -myhtml_encoding_trigram_result_t myhtml_encoding_detect_by_trigram(unsigned const char *u_text, size_t length, - const myhtml_encoding_trigram_t *list, size_t list_length, - size_t max_sum_for_break) -{ - myhtml_encoding_trigram_result_t res = {0, 0}; - - for (size_t i = 0; i < (length - 3); i++) { - if(u_text[i] > 127) - { - for (size_t j = 0; j < list_length; j++) - { - if(memcmp(list[j].trigram, &u_text[i], 3) == 0) { - res.value += list[j].value; - res.count++; - - if(res.value >= max_sum_for_break) - i = length; - - break; - } - } - } - } - - return res; -} - -bool myhtml_encoding_detect_russian_has_end(myhtml_encoding_trigram_result_t *res, size_t min_count, size_t min_value) -{ - if(res->value >= min_value || res->count >= min_count) - return true; - - return false; -} - -bool myhtml_encoding_detect_unicode_has_end(myhtml_encoding_unicode_result_t *res, size_t max_bad_percent) -{ - if(res->count_good == 0) { - if(res->count_bad) - return false; - - return true; - } - else if(res->count_bad == 0) - return true; - - size_t percent_bad = (res->count_bad * 100) / res->count_good; - if(percent_bad < max_bad_percent) - return true; - - return false; -} - -myhtml_encoding_unicode_result_t myhtml_encoding_detect_utf_8(unsigned const char *u_text, size_t length) -{ - size_t i = 0; - myhtml_encoding_unicode_result_t res = {0, 0, 0}; - - while(i < length) - { - if((u_text[i] & 0x80) == 0x00) { - i++; - res.count_ascii++; - } - else if((u_text[i] & 0xE0) == 0xC0) { - i += 2; - - if(i >= length) - break; - - if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0)) - res.count_good++; - else - res.count_bad++; - } - else if((u_text[i] & 0xF0) == 0xE0) { - i += 3; - - if(i >= length) - break; - - if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) && - ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0)) { - res.count_good++; - } - else - res.count_bad++; - } - else if((u_text[i] & 0xF8) == 0xF0) { - i += 4; - - if(i >= length) - break; - - if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) && - ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0) && - ((u_text[i - 3] & 0x80) && (u_text[i - 3] & 0x40) == 0)) { - res.count_good++; - } - else - res.count_bad++; - } - else { - i++; - res.count_bad++; - } - } - - return res; -} - -myhtml_encoding_unicode_result_t myhtml_encoding_detect_utf_16(unsigned const char *u_text, size_t length) -{ - size_t i = 0; - myhtml_encoding_unicode_result_t res = {0, 0, 0}; - - while(i < length) - { - if(u_text[i] == 0x00) { - if((i % 2) == 0) { - i++; - - if(u_text[i] > 0x1F && u_text[i] < 0x7F) - res.count_bad++; - } - else { - if(u_text[(i - 1)] > 0x1F && u_text[(i - 1)] < 0x7F) - res.count_good++; - - i++; - } - } - else - i++; - } - - return res; -} - -bool myhtml_encoding_detect_bom(const char *text, size_t length, myhtml_encoding_t *encoding) -{ - unsigned const char *u_text = (unsigned const char*)text; - - if(length > 2) { - if(u_text[0] == 0xEF && - u_text[1] == 0xBB && - u_text[2] == 0xBF) - { - *encoding = MyHTML_ENCODING_UTF_8; - return true; - } - } - - if(length > 1) { - if(u_text[0] == 0xFE && u_text[1] == 0xFF) { - *encoding = MyHTML_ENCODING_UTF_16BE; - return true; - } - - if(u_text[0] == 0xFF && u_text[1] == 0xFE) { - *encoding = MyHTML_ENCODING_UTF_16LE; - return true; - } - } - -// //for UTF-32 -// if(length > 3) { -// if(u_text[0] == 0x00 && -// u_text[1] == 0x00 && -// u_text[2] == 0xFE && -// u_text[3] == 0xFF) -// { -// *encoding = MyHTML_ENCODING_UTF_32BE; -// return true; -// } -// -// if(u_text[0] == 0xFF && -// u_text[1] == 0xFE && -// u_text[2] == 0x00 && -// u_text[3] == 0x00) -// { -// *encoding = MyHTML_ENCODING_UTF_32LE; -// return true; -// } -// } - - return false; -} - -bool myhtml_encoding_detect_and_cut_bom(const char *text, size_t length, myhtml_encoding_t *encoding, const char **new_text, size_t *new_size) -{ - if(myhtml_encoding_detect_bom(text, length, encoding)) - { - if(*encoding == MyHTML_ENCODING_UTF_8) { - *new_text = &text[3]; - *new_size = length - 3; - } - else { - *new_text = &text[2]; - *new_size = length - 2; - } - - return true; - } - - return false; -} - -bool myhtml_encoding_detect_unicode(const char *text, size_t length, myhtml_encoding_t *encoding) -{ - unsigned const char *u_text = (unsigned const char*)text; - *encoding = MyHTML_ENCODING_DEFAULT; - - myhtml_encoding_unicode_result_t res = myhtml_encoding_detect_utf_16(u_text, length); - - if(res.count_bad == 0 && res.count_good >= 3) { - *encoding = MyHTML_ENCODING_UTF_16LE; - return true; - } - else if(res.count_bad >= 3 && res.count_good == 0) { - *encoding = MyHTML_ENCODING_UTF_16BE; - return true; - } - - res = myhtml_encoding_detect_utf_8(u_text, length); - if(myhtml_encoding_detect_unicode_has_end(&res, 10)) { - *encoding = MyHTML_ENCODING_UTF_8; - return true; - } - - return false; -} - -bool myhtml_encoding_detect_russian(const char *text, size_t length, myhtml_encoding_t *encoding) -{ - unsigned const char *u_text = (unsigned const char*)text; - - size_t min_count = 50; - size_t min_value = 100000; - size_t max_value = 0; - - *encoding = MyHTML_ENCODING_DEFAULT; - - myhtml_encoding_trigram_result_t - res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_windows_1251, 1000, min_value); - if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) { - *encoding = MyHTML_ENCODING_WINDOWS_1251; - return true; - } - - max_value = res.value; - if(max_value) { - *encoding = MyHTML_ENCODING_WINDOWS_1251; - } - - res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_koi8_r, 1000, min_value); - if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) { - *encoding = MyHTML_ENCODING_KOI8_R; - return true; - } - - if(max_value < res.value) { - *encoding = MyHTML_ENCODING_KOI8_R; - max_value = res.value; - } - - res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_iso_8859_5, 1000, min_value); - if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) { - *encoding = MyHTML_ENCODING_ISO_8859_5; - return true; - } - - if(max_value < res.value) { - *encoding = MyHTML_ENCODING_ISO_8859_5; - max_value = res.value; - } - - res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_x_mac_cyrillic, 1000, min_value); - if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) { - *encoding = MyHTML_ENCODING_X_MAC_CYRILLIC; - return true; - } - - if(max_value < res.value) { - *encoding = MyHTML_ENCODING_X_MAC_CYRILLIC; - max_value = res.value; - } - - res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_ibm866, 1000, min_value); - if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) { - *encoding = MyHTML_ENCODING_IBM866; - return true; - } - - if(max_value < res.value) { - *encoding = MyHTML_ENCODING_IBM866; - } - - return false; -} - -bool myhtml_encoding_detect(const char *text, size_t length, myhtml_encoding_t *encoding) -{ - *encoding = MyHTML_ENCODING_DEFAULT; - - if(myhtml_encoding_detect_unicode(text, length, encoding)) - return true; - - if(myhtml_encoding_detect_russian(text, length, encoding)) - return true; - - return false; -} - -const myhtml_encoding_detect_name_entry_t * myhtml_encoding_name_entry_by_name(const char* name, size_t length) -{ - size_t idx = ((myhtml_string_chars_lowercase_map[ (const unsigned char)name[0] ] * - myhtml_string_chars_lowercase_map[ (const unsigned char)name[(length - 1)] ] * - length) - % MyHTML_ENCODING_DETECT_NAME_STATIC_SIZE) + 1; - - while (myhtml_encoding_detect_name_entry_static_list_index[idx].label) - { - if(myhtml_encoding_detect_name_entry_static_list_index[idx].label_length == length) { - if(myhtml_strncasecmp(myhtml_encoding_detect_name_entry_static_list_index[idx].label, name, length) == 0) - return &myhtml_encoding_detect_name_entry_static_list_index[idx]; - - if(myhtml_encoding_detect_name_entry_static_list_index[idx].next) - idx = myhtml_encoding_detect_name_entry_static_list_index[idx].next; - else - return NULL; - } - else if(myhtml_encoding_detect_name_entry_static_list_index[idx].label_length > length) { - return NULL; - } - else { - idx = myhtml_encoding_detect_name_entry_static_list_index[idx].next; - } - } - - return NULL; -} - -bool myhtml_encoding_by_name(const char *name, size_t length, myhtml_encoding_t *encoding) -{ - const myhtml_encoding_detect_name_entry_t *entry = myhtml_encoding_name_entry_by_name(name, length); - - if(entry) { - if(encoding) - *encoding = entry->encoding; - - return true; - } - - return false; -} - -const char * myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *length) -{ - if(encoding >= MyHTML_ENCODING_LAST_ENTRY) { - if(length) { - *length = 0; - } - - return NULL; - } - - const myhtml_encoding_entry_name_index_t *entry = &myhtml_encoding_entry_name_index_static_list_index[encoding]; - - if(length) { - *length = entry->length; - } - - return entry->name; -} - -/* - When an algorithm requires a user agent to prescan a byte stream to determine its encoding, - given some defined end condition, then it must run the following steps. - These steps either abort unsuccessfully or return a character encoding. - If at any point during these steps (including during instances of the get an attribute algorithm invoked by this one) - the user agent either runs out of bytes (meaning the position pointer created in the first step below goes beyond the end of the byte stream obtained so far) - or reaches its end condition, then abort the prescan a byte stream to determine its encoding algorithm unsuccessfully. -*/ - -bool myhtml_encoding_algorithm_extracting_character_encoding_from_meta_element(const char *data, size_t data_size, myhtml_encoding_t *encoding) -{ - *encoding = MyHTML_ENCODING_NOT_DETERMINED; - - /* 1 */ - size_t length = 0; - size_t charset_length = strlen("charset"); - - bool is_get_pos = false; - const unsigned char *udata = (const unsigned char *)data; - - /* 2 */ - while((length + charset_length) < data_size) { - if(myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[length])) - { - length += charset_length; - - /* 2 */ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20) - { - break; - } - - length++; - } - - /* 4 */ - if(udata[length] == 0x3D) { /* EQUALS SIGN (=) */ - is_get_pos = true; - - length++; - break; - } - } - - length++; - } - - if(is_get_pos == false || length >= data_size) - return false; - - /* 5 */ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20) - { - break; - } - - length++; - } - - if(length >= data_size) - return false; - - /* 6 */ - /* " */ - if(udata[length] == 0x22) - { - length++; - size_t begin = length; - - while(length < data_size) { - if(udata[length] == 0x22) - return myhtml_encoding_by_name(&data[begin], (length - begin), encoding); - - length++; - } - - return false; - } - - /* ' */ - if(udata[length] == 0x27) - { - length++; - size_t begin = length; - - while(length < data_size) { - if(udata[length] == 0x27) - return myhtml_encoding_by_name(&data[begin], (length - begin), encoding); - - length++; - } - - return false; - } - - /* other */ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20) - { - size_t begin = length; - - while(length < data_size) { - /* SEMICOLON character (;) */ - if(udata[length] == 0x3B) { - return myhtml_encoding_by_name(&data[begin], (length - begin), encoding); - } - - length++; - } - - return myhtml_encoding_by_name(&data[begin], (length - begin), encoding); - } - - length++; - } - - return false; -} - -bool myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_spaces(const unsigned char *udata, size_t *data_length, size_t data_size, myhtml_encoding_detect_attr_t *attr) -{ - size_t length = *data_length; - - /* set position */ - attr->key_length = length - attr->key_begin; - - /* 6 */ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F) - { - break; - } - - length++; - } - - if(length >= data_size) { - *data_length = length; - return false; - } - - /* 7 */ - if(udata[length] != 0x3D) { - *data_length = length; - return false; - } - - /* 8 */ - *data_length = (length + 1); - return true; -} - -size_t myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(const unsigned char *udata, size_t length, size_t data_size, myhtml_encoding_detect_attr_t *attr, bool *it_last) -{ - /* 9 */ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20) - { - break; - } - - length++; - } - - if(length >= data_size) { - *it_last = true; - return length; - } - - /* 10 */ - switch (udata[length]) { - case 0x22: /* (ASCII ") */ - length++; - attr->value_begin = length; - - while(length < data_size) { - if(udata[length] == 0x22) - { - attr->value_length = length - attr->value_begin; - return (length + 1); - } - - length++; - } - - break; - - case 0x27: /* (ASCII ') */ - length++; - attr->value_begin = length; - - while(length < data_size) { - if(udata[length] == 0x27) - { - attr->value_length = length - attr->value_begin; - return (length + 1); - } - - length++; - } - - break; - - case 0x3E: /* (ASCII >) */ - *it_last = true; - return (length + 1); - - default: - attr->value_begin = length; - - while(length < data_size) { - if(udata[length] == 0x09 || udata[length] == 0x0A || udata[length] == 0x0C || - udata[length] == 0x0D || udata[length] == 0x20 || udata[length] == 0x3E) - { - attr->value_length = length - attr->value_begin; - return (length + 1); - } - - length++; - } - - break; - } - - attr->value_length = length - attr->value_begin; - return length; -} - -size_t myhtml_encoding_prescan_stream_to_determine_encoding_get_attr(const unsigned char *udata, size_t length, size_t data_size, myhtml_encoding_detect_attr_t *attr, bool *it_last) -{ - memset(attr, 0, sizeof(myhtml_encoding_detect_attr_t)); - - /* - If the byte at position is one of 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), - 0x20 (ASCII space), or 0x2F (ASCII /) then advance position to the next byte and redo this step. - */ - /* 1 */ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F) - { - break; - } - - length++; - } - - if(length >= data_size) { - *it_last = true; - return length; - } - - /* 2 */ - if(udata[length] == 0x3E) { /* (ASCII >) */ - *it_last = true; - return (length + 1); - } - - attr->key_begin = length; - - /* 3, 4 */ - while(length < data_size) { - switch (udata[length]) { - case 0x3D: /* (ASCII =) */ - if(attr->key_begin != (length - 1)) { - attr->key_length = length - attr->key_begin; - - length++; - return myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last); - } - - /* fall through */ - - case 0x09: /* (ASCII TAB) */ - case 0x0A: /* (ASCII LF) */ - case 0x0C: /* (ASCII FF) */ - case 0x0D: /* (ASCII CR) */ - case 0x20: /* (ASCII space) */ - length++; - - if(myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) { - *it_last = true; - return length; - } - - return myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last); - - case 0x2F: /* (ASCII /) */ - case 0x3E: /* (ASCII >) */ - *it_last = true; - attr->key_length = length - attr->key_begin; - - return (length + 1); - - default: - break; - } - - length++; - } - - if(myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) { - *it_last = true; - return length; - } - - return myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last); -} - -bool myhtml_encoding_prescan_stream_to_determine_encoding_check_meta(const unsigned char *udata, size_t *length, size_t data_size, myhtml_encoding_t *encoding) -{ - myhtml_encoding_detect_attr_t attr; - - bool got_pragma = false; - bool it_last = false; - - unsigned int need_pragma = 0; /* 0 = NULL, 1 = false, 2 = true */ - - /* - http-equiv = 1 - content = 2 - charset = 4 - */ - /* If the attribute's name is already in attribute list, then return to the step labeled attributes. */ - size_t is_exists = 0; - - while(*length < data_size) { - *length = myhtml_encoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last); - - /* 9 */ - if(attr.key_length == strlen("http-equiv") && - myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ])) - { - if((is_exists & 1) == 0) { - is_exists |= 1; - - if(attr.value_length == strlen("content-type") && - myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ])) - { - got_pragma = true; - } - } - } - else if(attr.key_length == strlen("content") && - myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ])) - { - if((is_exists & 2) == 0) { - is_exists |= 2; - - if(myhtml_encoding_algorithm_extracting_character_encoding_from_meta_element((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding)) { - need_pragma = 2; - } - } - } - else if(attr.key_length == strlen("charset") && - myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[ attr.key_begin ])) - { - if((is_exists & 4) == 0) { - is_exists |= 4; - - myhtml_encoding_by_name((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding); - need_pragma = 1; - } - } - - if(it_last) - break; - } - - /* 11, 12, 13 */ - if(need_pragma == 0 || (need_pragma == 2 && got_pragma == false)) { - *encoding = MyHTML_ENCODING_NOT_DETERMINED; - return false; - } - - /* 14 */ - if(*encoding == MyHTML_ENCODING_UTF_16BE || *encoding == MyHTML_ENCODING_UTF_16LE) { - *encoding = MyHTML_ENCODING_UTF_8; - } - - /* 15 */ - if(*encoding == MyHTML_ENCODING_X_USER_DEFINED) { - *encoding = MyHTML_ENCODING_WINDOWS_1252; - } - - /* 16 */ - return true; -} - -size_t myhtml_encoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char *udata, size_t length, size_t data_size) -{ - while(length < data_size) { - if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C && - udata[length] != 0x0D && udata[length] != 0x20) - { - break; - } - - length++; - } - - if(length >= data_size) - return length; - - if(udata[length] == 0x3E) { - return (length + 1); - } - - myhtml_encoding_detect_attr_t attr; - bool it_last = false; - - while(length < data_size) { - length = myhtml_encoding_prescan_stream_to_determine_encoding_get_attr(udata, length, data_size, &attr, &it_last); - - if(it_last) { - return length; - } - } - - return length; -} - -size_t myhtml_encoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char *udata, size_t length, size_t data_size) -{ - if(udata[length] == 0x2F) { /* / */ - length++; - - if(length >= data_size) - return length; - - if(myhtml_tokenizer_chars_map[ udata[length] ] == MyHTML_TOKENIZER_CHAR_A_Z_a_z) { - return myhtml_encoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size); - } - - while(length < data_size) { - if(udata[length] != 0x3E) { - return (length + 1); - } - - length++; - } - - return length; - } - else if(udata[length] == 0x21) { /* ! */ - length++; - - if((length + 2) < data_size && udata[length] == 0x2D && udata[(length+1)] == 0x2D) { - while(length < data_size) { - if(udata[length] != 0x3E) { - if(udata[(length - 1)] == 0x2D && udata[(length - 2)] == 0x2D) - return (length + 1); - - length++; - } - } - - return length; - } - - while(length < data_size) { - if(udata[length] != 0x3E) { - return (length + 1); - } - - length++; - } - - return length; - } - else if(udata[length] == 0x3F) { /* ? */ - length++; - - while(length < data_size) { - if(udata[length] != 0x3E) { - return (length + 1); - } - - length++; - } - - return length; - } - - - return myhtml_encoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size); -} - -myhtml_encoding_t myhtml_encoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size) -{ - const unsigned char* udata = (const unsigned char*)data; - myhtml_encoding_t encoding = MyHTML_ENCODING_NOT_DETERMINED; - - size_t i = 0; - while(i < data_size) { - /* 0x3C = '<' */ - if(data[i] == 0x3C) - { - if((i + 5) >= data_size) - return encoding; - - i++; - - switch (data[i]) { - /* - A sequence of bytes starting with: - 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, - and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F - (case-insensitive ASCII '<meta' followed by a space or slash) - */ - case 0x4D: - case 0x6D: - if(myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"meta", &udata[i])) { - i += 4; - - if(udata[i] == 0x09 || udata[i] == 0x0A || udata[i] == 0x0C || - udata[i] == 0x0D || udata[i] == 0x20 || udata[i] == 0x2F) - { - i++; - - if(myhtml_encoding_prescan_stream_to_determine_encoding_check_meta(udata, &i, data_size, &encoding)) - return encoding; - } - } - - break; - - default: - i = myhtml_encoding_prescan_stream_to_determine_encoding_skip_other(udata, i, data_size); - break; - } - } - else { - i++; - } - } - - return encoding; -} - - |