Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/lexborisov/Modest.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'source/myhtml/encoding_detect.c')
-rw-r--r-- source/myhtml/encoding_detect.c 948
1 files changed, 0 insertions, 948 deletions
diff --git a/source/myhtml/encoding_detect.c b/source/myhtml/encoding_detect.c
deleted file mode 100644
index 4dc0842..0000000
--- a/source/myhtml/encoding_detect.c
+++ /dev/null
@@ -1,948 +0,0 @@
-/*
- Copyright (C) 2015-2016 Alexander Borisov
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
- Author: lex.borisov@gmail.com (Alexander Borisov)
-*/
-
-#include "myhtml/encoding.h"
-#include "myhtml/encoding_detect_resource.h"
-#include "myhtml/utils/resources.h"
-
-myhtml_encoding_trigram_result_t myhtml_encoding_detect_by_trigram(unsigned const char *u_text, size_t length,
- const myhtml_encoding_trigram_t *list, size_t list_length,
- size_t max_sum_for_break)
-{
- myhtml_encoding_trigram_result_t res = {0, 0};
-
- for (size_t i = 0; i < (length - 3); i++) {
- if(u_text[i] > 127)
- {
- for (size_t j = 0; j < list_length; j++)
- {
- if(memcmp(list[j].trigram, &u_text[i], 3) == 0) {
- res.value += list[j].value;
- res.count++;
-
- if(res.value >= max_sum_for_break)
- i = length;
-
- break;
- }
- }
- }
- }
-
- return res;
-}
-
-bool myhtml_encoding_detect_russian_has_end(myhtml_encoding_trigram_result_t *res, size_t min_count, size_t min_value)
-{
- if(res->value >= min_value || res->count >= min_count)
- return true;
-
- return false;
-}
-
-bool myhtml_encoding_detect_unicode_has_end(myhtml_encoding_unicode_result_t *res, size_t max_bad_percent)
-{
- if(res->count_good == 0) {
- if(res->count_bad)
- return false;
-
- return true;
- }
- else if(res->count_bad == 0)
- return true;
-
- size_t percent_bad = (res->count_bad * 100) / res->count_good;
- if(percent_bad < max_bad_percent)
- return true;
-
- return false;
-}
-
-myhtml_encoding_unicode_result_t myhtml_encoding_detect_utf_8(unsigned const char *u_text, size_t length)
-{
- size_t i = 0;
- myhtml_encoding_unicode_result_t res = {0, 0, 0};
-
- while(i < length)
- {
- if((u_text[i] & 0x80) == 0x00) {
- i++;
- res.count_ascii++;
- }
- else if((u_text[i] & 0xE0) == 0xC0) {
- i += 2;
-
- if(i >= length)
- break;
-
- if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0))
- res.count_good++;
- else
- res.count_bad++;
- }
- else if((u_text[i] & 0xF0) == 0xE0) {
- i += 3;
-
- if(i >= length)
- break;
-
- if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
- ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0)) {
- res.count_good++;
- }
- else
- res.count_bad++;
- }
- else if((u_text[i] & 0xF8) == 0xF0) {
- i += 4;
-
- if(i >= length)
- break;
-
- if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
- ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0) &&
- ((u_text[i - 3] & 0x80) && (u_text[i - 3] & 0x40) == 0)) {
- res.count_good++;
- }
- else
- res.count_bad++;
- }
- else {
- i++;
- res.count_bad++;
- }
- }
-
- return res;
-}
-
-myhtml_encoding_unicode_result_t myhtml_encoding_detect_utf_16(unsigned const char *u_text, size_t length)
-{
- size_t i = 0;
- myhtml_encoding_unicode_result_t res = {0, 0, 0};
-
- while(i < length)
- {
- if(u_text[i] == 0x00) {
- if((i % 2) == 0) {
- i++;
-
- if(u_text[i] > 0x1F && u_text[i] < 0x7F)
- res.count_bad++;
- }
- else {
- if(u_text[(i - 1)] > 0x1F && u_text[(i - 1)] < 0x7F)
- res.count_good++;
-
- i++;
- }
- }
- else
- i++;
- }
-
- return res;
-}
-
-bool myhtml_encoding_detect_bom(const char *text, size_t length, myhtml_encoding_t *encoding)
-{
- unsigned const char *u_text = (unsigned const char*)text;
-
- if(length > 2) {
- if(u_text[0] == 0xEF &&
- u_text[1] == 0xBB &&
- u_text[2] == 0xBF)
- {
- *encoding = MyHTML_ENCODING_UTF_8;
- return true;
- }
- }
-
- if(length > 1) {
- if(u_text[0] == 0xFE && u_text[1] == 0xFF) {
- *encoding = MyHTML_ENCODING_UTF_16BE;
- return true;
- }
-
- if(u_text[0] == 0xFF && u_text[1] == 0xFE) {
- *encoding = MyHTML_ENCODING_UTF_16LE;
- return true;
- }
- }
-
-// //for UTF-32
-// if(length > 3) {
-// if(u_text[0] == 0x00 &&
-// u_text[1] == 0x00 &&
-// u_text[2] == 0xFE &&
-// u_text[3] == 0xFF)
-// {
-// *encoding = MyHTML_ENCODING_UTF_32BE;
-// return true;
-// }
-//
-// if(u_text[0] == 0xFF &&
-// u_text[1] == 0xFE &&
-// u_text[2] == 0x00 &&
-// u_text[3] == 0x00)
-// {
-// *encoding = MyHTML_ENCODING_UTF_32LE;
-// return true;
-// }
-// }
-
- return false;
-}
-
-bool myhtml_encoding_detect_and_cut_bom(const char *text, size_t length, myhtml_encoding_t *encoding, const char **new_text, size_t *new_size)
-{
- if(myhtml_encoding_detect_bom(text, length, encoding))
- {
- if(*encoding == MyHTML_ENCODING_UTF_8) {
- *new_text = &text[3];
- *new_size = length - 3;
- }
- else {
- *new_text = &text[2];
- *new_size = length - 2;
- }
-
- return true;
- }
-
- return false;
-}
-
-bool myhtml_encoding_detect_unicode(const char *text, size_t length, myhtml_encoding_t *encoding)
-{
- unsigned const char *u_text = (unsigned const char*)text;
- *encoding = MyHTML_ENCODING_DEFAULT;
-
- myhtml_encoding_unicode_result_t res = myhtml_encoding_detect_utf_16(u_text, length);
-
- if(res.count_bad == 0 && res.count_good >= 3) {
- *encoding = MyHTML_ENCODING_UTF_16LE;
- return true;
- }
- else if(res.count_bad >= 3 && res.count_good == 0) {
- *encoding = MyHTML_ENCODING_UTF_16BE;
- return true;
- }
-
- res = myhtml_encoding_detect_utf_8(u_text, length);
- if(myhtml_encoding_detect_unicode_has_end(&res, 10)) {
- *encoding = MyHTML_ENCODING_UTF_8;
- return true;
- }
-
- return false;
-}
-
-bool myhtml_encoding_detect_russian(const char *text, size_t length, myhtml_encoding_t *encoding)
-{
- unsigned const char *u_text = (unsigned const char*)text;
-
- size_t min_count = 50;
- size_t min_value = 100000;
- size_t max_value = 0;
-
- *encoding = MyHTML_ENCODING_DEFAULT;
-
- myhtml_encoding_trigram_result_t
- res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_windows_1251, 1000, min_value);
- if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) {
- *encoding = MyHTML_ENCODING_WINDOWS_1251;
- return true;
- }
-
- max_value = res.value;
- if(max_value) {
- *encoding = MyHTML_ENCODING_WINDOWS_1251;
- }
-
- res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_koi8_r, 1000, min_value);
- if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) {
- *encoding = MyHTML_ENCODING_KOI8_R;
- return true;
- }
-
- if(max_value < res.value) {
- *encoding = MyHTML_ENCODING_KOI8_R;
- max_value = res.value;
- }
-
- res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_iso_8859_5, 1000, min_value);
- if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) {
- *encoding = MyHTML_ENCODING_ISO_8859_5;
- return true;
- }
-
- if(max_value < res.value) {
- *encoding = MyHTML_ENCODING_ISO_8859_5;
- max_value = res.value;
- }
-
- res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_x_mac_cyrillic, 1000, min_value);
- if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) {
- *encoding = MyHTML_ENCODING_X_MAC_CYRILLIC;
- return true;
- }
-
- if(max_value < res.value) {
- *encoding = MyHTML_ENCODING_X_MAC_CYRILLIC;
- max_value = res.value;
- }
-
- res = myhtml_encoding_detect_by_trigram(u_text, length, myhtml_encoding_detect_trigrams_index_ibm866, 1000, min_value);
- if(myhtml_encoding_detect_russian_has_end(&res, min_count, min_value)) {
- *encoding = MyHTML_ENCODING_IBM866;
- return true;
- }
-
- if(max_value < res.value) {
- *encoding = MyHTML_ENCODING_IBM866;
- }
-
- return false;
-}
-
-bool myhtml_encoding_detect(const char *text, size_t length, myhtml_encoding_t *encoding)
-{
- *encoding = MyHTML_ENCODING_DEFAULT;
-
- if(myhtml_encoding_detect_unicode(text, length, encoding))
- return true;
-
- if(myhtml_encoding_detect_russian(text, length, encoding))
- return true;
-
- return false;
-}
-
-const myhtml_encoding_detect_name_entry_t * myhtml_encoding_name_entry_by_name(const char* name, size_t length)
-{
- size_t idx = ((myhtml_string_chars_lowercase_map[ (const unsigned char)name[0] ] *
- myhtml_string_chars_lowercase_map[ (const unsigned char)name[(length - 1)] ] *
- length)
- % MyHTML_ENCODING_DETECT_NAME_STATIC_SIZE) + 1;
-
- while (myhtml_encoding_detect_name_entry_static_list_index[idx].label)
- {
- if(myhtml_encoding_detect_name_entry_static_list_index[idx].label_length == length) {
- if(myhtml_strncasecmp(myhtml_encoding_detect_name_entry_static_list_index[idx].label, name, length) == 0)
- return &myhtml_encoding_detect_name_entry_static_list_index[idx];
-
- if(myhtml_encoding_detect_name_entry_static_list_index[idx].next)
- idx = myhtml_encoding_detect_name_entry_static_list_index[idx].next;
- else
- return NULL;
- }
- else if(myhtml_encoding_detect_name_entry_static_list_index[idx].label_length > length) {
- return NULL;
- }
- else {
- idx = myhtml_encoding_detect_name_entry_static_list_index[idx].next;
- }
- }
-
- return NULL;
-}
-
-bool myhtml_encoding_by_name(const char *name, size_t length, myhtml_encoding_t *encoding)
-{
- const myhtml_encoding_detect_name_entry_t *entry = myhtml_encoding_name_entry_by_name(name, length);
-
- if(entry) {
- if(encoding)
- *encoding = entry->encoding;
-
- return true;
- }
-
- return false;
-}
-
-const char * myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *length)
-{
- if(encoding >= MyHTML_ENCODING_LAST_ENTRY) {
- if(length) {
- *length = 0;
- }
-
- return NULL;
- }
-
- const myhtml_encoding_entry_name_index_t *entry = &myhtml_encoding_entry_name_index_static_list_index[encoding];
-
- if(length) {
- *length = entry->length;
- }
-
- return entry->name;
-}
-
-/*
- When an algorithm requires a user agent to prescan a byte stream to determine its encoding,
- given some defined end condition, then it must run the following steps.
- These steps either abort unsuccessfully or return a character encoding.
- If at any point during these steps (including during instances of the get an attribute algorithm invoked by this one)
- the user agent either runs out of bytes (meaning the position pointer created in the first step below goes beyond the end of the byte stream obtained so far)
- or reaches its end condition, then abort the prescan a byte stream to determine its encoding algorithm unsuccessfully.
-*/
-
-bool myhtml_encoding_algorithm_extracting_character_encoding_from_meta_element(const char *data, size_t data_size, myhtml_encoding_t *encoding)
-{
- *encoding = MyHTML_ENCODING_NOT_DETERMINED;
-
- /* 1 */
- size_t length = 0;
- size_t charset_length = strlen("charset");
-
- bool is_get_pos = false;
- const unsigned char *udata = (const unsigned char *)data;
-
- /* 2 */
- while((length + charset_length) < data_size) {
- if(myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[length]))
- {
- length += charset_length;
-
- /* 2 */
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20)
- {
- break;
- }
-
- length++;
- }
-
- /* 4 */
- if(udata[length] == 0x3D) { /* EQUALS SIGN (=) */
- is_get_pos = true;
-
- length++;
- break;
- }
- }
-
- length++;
- }
-
- if(is_get_pos == false || length >= data_size)
- return false;
-
- /* 5 */
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20)
- {
- break;
- }
-
- length++;
- }
-
- if(length >= data_size)
- return false;
-
- /* 6 */
- /* " */
- if(udata[length] == 0x22)
- {
- length++;
- size_t begin = length;
-
- while(length < data_size) {
- if(udata[length] == 0x22)
- return myhtml_encoding_by_name(&data[begin], (length - begin), encoding);
-
- length++;
- }
-
- return false;
- }
-
- /* ' */
- if(udata[length] == 0x27)
- {
- length++;
- size_t begin = length;
-
- while(length < data_size) {
- if(udata[length] == 0x27)
- return myhtml_encoding_by_name(&data[begin], (length - begin), encoding);
-
- length++;
- }
-
- return false;
- }
-
- /* other */
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20)
- {
- size_t begin = length;
-
- while(length < data_size) {
- /* SEMICOLON character (;) */
- if(udata[length] == 0x3B) {
- return myhtml_encoding_by_name(&data[begin], (length - begin), encoding);
- }
-
- length++;
- }
-
- return myhtml_encoding_by_name(&data[begin], (length - begin), encoding);
- }
-
- length++;
- }
-
- return false;
-}
-
-bool myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_spaces(const unsigned char *udata, size_t *data_length, size_t data_size, myhtml_encoding_detect_attr_t *attr)
-{
- size_t length = *data_length;
-
- /* set position */
- attr->key_length = length - attr->key_begin;
-
- /* 6 */
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F)
- {
- break;
- }
-
- length++;
- }
-
- if(length >= data_size) {
- *data_length = length;
- return false;
- }
-
- /* 7 */
- if(udata[length] != 0x3D) {
- *data_length = length;
- return false;
- }
-
- /* 8 */
- *data_length = (length + 1);
- return true;
-}
-
-size_t myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(const unsigned char *udata, size_t length, size_t data_size, myhtml_encoding_detect_attr_t *attr, bool *it_last)
-{
- /* 9 */
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20)
- {
- break;
- }
-
- length++;
- }
-
- if(length >= data_size) {
- *it_last = true;
- return length;
- }
-
- /* 10 */
- switch (udata[length]) {
- case 0x22: /* (ASCII ") */
- length++;
- attr->value_begin = length;
-
- while(length < data_size) {
- if(udata[length] == 0x22)
- {
- attr->value_length = length - attr->value_begin;
- return (length + 1);
- }
-
- length++;
- }
-
- break;
-
- case 0x27: /* (ASCII ') */
- length++;
- attr->value_begin = length;
-
- while(length < data_size) {
- if(udata[length] == 0x27)
- {
- attr->value_length = length - attr->value_begin;
- return (length + 1);
- }
-
- length++;
- }
-
- break;
-
- case 0x3E: /* (ASCII >) */
- *it_last = true;
- return (length + 1);
-
- default:
- attr->value_begin = length;
-
- while(length < data_size) {
- if(udata[length] == 0x09 || udata[length] == 0x0A || udata[length] == 0x0C ||
- udata[length] == 0x0D || udata[length] == 0x20 || udata[length] == 0x3E)
- {
- attr->value_length = length - attr->value_begin;
- return (length + 1);
- }
-
- length++;
- }
-
- break;
- }
-
- attr->value_length = length - attr->value_begin;
- return length;
-}
-
-size_t myhtml_encoding_prescan_stream_to_determine_encoding_get_attr(const unsigned char *udata, size_t length, size_t data_size, myhtml_encoding_detect_attr_t *attr, bool *it_last)
-{
- memset(attr, 0, sizeof(myhtml_encoding_detect_attr_t));
-
- /*
- If the byte at position is one of 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR),
- 0x20 (ASCII space), or 0x2F (ASCII /) then advance position to the next byte and redo this step.
- */
- /* 1 */
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F)
- {
- break;
- }
-
- length++;
- }
-
- if(length >= data_size) {
- *it_last = true;
- return length;
- }
-
- /* 2 */
- if(udata[length] == 0x3E) { /* (ASCII >) */
- *it_last = true;
- return (length + 1);
- }
-
- attr->key_begin = length;
-
- /* 3, 4 */
- while(length < data_size) {
- switch (udata[length]) {
- case 0x3D: /* (ASCII =) */
- if(attr->key_begin != (length - 1)) {
- attr->key_length = length - attr->key_begin;
-
- length++;
- return myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
- }
-
- /* fall through */
-
- case 0x09: /* (ASCII TAB) */
- case 0x0A: /* (ASCII LF) */
- case 0x0C: /* (ASCII FF) */
- case 0x0D: /* (ASCII CR) */
- case 0x20: /* (ASCII space) */
- length++;
-
- if(myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) {
- *it_last = true;
- return length;
- }
-
- return myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
-
- case 0x2F: /* (ASCII /) */
- case 0x3E: /* (ASCII >) */
- *it_last = true;
- attr->key_length = length - attr->key_begin;
-
- return (length + 1);
-
- default:
- break;
- }
-
- length++;
- }
-
- if(myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) {
- *it_last = true;
- return length;
- }
-
- return myhtml_encoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
-}
-
-bool myhtml_encoding_prescan_stream_to_determine_encoding_check_meta(const unsigned char *udata, size_t *length, size_t data_size, myhtml_encoding_t *encoding)
-{
- myhtml_encoding_detect_attr_t attr;
-
- bool got_pragma = false;
- bool it_last = false;
-
- unsigned int need_pragma = 0; /* 0 = NULL, 1 = false, 2 = true */
-
- /*
- http-equiv = 1
- content = 2
- charset = 4
- */
- /* If the attribute's name is already in attribute list, then return to the step labeled attributes. */
- size_t is_exists = 0;
-
- while(*length < data_size) {
- *length = myhtml_encoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last);
-
- /* 9 */
- if(attr.key_length == strlen("http-equiv") &&
- myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ]))
- {
- if((is_exists & 1) == 0) {
- is_exists |= 1;
-
- if(attr.value_length == strlen("content-type") &&
- myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ]))
- {
- got_pragma = true;
- }
- }
- }
- else if(attr.key_length == strlen("content") &&
- myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ]))
- {
- if((is_exists & 2) == 0) {
- is_exists |= 2;
-
- if(myhtml_encoding_algorithm_extracting_character_encoding_from_meta_element((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding)) {
- need_pragma = 2;
- }
- }
- }
- else if(attr.key_length == strlen("charset") &&
- myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[ attr.key_begin ]))
- {
- if((is_exists & 4) == 0) {
- is_exists |= 4;
-
- myhtml_encoding_by_name((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding);
- need_pragma = 1;
- }
- }
-
- if(it_last)
- break;
- }
-
- /* 11, 12, 13 */
- if(need_pragma == 0 || (need_pragma == 2 && got_pragma == false)) {
- *encoding = MyHTML_ENCODING_NOT_DETERMINED;
- return false;
- }
-
- /* 14 */
- if(*encoding == MyHTML_ENCODING_UTF_16BE || *encoding == MyHTML_ENCODING_UTF_16LE) {
- *encoding = MyHTML_ENCODING_UTF_8;
- }
-
- /* 15 */
- if(*encoding == MyHTML_ENCODING_X_USER_DEFINED) {
- *encoding = MyHTML_ENCODING_WINDOWS_1252;
- }
-
- /* 16 */
- return true;
-}
-
-size_t myhtml_encoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char *udata, size_t length, size_t data_size)
-{
- while(length < data_size) {
- if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
- udata[length] != 0x0D && udata[length] != 0x20)
- {
- break;
- }
-
- length++;
- }
-
- if(length >= data_size)
- return length;
-
- if(udata[length] == 0x3E) {
- return (length + 1);
- }
-
- myhtml_encoding_detect_attr_t attr;
- bool it_last = false;
-
- while(length < data_size) {
- length = myhtml_encoding_prescan_stream_to_determine_encoding_get_attr(udata, length, data_size, &attr, &it_last);
-
- if(it_last) {
- return length;
- }
- }
-
- return length;
-}
-
-size_t myhtml_encoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char *udata, size_t length, size_t data_size)
-{
- if(udata[length] == 0x2F) { /* / */
- length++;
-
- if(length >= data_size)
- return length;
-
- if(myhtml_tokenizer_chars_map[ udata[length] ] == MyHTML_TOKENIZER_CHAR_A_Z_a_z) {
- return myhtml_encoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
- }
-
- while(length < data_size) {
- if(udata[length] != 0x3E) {
- return (length + 1);
- }
-
- length++;
- }
-
- return length;
- }
- else if(udata[length] == 0x21) { /* ! */
- length++;
-
- if((length + 2) < data_size && udata[length] == 0x2D && udata[(length+1)] == 0x2D) {
- while(length < data_size) {
- if(udata[length] != 0x3E) {
- if(udata[(length - 1)] == 0x2D && udata[(length - 2)] == 0x2D)
- return (length + 1);
-
- length++;
- }
- }
-
- return length;
- }
-
- while(length < data_size) {
- if(udata[length] != 0x3E) {
- return (length + 1);
- }
-
- length++;
- }
-
- return length;
- }
- else if(udata[length] == 0x3F) { /* ? */
- length++;
-
- while(length < data_size) {
- if(udata[length] != 0x3E) {
- return (length + 1);
- }
-
- length++;
- }
-
- return length;
- }
-
-
- return myhtml_encoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
-}
-
-myhtml_encoding_t myhtml_encoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size)
-{
- const unsigned char* udata = (const unsigned char*)data;
- myhtml_encoding_t encoding = MyHTML_ENCODING_NOT_DETERMINED;
-
- size_t i = 0;
- while(i < data_size) {
- /* 0x3C = '<' */
- if(data[i] == 0x3C)
- {
- if((i + 5) >= data_size)
- return encoding;
-
- i++;
-
- switch (data[i]) {
- /*
- A sequence of bytes starting with:
- 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61,
- and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F
- (case-insensitive ASCII '<meta' followed by a space or slash)
- */
- case 0x4D:
- case 0x6D:
- if(myhtml_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"meta", &udata[i])) {
- i += 4;
-
- if(udata[i] == 0x09 || udata[i] == 0x0A || udata[i] == 0x0C ||
- udata[i] == 0x0D || udata[i] == 0x20 || udata[i] == 0x2F)
- {
- i++;
-
- if(myhtml_encoding_prescan_stream_to_determine_encoding_check_meta(udata, &i, data_size, &encoding))
- return encoding;
- }
- }
-
- break;
-
- default:
- i = myhtml_encoding_prescan_stream_to_determine_encoding_skip_other(udata, i, data_size);
- break;
- }
- }
- else {
- i++;
- }
- }
-
- return encoding;
-}
-
-