Welcome to mirror list, hosted at ThFree Co, Russian Federation.

encoding.h « myencoding « source - github.com/lexborisov/Modest.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: fa43018ac1ddd7aeb0e045c449f6d67832186b39 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
/*
 Copyright (C) 2015-2017 Alexander Borisov
 
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
 Author: lex.borisov@gmail.com (Alexander Borisov)
*/

#ifndef MyENCODING_ENCODING_H
#define MyENCODING_ENCODING_H
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include <string.h>

#include "myencoding/myosi.h"
#include "mycore/utils.h"
#include "mycore/mystring.h"

/*
 * Decoder state/result record made of six unsigned long slots.
 *
 * A pointer to this record (spelled myencoding_result_t, presumably a
 * typedef of this struct coming from myencoding/myosi.h -- TODO confirm)
 * is the second argument of every myencoding_decode_* function declared
 * in this header and the only argument of myencoding_result_clean().
 *
 * NOTE(review): the role of the individual fields is not visible from this
 * header; the per-field notes below are assumptions to verify against the
 * decoder implementations.
 */
struct myencoding_result {
    unsigned long first;      /* presumably saved state for multi-byte sequences -- confirm */
    unsigned long second;     /* presumably saved state for multi-byte sequences -- confirm */
    unsigned long third;      /* presumably saved state for multi-byte sequences -- confirm */
    unsigned long result;     /* presumably the decoded code point -- confirm */
    unsigned long result_aux; /* presumably an auxiliary/second decoded value -- confirm */
    unsigned long flag;       /* presumably decoder-specific state flag(s) -- confirm */
};

/*
 * A fixed three-byte sequence paired with a numeric value.
 *
 * The byte array is const, so instances can only be set up through an
 * initializer (e.g. a static table).
 * NOTE(review): presumably an entry of a trigram frequency table used by
 * the encoding detectors, with `value` as its weight -- confirm.
 */
struct myencoding_trigram {
    const unsigned char trigram[3]; /* the three bytes of the trigram */
    size_t value;                   /* value associated with the trigram */
};

/*
 * Pair of accumulators: a count and a value.
 *
 * NOTE(review): presumably the outcome of matching text against a
 * myencoding_trigram table (number of hits and their summed values) --
 * confirm against the detector implementation.
 */
struct myencoding_trigram_result {
    size_t count;
    size_t value;
};

/*
 * Three counters named for ASCII, "good" and "bad" occurrences.
 *
 * NOTE(review): presumably tallies collected while probing a buffer for a
 * Unicode encoding (see myencoding_detect_unicode); what exactly counts as
 * good/bad is not visible here -- confirm.
 */
struct myencoding_unicode_result {
    size_t count_ascii;
    size_t count_good;
    size_t count_bad;
};

/*
 * Record tying an encoding id to two strings (name and label), each stored
 * as a pointer plus an explicit length, together with two index fields.
 *
 * myencoding_name_entry_by_name() returns a const pointer to
 * myencoding_detect_name_entry_t (presumably a typedef of this struct --
 * TODO confirm).
 * NOTE(review): `next`/`curr` look like slot indices of a static lookup
 * table with collision chaining -- confirm against the table definition.
 */
struct myencoding_detect_name_entry {
    const char* name;      /* name string */
    size_t name_length;    /* length of `name` */
    const char* label;     /* label string */
    size_t label_length;   /* length of `label` */
    
    myencoding_t encoding; /* encoding id this entry refers to */
    
    size_t next;           /* presumably index of the next entry in a chain -- confirm */
    size_t curr;           /* presumably index of this entry -- confirm */
};

/*
 * Key/value span descriptor with a link to a further descriptor.
 *
 * Key and value are each described by a begin position and a length rather
 * than by pointers.
 * NOTE(review): presumably the positions are offsets into the buffer being
 * prescanned and describe one attribute of a tag -- confirm.
 */
struct myencoding_detect_attr {
    size_t key_begin;    /* begin position of the key */
    size_t key_length;   /* length of the key */
    size_t value_begin;  /* begin position of the value */
    size_t value_length; /* length of the value */
    
    myencoding_detect_attr_t *next; /* link to another attribute descriptor */
};
    
/*
 * A string stored as pointer plus explicit length.
 *
 * NOTE(review): presumably one slot of a table indexed by encoding id and
 * consulted by myencoding_name_by_id() -- confirm.
 */
struct myencoding_entry_name_index {
    const char *name; /* name string */
    size_t length;    /* length of `name` */
};

/*
 * Pointer-to-decoder type: a function taking one input byte by value and a
 * pointer to a result record, and returning a myencoding_status_t.
 * Every myencoding_decode_* prototype in this header has this signature.
 */
typedef myencoding_status_t (*myencoding_custom_f)(unsigned const char data, myencoding_result_t *res);

/*
 * Returns a decoder function pointer for the encoding id `idx`.
 * NOTE(review): behaviour for an id without a decoder (NULL? out-of-range
 * access?) is not visible from this header -- confirm before relying on it.
 */
myencoding_custom_f myencoding_get_function_by_id(myencoding_t idx);

/*
 * Per-encoding decoders, one function per supported encoding.
 *
 * All of them share the myencoding_custom_f signature: `data` is a single
 * input byte passed by value, `res` points to the result record the decoder
 * works on, and the return value is a myencoding_status_t.
 *
 * NOTE(review): presumably these are fed one byte at a time, keep partial
 * multi-byte state in `res` between calls and use the returned status to
 * signal done / need-more-input / error; the exact status values and which
 * fields of `res` carry the output are not visible here -- confirm.
 */
myencoding_status_t myencoding_decode_utf_8(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_ibm866(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_2(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_3(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_4(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_5(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_6(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_7(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_8(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_8_i(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_10(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_13(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_14(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_15(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_8859_16(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_koi8_r(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_koi8_u(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_macintosh(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_874(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1250(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1251(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1252(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1253(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1254(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1255(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1256(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1257(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_windows_1258(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_x_mac_cyrillic(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_gbk(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_gb18030(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_big5(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_euc_jp(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_iso_2022_jp(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_shift_jis(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_euc_kr(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_shared_utf_16(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_utf_16be(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_utf_16le(unsigned const char data, myencoding_result_t *res);
myencoding_status_t myencoding_decode_x_user_defined(unsigned const char data, myencoding_result_t *res);

/*
 * Length queries; both return a size_t.
 *
 * myencoding_codepoint_ascii_length() takes a code point,
 * myencoding_ascii_utf_8_length() takes a single byte.
 * NOTE(review): presumably the first yields the number of bytes the code
 * point occupies in UTF-8 and the second the sequence length announced by a
 * UTF-8 lead byte; the result for invalid input is not visible here --
 * confirm.
 */
size_t myencoding_codepoint_ascii_length(size_t codepoint);
size_t myencoding_ascii_utf_8_length(const unsigned char data);

/*
 * Conversions between a code point and its byte representation.
 *
 * The three myencoding_codepoint_to_* functions take a code point and a
 * writable `data` buffer; no buffer size is passed, so they cannot check the
 * capacity themselves and the caller has to provide a large enough buffer.
 * myencoding_ascii_utf_8_to_codepoint() goes the other way: it reads from
 * `data` and stores the code point through `codepoint`.
 *
 * NOTE(review): the size_t return values are presumably the number of bytes
 * written (respectively consumed), with 0 on invalid input, and the
 * "lowercase" variant presumably lowercases ASCII letters only -- confirm.
 */
size_t myencoding_codepoint_to_ascii_utf_8(size_t codepoint, char *data);
size_t myencoding_codepoint_to_lowercase_ascii_utf_8(size_t codepoint, char *data);
size_t myencoding_codepoint_to_ascii_utf_16(size_t codepoint, char *data);
size_t myencoding_ascii_utf_8_to_codepoint(const unsigned char* data, size_t* codepoint);

/*
 * Takes a result record by pointer and returns nothing.
 * NOTE(review): presumably resets every field of `res` to zero so the record
 * can be reused for a fresh decode -- confirm in the implementation.
 */
void myencoding_result_clean(myencoding_result_t *res);

/*
 * Encoding detection over a buffer.
 *
 * Each function receives the buffer as `text` plus `length`, has an
 * `encoding` out-parameter and returns a bool.
 * myencoding_detect_and_cut_bom() additionally has the out-parameters
 * `new_text` and `new_size`.
 *
 * NOTE(review): presumably the return value is true when an encoding was
 * recognised; what `*encoding` holds on a false return is not visible here.
 * `new_text`/`new_size` presumably describe the input with the BOM skipped.
 * Confirm against the implementation.
 */
bool myencoding_detect(const char *text, size_t length, myencoding_t *encoding);
bool myencoding_detect_russian(const char *text, size_t length, myencoding_t *encoding);
bool myencoding_detect_unicode(const char *text, size_t length, myencoding_t *encoding);
bool myencoding_detect_bom(const char *text, size_t length, myencoding_t *encoding);
bool myencoding_detect_and_cut_bom(const char *text, size_t length, myencoding_t *encoding, const char **new_text, size_t *new_size);

/*
 * Takes an input buffer (`buff`, `length`), the encoding id of that input
 * and a mycore_string_raw_t destination, and returns a size_t.
 * NOTE(review): presumably decodes `buff` from `encoding` and stores the
 * UTF-8 result in `raw_str`; whether `raw_str` is appended to or replaced,
 * who allocates its storage, and the meaning of the return value are not
 * visible here -- confirm.
 */
size_t myencoding_convert_to_ascii_utf_8(mycore_string_raw_t* raw_str, const char* buff, size_t length, myencoding_t encoding);

/*
 * Lookups between encoding names and encoding ids.
 *
 * myencoding_name_entry_by_name(): takes a name with explicit length and
 *   returns a pointer to a const name entry.
 * myencoding_by_name(): takes a name with explicit length, has an `encoding`
 *   out-parameter and returns a bool.
 * myencoding_name_by_id(): takes an encoding id, returns a const string and
 *   has a `length` out-parameter.
 *
 * NOTE(review): presumably a failed lookup yields NULL / false, and the
 * name matching may be case-insensitive; whether `length` may be NULL is
 * not visible here -- confirm.
 */
const myencoding_detect_name_entry_t * myencoding_name_entry_by_name(const char* name, size_t length);
bool myencoding_by_name(const char *name, size_t length, myencoding_t *encoding);
const char * myencoding_name_by_id(myencoding_t encoding, size_t *length);

/*
 * Encoding discovery from markup/metadata rather than from byte statistics.
 *
 * Both functions receive a buffer as `data` plus `data_size`. The first has
 * an `encoding` out-parameter and returns a bool; the second returns the
 * encoding id directly.
 *
 * NOTE(review): the names match the HTML Standard algorithms "extracting a
 * character encoding from a meta element" and "prescan a byte stream to
 * determine its encoding"; presumably these implement them. The value
 * returned by the prescan when nothing is found is not visible here --
 * confirm.
 */
bool myencoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myencoding_t *encoding);
myencoding_t myencoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size);

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* MyENCODING_ENCODING_H */