blob: ff3eddd8413e9308f95f3f445d2ac04aeffcf34d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import * as jschardet from 'jschardet';
function detectEncodingByBOM(buffer: Buffer): string | null {
if (!buffer || buffer.length < 2) {
return null;
}
const b0 = buffer.readUInt8(0);
const b1 = buffer.readUInt8(1);
// UTF-16 BE
if (b0 === 0xFE && b1 === 0xFF) {
return 'utf16be';
}
// UTF-16 LE
if (b0 === 0xFF && b1 === 0xFE) {
return 'utf16le';
}
if (buffer.length < 3) {
return null;
}
const b2 = buffer.readUInt8(2);
// UTF-8
if (b0 === 0xEF && b1 === 0xBB && b2 === 0xBF) {
return 'utf8';
}
return null;
}
const IGNORE_ENCODINGS = [
'ascii',
'utf-8',
'utf-16',
'utf-32'
];
const JSCHARDET_TO_ICONV_ENCODINGS: { [name: string]: string } = {
'ibm866': 'cp866',
'big5': 'cp950'
};
export function detectEncoding(buffer: Buffer): string | null {
let result = detectEncodingByBOM(buffer);
if (result) {
return result;
}
const detected = jschardet.detect(buffer);
if (!detected || !detected.encoding) {
return null;
}
const encoding = detected.encoding;
// Ignore encodings that cannot guess correctly
// (http://chardet.readthedocs.io/en/latest/supported-encodings.html)
if (0 <= IGNORE_ENCODINGS.indexOf(encoding.toLowerCase())) {
return null;
}
const normalizedEncodingName = encoding.replace(/[^a-zA-Z0-9]/g, '').toLowerCase();
const mapped = JSCHARDET_TO_ICONV_ENCODINGS[normalizedEncodingName];
return mapped || normalizedEncodingName;
}
|