diff options
Diffstat (limited to 'node_modules/iconv-lite/encodings/utf16.js')
-rw-r--r-- | node_modules/iconv-lite/encodings/utf16.js | 96 |
1 files changed, 58 insertions, 38 deletions
diff --git a/node_modules/iconv-lite/encodings/utf16.js b/node_modules/iconv-lite/encodings/utf16.js index 54765aeee..97d066925 100644 --- a/node_modules/iconv-lite/encodings/utf16.js +++ b/node_modules/iconv-lite/encodings/utf16.js @@ -61,6 +61,7 @@ Utf16BEDecoder.prototype.write = function(buf) { } Utf16BEDecoder.prototype.end = function() { + this.overflowByte = -1; } @@ -103,8 +104,8 @@ Utf16Encoder.prototype.end = function() { function Utf16Decoder(options, codec) { this.decoder = null; - this.initialBytes = []; - this.initialBytesLen = 0; + this.initialBufs = []; + this.initialBufsLen = 0; this.options = options || {}; this.iconv = codec.iconv; @@ -113,17 +114,22 @@ function Utf16Decoder(options, codec) { Utf16Decoder.prototype.write = function(buf) { if (!this.decoder) { // Codec is not chosen yet. Accumulate initial bytes. - this.initialBytes.push(buf); - this.initialBytesLen += buf.length; + this.initialBufs.push(buf); + this.initialBufsLen += buf.length; - if (this.initialBytesLen < 16) // We need more bytes to use space heuristic (see below) + if (this.initialBufsLen < 16) // We need more bytes to use space heuristic (see below) return ''; // We have enough bytes -> detect endianness. - var buf = Buffer.concat(this.initialBytes), - encoding = detectEncoding(buf, this.options.defaultEncoding); + var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding); this.decoder = this.iconv.getDecoder(encoding, this.options); - this.initialBytes.length = this.initialBytesLen = 0; + + var resStr = ''; + for (var i = 0; i < this.initialBufs.length; i++) + resStr += this.decoder.write(this.initialBufs[i]); + + this.initialBufs.length = this.initialBufsLen = 0; + return resStr; } return this.decoder.write(buf); @@ -131,47 +137,61 @@ Utf16Decoder.prototype.write = function(buf) { Utf16Decoder.prototype.end = function() { if (!this.decoder) { - var buf = Buffer.concat(this.initialBytes), - encoding = detectEncoding(buf, this.options.defaultEncoding); + var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding); this.decoder = this.iconv.getDecoder(encoding, this.options); - var res = this.decoder.write(buf), - trail = this.decoder.end(); + var resStr = ''; + for (var i = 0; i < this.initialBufs.length; i++) + resStr += this.decoder.write(this.initialBufs[i]); - return trail ? (res + trail) : res; + var trail = this.decoder.end(); + if (trail) + resStr += trail; + + this.initialBufs.length = this.initialBufsLen = 0; + return resStr; } return this.decoder.end(); } -function detectEncoding(buf, defaultEncoding) { - var enc = defaultEncoding || 'utf-16le'; - - if (buf.length >= 2) { - // Check BOM. - if (buf[0] == 0xFE && buf[1] == 0xFF) // UTF-16BE BOM - enc = 'utf-16be'; - else if (buf[0] == 0xFF && buf[1] == 0xFE) // UTF-16LE BOM - enc = 'utf-16le'; - else { - // No BOM found. Try to deduce encoding from initial content. - // Most of the time, the content has ASCII chars (U+00**), but the opposite (U+**00) is uncommon. - // So, we count ASCII as if it was LE or BE, and decide from that. - var asciiCharsLE = 0, asciiCharsBE = 0, // Counts of chars in both positions - _len = Math.min(buf.length - (buf.length % 2), 64); // Len is always even. - - for (var i = 0; i < _len; i += 2) { - if (buf[i] === 0 && buf[i+1] !== 0) asciiCharsBE++; - if (buf[i] !== 0 && buf[i+1] === 0) asciiCharsLE++; +function detectEncoding(bufs, defaultEncoding) { + var b = []; + var charsProcessed = 0; + var asciiCharsLE = 0, asciiCharsBE = 0; // Number of ASCII chars when decoded as LE or BE. + + outer_loop: + for (var i = 0; i < bufs.length; i++) { + var buf = bufs[i]; + for (var j = 0; j < buf.length; j++) { + b.push(buf[j]); + if (b.length === 2) { + if (charsProcessed === 0) { + // Check BOM first. + if (b[0] === 0xFF && b[1] === 0xFE) return 'utf-16le'; + if (b[0] === 0xFE && b[1] === 0xFF) return 'utf-16be'; + } + + if (b[0] === 0 && b[1] !== 0) asciiCharsBE++; + if (b[0] !== 0 && b[1] === 0) asciiCharsLE++; + + b.length = 0; + charsProcessed++; + + if (charsProcessed >= 100) { + break outer_loop; + } } - - if (asciiCharsBE > asciiCharsLE) - enc = 'utf-16be'; - else if (asciiCharsBE < asciiCharsLE) - enc = 'utf-16le'; } } - return enc; + // Make decisions. + // Most of the time, the content has ASCII chars (U+00**), but the opposite (U+**00) is uncommon. + // So, we count ASCII as if it was LE or BE, and decide from that. + if (asciiCharsBE > asciiCharsLE) return 'utf-16be'; + if (asciiCharsBE < asciiCharsLE) return 'utf-16le'; + + // Couldn't decide (likely all zeros or not enough data). + return defaultEncoding || 'utf-16le'; } |