diff options
Diffstat (limited to 'mcs/class/corlib/System.Text/UTF8Encoding.cs')
-rw-r--r-- | mcs/class/corlib/System.Text/UTF8Encoding.cs | 437 |
1 files changed, 164 insertions, 273 deletions
diff --git a/mcs/class/corlib/System.Text/UTF8Encoding.cs b/mcs/class/corlib/System.Text/UTF8Encoding.cs index ecc3998979a..9faa7cac64f 100644 --- a/mcs/class/corlib/System.Text/UTF8Encoding.cs +++ b/mcs/class/corlib/System.Text/UTF8Encoding.cs @@ -70,11 +70,9 @@ public class UTF8Encoding : Encoding windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE; } - #region GetByteCount() - // Internal version of "GetByteCount" which can handle a rolling // state between multiple calls to this method. - private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush) + private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush) { // Validate the parameters. if (chars == null) { @@ -87,66 +85,27 @@ public class UTF8Encoding : Encoding throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array")); } - if (index == chars.Length) { - if (flush && leftOver != '\0') { - // Flush the left-over surrogate pair start. - leftOver = '\0'; - return 3; - } - return 0; - } - - unsafe { - fixed (char* cptr = chars) { - return InternalGetByteCount (cptr + index, count, ref leftOver, flush); - } - } - } - - - private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush) - { - int index = 0; - // Determine the lengths of all characters. char ch; int length = 0; - char pair = leftOver; + uint pair = leftOver; while (count > 0) { ch = chars[index]; if (pair == 0) { if (ch < '\u0080') { - // fast path optimization - int end = index + count; - for (; index < end; index++, count--) { - if (chars [index] < '\x80') - ++length; - else - break; - } - continue; - //length++; + ++length; } else if (ch < '\u0800') { length += 2; } else if (ch >= '\uD800' && ch <= '\uDBFF') { // This is the start of a surrogate pair. - pair = ch; + pair = (uint)ch; } else { length += 3; } } else if (ch >= '\uDC00' && ch <= '\uDFFF') { - if (pair != 0) { - // We have a surrogate pair. - length += 4; - pair = '\0'; - } else { - // We have a surrogate tail without - // leading surrogate. In NET_2_0 it - // uses fallback. In NET_1_1 we output - // wrong surrogate. - length += 3; - pair = '\0'; - } + // We have a surrogate pair. + length += 4; + pair = 0; } else { // We have a surrogate start followed by a // regular character. Technically, this is @@ -154,20 +113,16 @@ public class UTF8Encoding : Encoding // We write out the surrogate start and then // re-visit the current character again. length += 3; - pair = '\0'; + pair = 0; continue; } ++index; --count; } - if (flush) { - if (pair != '\0') - // Flush the left-over surrogate pair start. - length += 3; - leftOver = '\0'; + if (flush && pair != 0) { + // Flush the left-over surrogate pair start. + length += 3; } - else - leftOver = pair; // Return the final length to the caller. return length; @@ -176,8 +131,7 @@ public class UTF8Encoding : Encoding // Get the number of bytes needed to encode a character buffer. public override int GetByteCount (char[] chars, int index, int count) { - char dummy = '\0'; - return InternalGetByteCount (chars, index, count, ref dummy, true); + return InternalGetByteCount (chars, index, count, 0, true); } // Convenience wrappers for "GetByteCount". @@ -188,23 +142,43 @@ public class UTF8Encoding : Encoding throw new ArgumentNullException ("s"); } - unsafe { - fixed (char* cptr = s) { - char dummy = '\0'; - return InternalGetByteCount (cptr, s.Length, ref dummy, true); + // Determine the lengths of all characters. + char ch; + int index = 0; + int count = s.Length; + int length = 0; + uint pair; + while (count > 0) { + ch = s[index++]; + if (ch < '\u0080') { + ++length; + } else if (ch < '\u0800') { + length += 2; + } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) { + // This may be the start of a surrogate pair. + pair = (uint)(s[index]); + if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) { + length += 4; + ++index; + --count; + } else { + length += 3; + } + } else { + length += 3; } + --count; } - } - - #endregion - #region GetBytes() + // Return the final length to the caller. + return length; + } // Internal version of "GetBytes" which can handle a rolling // state between multiple calls to this method. private static int InternalGetBytes (char[] chars, int charIndex, int charCount, byte[] bytes, - int byteIndex, ref char leftOver, + int byteIndex, ref uint leftOver, bool flush) { // Validate the parameters. @@ -224,175 +198,93 @@ public class UTF8Encoding : Encoding throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array")); } - if (charIndex == chars.Length) { - if (flush && leftOver != '\0') { -#if NET_2_0 - // FIXME: use EncoderFallback. - // - // By default it is empty, so I do nothing for now. - leftOver = '\0'; -#else - // Flush the left-over surrogate pair start. - if (byteIndex >= bytes.Length - 3) - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [byteIndex++] = 0xEF; - bytes [byteIndex++] = 0xBB; - bytes [byteIndex++] = 0xBF; - leftOver = '\0'; - return 3; -#endif - } - return 0; - } - - unsafe { - fixed (char* cptr = chars) { - if (bytes.Length == byteIndex) - return InternalGetBytes ( - cptr + charIndex, charCount, - null, 0, ref leftOver, flush); - fixed (byte *bptr = bytes) { - return InternalGetBytes ( - cptr + charIndex, charCount, - bptr + byteIndex, bytes.Length - byteIndex, - ref leftOver, flush); - } - } - } - } - - private unsafe static int InternalGetBytes (char* chars, int charCount, - byte* bytes, int byteCount, - ref char leftOver, bool flush) - { - int charIndex = 0; - int byteIndex = 0; - - // Convert the characters into bytes. // Convert the characters into bytes. char ch; - int length = byteCount; - char pair = leftOver; + int length = bytes.Length; + uint pair; + uint left = leftOver; int posn = byteIndex; - int code = 0; - while (charCount > 0) { // Fetch the next UTF-16 character pair value. - ch = chars [charIndex]; - if (pair == '\0') { - if (ch < '\uD800' || ch >= '\uE000') { - if (ch < '\x80') { // fast path optimization - int end = charIndex + charCount; - for (; charIndex < end; posn++, charIndex++, charCount--) { - if (chars [charIndex] < '\x80') - bytes [posn] = (byte) chars [charIndex]; - else - break; - } - continue; - } - code = ch; - } - else if (ch < '\uDC00') { - // surrogate start - pair = ch; - ++charIndex; - --charCount; - continue; - } else { // ch <= '\uDFFF' - // We have a surrogate tail without leading - // surrogate. In NET_2_0 it uses fallback. - // In NET_1_1 we output wrong surrogate. - if (posn > length - 3) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - } - bytes [posn++] = (byte) (0xE0 | (ch >> 12)); - bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (ch & 0x3F)); - ++charIndex; - --charCount; + ch = chars[charIndex++]; + --charCount; + if (left == 0) { + if (ch >= '\uD800' && ch <= '\uDBFF') { + // This is the start of a surrogate pair. + left = (uint)ch; continue; + } else { + // This is a regular character. + pair = (uint)ch; } + } else if (ch >= '\uDC00' && ch <= '\uDFFF') { + // We have a surrogate pair. + pair = ((left - (uint)0xD800) << 10) + + (((uint)ch) - (uint)0xDC00) + + (uint)0x10000; + left = 0; } else { - if ('\uDC00' <= ch && ch <= '\uDFFF') - code = 0x10000 + (int) ch - 0xDC00 + - (((int) pair - 0xD800) << 10); - else { - // We have a surrogate start followed by a - // regular character. Technically, this is - // invalid, but we have to do something. - // We write out the surrogate start and then - // re-visit the current character again. - if (posn > length - 3) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - } - bytes [posn++] = (byte) (0xE0 | (pair >> 12)); - bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (pair & 0x3F)); - pair = '\0'; - continue; - } - pair = '\0'; + // We have a surrogate start followed by a + // regular character. Technically, this is + // invalid, but we have to do something. + // We write out the surrogate start and then + // re-visit the current character again. + pair = (uint)left; + left = 0; + --charIndex; + ++charCount; } - ++charIndex; - --charCount; // Encode the character pair value. - if (code < 0x0080) { - if (posn >= length) + if (pair < (uint)0x0080) { + if (posn >= length) { throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte)code; - } else if (code < 0x0800) { - if ((posn + 2) > length) + } + bytes[posn++] = (byte)pair; + } else if (pair < (uint)0x0800) { + if ((posn + 2) > length) { throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte) (0xC0 | (code >> 6)); - bytes [posn++] = (byte) (0x80 | (code & 0x3F)); - } else if (code < 0x10000) { - if (posn > length - 3) + } + bytes[posn++] = (byte)(0xC0 | (pair >> 6)); + bytes[posn++] = (byte)(0x80 | (pair & 0x3F)); + } else if (pair < (uint)0x10000) { + if ((posn + 3) > length) { throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte) (0xE0 | (code >> 12)); - bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (code & 0x3F)); + } + bytes[posn++] = (byte)(0xE0 | (pair >> 12)); + bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F)); + bytes[posn++] = (byte)(0x80 | (pair & 0x3F)); } else { - if (posn > length - 4) + if ((posn + 4) > length) { throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte) (0xF0 | (code >> 18)); - bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F)); - bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (code & 0x3F)); + } + bytes[posn++] = (byte)(0xF0 | (pair >> 18)); + bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F)); + bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F)); + bytes[posn++] = (byte)(0x80 | (pair & 0x3F)); } } - - if (flush) { - if (pair != '\0') { - // Flush the left-over incomplete surrogate. - if (posn > length - 3) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - } - bytes [posn++] = (byte) (0xE0 | (pair >> 12)); - bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (pair & 0x3F)); + if (flush && left != 0) { + // Flush the left-over surrogate pair start. + if ((posn + 3) > length) { + throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); } - leftOver = '\0'; + bytes[posn++] = (byte)(0xE0 | (left >> 12)); + bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F)); + bytes[posn++] = (byte)(0x80 | (left & 0x3F)); + left = 0; } - else - leftOver = pair; -Char.IsLetterOrDigit (pair); + leftOver = left; // Return the final count to the caller. return posn - byteIndex; } - private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail) - { - throw new NotImplementedException (); - } - // Get the bytes that result from encoding a character buffer. public override int GetBytes (char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex) { - char leftOver = '\0'; + uint leftOver = 0; return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true); } @@ -417,31 +309,70 @@ Char.IsLetterOrDigit (pair); throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array")); } - if (charIndex == s.Length) - return 0; + // Convert the characters into bytes. + char ch; + int length = bytes.Length; + uint pair; + int posn = byteIndex; + while (charCount > 0) { + // Fetch the next UTF-16 character pair value. + ch = s[charIndex++]; + if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) { + // This may be the start of a surrogate pair. + pair = (uint)(s[charIndex]); + if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) { + pair = (pair - (uint)0xDC00) + + ((((uint)ch) - (uint)0xD800) << 10) + + (uint)0x10000; + ++charIndex; + --charCount; + } else { + pair = (uint)ch; + } + } else { + pair = (uint)ch; + } + --charCount; - unsafe { - fixed (char* cptr = s) { - char dummy = '\0'; - if (bytes.Length == byteIndex) - return InternalGetBytes ( - cptr + charIndex, charCount, - null, 0, ref dummy, true); - fixed (byte *bptr = bytes) { - return InternalGetBytes ( - cptr + charIndex, charCount, - bptr + byteIndex, bytes.Length - byteIndex, - ref dummy, true); + // Encode the character pair value. + if (pair < (uint)0x0080) { + if (posn >= length) { + throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); } + bytes[posn++] = (byte)pair; + } else if (pair < (uint)0x0800) { + if ((posn + 2) > length) { + throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); + } + bytes[posn++] = (byte)(0xC0 | (pair >> 6)); + bytes[posn++] = (byte)(0x80 | (pair & 0x3F)); + } else if (pair < (uint)0x10000) { + if ((posn + 3) > length) { + throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); + } + bytes[posn++] = (byte)(0xE0 | (pair >> 12)); + bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F)); + bytes[posn++] = (byte)(0x80 | (pair & 0x3F)); + } else { + if ((posn + 4) > length) { + throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); + } + bytes[posn++] = (byte)(0xF0 | (pair >> 18)); + bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F)); + bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F)); + bytes[posn++] = (byte)(0x80 | (pair & 0x3F)); } } - } - #endregion + // Return the final count to the caller. + return posn - byteIndex; + } // Internal version of "GetCharCount" which can handle a rolling // state between multiple calls to this method. #if NET_2_0 + // Internal version of "GetCharCount" which can handle a rolling + // state between multiple calls to this method. private static int InternalGetCharCount ( byte[] bytes, int index, int count, uint leftOverBits, uint leftOverCount, object provider, @@ -463,20 +394,9 @@ Char.IsLetterOrDigit (pair); throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array")); } - int length = 0; - - if (leftOverCount == 0) { - int end = index + count; - for (; index < end; index++, count--) { - if (bytes [index] < 0x80) - length++; - else - break; - } - } - // Determine the number of characters that we have. uint ch; + int length = 0; uint leftBits = leftOverBits; uint leftSoFar = (leftOverCount & (uint)0x0F); uint leftSize = ((leftOverCount >> 4) & (uint)0x0F); @@ -588,7 +508,7 @@ Char.IsLetterOrDigit (pair); // We had left-over bytes that didn't make up // a complete UTF-8 character sequence. #if NET_2_0 - length += Fallback (provider, ref fallbackBuffer, bytes, index); + length += Fallback (provider, ref fallbackBuffer, bytes, index - 1); #else if (throwOnInvalid) throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); @@ -610,7 +530,7 @@ Char.IsLetterOrDigit (pair); else buffer = ((Decoder) provider).FallbackBuffer; } - buffer.Fallback (bytes, index); + buffer.Fallback (bytes, index - 1); return buffer.Remaining; } @@ -625,7 +545,7 @@ Char.IsLetterOrDigit (pair); else buffer = ((Decoder) provider).FallbackBuffer; } - buffer.Fallback (bytes, byteIndex); + buffer.Fallback (bytes, byteIndex - 1); while (buffer.Remaining > 0) chars [charIndex++] = buffer.GetNextChar (); } @@ -676,21 +596,10 @@ Char.IsLetterOrDigit (pair); if (charIndex == chars.Length) return 0; - int posn = charIndex; - - if (leftOverCount == 0) { - int end = byteIndex + byteCount; - for (; byteIndex < end; posn++, byteIndex++, byteCount--) { - if (bytes [byteIndex] < 0x80) - chars [posn] = (char) bytes [byteIndex]; - else - break; - } - } - // Convert the bytes into the output buffer. uint ch; int length = chars.Length; + int posn = charIndex; uint leftBits = leftOverBits; uint leftSoFar = (leftOverCount & (uint)0x0F); uint leftSize = ((leftOverCount >> 4) & (uint)0x0F); @@ -1005,48 +914,30 @@ Char.IsLetterOrDigit (pair); private class UTF8Encoder : Encoder { private bool emitIdentifier; - private char leftOverForCount; - private char leftOverForConv; + private uint leftOver; // Constructor. public UTF8Encoder (bool emitIdentifier) { this.emitIdentifier = emitIdentifier; - leftOverForCount = '\0'; - leftOverForConv = '\0'; + leftOver = 0; } // Override inherited methods. public override int GetByteCount (char[] chars, int index, int count, bool flush) { - return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush); + return InternalGetByteCount (chars, index, count, leftOver, flush); } public override int GetBytes (char[] chars, int charIndex, - int charCount, byte[] bytes, int byteIndex, bool flush) + int charCount, byte[] bytes, int byteCount, bool flush) { int result; - result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush); + result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush); emitIdentifier = false; return result; } -#if NET_2_0 - public unsafe override int GetByteCount (char* chars, int count, bool flush) - { - return InternalGetByteCount (chars, count, ref leftOverForCount, flush); - } - - public unsafe override int GetBytes (char* chars, int charCount, - byte* bytes, int byteCount, bool flush) - { - int result; - result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush); - emitIdentifier = false; - return result; - } -#endif - } // class UTF8Encoder }; // class UTF8Encoding |