1 files changed, 164 insertions, 273 deletions
diff --git a/mcs/class/corlib/System.Text/UTF8Encoding.cs b/mcs/class/corlib/System.Text/UTF8Encoding.cs
index ecc3998979a..9faa7cac64f 100644
--- a/mcs/class/corlib/System.Text/UTF8Encoding.cs
+++ b/mcs/class/corlib/System.Text/UTF8Encoding.cs
@@ -70,11 +70,9 @@ public class UTF8Encoding : Encoding
 		windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
 	}
 
-	#region GetByteCount()
-
 	// Internal version of "GetByteCount" which can handle a rolling
 	// state between multiple calls to this method.
-	private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
+	private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
 	{
 		// Validate the parameters.
 		if (chars == null) {
@@ -87,66 +85,27 @@ public class UTF8Encoding : Encoding
 			throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
 		}
 
-		if (index == chars.Length) {
-			if (flush && leftOver != '\0') {
-				// Flush the left-over surrogate pair start.
-				leftOver = '\0';
-				return 3;
-			}
-			return 0;
-		}
-
-		unsafe {
-			fixed (char* cptr = chars) {
-				return InternalGetByteCount (cptr + index, count, ref leftOver, flush);
-			}
-		}
-	}
-
-
-	private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
-	{
-		int index = 0;
-
 		// Determine the lengths of all characters.
 		char ch;
 		int length = 0;
-		char pair = leftOver;
+		uint pair = leftOver;
 		while (count > 0) {
 			ch = chars[index];
 			if (pair == 0) {
 				if (ch < '\u0080') {
-					// fast path optimization
-					int end = index + count;
-					for (; index < end; index++, count--) {
-						if (chars [index] < '\x80')
-							++length;
-						else
-							break;
-					}
-					continue;
-					//length++;
+					++length;
 				} else if (ch < '\u0800') {
 					length += 2;
 				} else if (ch >= '\uD800' && ch <= '\uDBFF') {
 					// This is the start of a surrogate pair.
-					pair = ch;
+					pair = (uint)ch;
 				} else {
 					length += 3;
 				}
 			} else if (ch >= '\uDC00' && ch <= '\uDFFF') {
-				if (pair != 0) {
-					// We have a surrogate pair.
-					length += 4;
-					pair = '\0';
-				} else {
-					// We have a surrogate tail without 
-					// leading surrogate. In NET_2_0 it
-					// uses fallback. In NET_1_1 we output
-					// wrong surrogate.
-					length += 3;
-					pair = '\0';
-				}
+				// We have a surrogate pair.
+				length += 4;
+				pair = 0;
 			} else {
 				// We have a surrogate start followed by a
 				// regular character.  Technically, this is
@@ -154,20 +113,16 @@ public class UTF8Encoding : Encoding
 				// We write out the surrogate start and then
 				// re-visit the current character again.
 				length += 3;
-				pair = '\0';
+				pair = 0;
 				continue;
 			}
 			++index;
 			--count;
 		}
-		if (flush) {
-			if (pair != '\0')
-				// Flush the left-over surrogate pair start.
-				length += 3;
-			leftOver = '\0';
+		if (flush && pair != 0) {
+			// Flush the left-over surrogate pair start.
+			length += 3;
 		}
-		else
-			leftOver = pair;
 
 		// Return the final length to the caller.
 		return length;
@@ -176,8 +131,7 @@ public class UTF8Encoding : Encoding
 	// Get the number of bytes needed to encode a character buffer.
 	public override int GetByteCount (char[] chars, int index, int count)
 	{
-		char dummy = '\0';
-		return InternalGetByteCount (chars, index, count, ref dummy, true);
+		return InternalGetByteCount (chars, index, count, 0, true);  // leftOver 0 = no pending surrogate; flush = true so a trailing lone surrogate start is counted (3 bytes)
 	}
 
 	// Convenience wrappers for "GetByteCount".
@@ -188,23 +142,43 @@ public class UTF8Encoding : Encoding
 			throw new ArgumentNullException ("s");
 		}
 
-		unsafe {
-			fixed (char* cptr = s) {
-				char dummy = '\0';
-				return InternalGetByteCount (cptr, s.Length, ref dummy, true);
+		// Determine the lengths of all characters.
+		char ch;
+		int index = 0;
+		int count = s.Length;
+		int length = 0;
+		uint pair;
+		while (count > 0) {
+			ch = s[index++];
+			if (ch < '\u0080') {
+				++length;
+			} else if (ch < '\u0800') {
+				length += 2;
+			} else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
+				// This may be the start of a surrogate pair.
+				pair = (uint)(s[index]);
+				if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
+					length += 4;
+					++index;
+					--count;
+				} else {
+					length += 3;
+				}
+			} else {
+				length += 3;
 			}
+			--count;
 		}
-	}
-
-	#endregion
 
-	#region GetBytes()
+		// Return the final length to the caller.
+		return length;
+	}
 
 	// Internal version of "GetBytes" which can handle a rolling
 	// state between multiple calls to this method.
 	private static int InternalGetBytes (char[] chars, int charIndex,
 					     int charCount, byte[] bytes,
-					     int byteIndex, ref char leftOver,
+					     int byteIndex, ref uint leftOver,
 					     bool flush)
 	{
 		// Validate the parameters.
@@ -224,175 +198,93 @@ public class UTF8Encoding : Encoding
 			throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
 		}
 
-		if (charIndex == chars.Length) {
-			if (flush && leftOver != '\0') {
-#if NET_2_0
-				// FIXME: use EncoderFallback.
-				//
-				// By default it is empty, so I do nothing for now.
-				leftOver = '\0';
-#else
-				// Flush the left-over surrogate pair start.
-				if (byteIndex >= bytes.Length - 3)
-					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-				bytes [byteIndex++] = 0xEF;
-				bytes [byteIndex++] = 0xBB;
-				bytes [byteIndex++] = 0xBF;
-				leftOver = '\0';
-				return 3;
-#endif
-			}
-			return 0;
-		}
-
-		unsafe {
-			fixed (char* cptr = chars) {
-				if (bytes.Length == byteIndex)
-					return InternalGetBytes (
-						cptr + charIndex, charCount, 
-						null, 0, ref leftOver, flush);
-				fixed (byte *bptr = bytes) {
-					return InternalGetBytes (
-						cptr + charIndex, charCount,
-						bptr + byteIndex, bytes.Length - byteIndex,
-						ref leftOver, flush);
-				}
-			}
-		}
-	}
-
-	private unsafe static int InternalGetBytes (char* chars, int charCount,
-					     byte* bytes, int byteCount,
-					     ref char leftOver, bool flush)
-	{
-		int charIndex = 0;
-		int byteIndex = 0;
-
-		// Convert the characters into bytes.
 		// Convert the characters into bytes.
 		char ch;
-		int length = byteCount;
-		char pair = leftOver;
+		int length = bytes.Length;
+		uint pair;
+		uint left = leftOver;
 		int posn = byteIndex;
-		int code = 0;
-
 		while (charCount > 0) {
 			// Fetch the next UTF-16 character pair value.
-			ch = chars [charIndex];
-			if (pair == '\0') {
-				if (ch < '\uD800' || ch >= '\uE000') {
-					if (ch < '\x80') { // fast path optimization
-						int end = charIndex + charCount;
-						for (; charIndex < end; posn++, charIndex++, charCount--) {
-							if (chars [charIndex] < '\x80')
-								bytes [posn] = (byte) chars [charIndex];
-							else
-								break;
-						}
-						continue;
-					}
-					code = ch;
-				}
-				else if (ch < '\uDC00') {
-					// surrogate start
-					pair = ch;
-					++charIndex;
-					--charCount;
-					continue;
-				} else { // ch <= '\uDFFF'
-					// We have a surrogate tail without leading 
-					// surrogate. In NET_2_0 it uses fallback.
-					// In NET_1_1 we output wrong surrogate.
-					if (posn > length - 3) {
-						throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-					}
-					bytes [posn++] = (byte) (0xE0 | (ch >> 12));
-					bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
-					bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
-					++charIndex;
-					--charCount;
+			ch = chars[charIndex++];
+			--charCount;
+			if (left == 0) {
+				if (ch >= '\uD800' && ch <= '\uDBFF') {
+					// This is the start of a surrogate pair.
+					left = (uint)ch;
 					continue;
+				} else {
+					// This is a regular character.
+					pair = (uint)ch;
 				}
+			} else if (ch >= '\uDC00' && ch <= '\uDFFF') {
+				// We have a surrogate pair.
+				pair = ((left - (uint)0xD800) << 10) +
+					   (((uint)ch) - (uint)0xDC00) +
+					   (uint)0x10000;
+				left = 0;
 			} else {
-				if ('\uDC00' <= ch && ch <= '\uDFFF')
-					code =  0x10000 + (int) ch - 0xDC00 +
-						(((int) pair - 0xD800) << 10);
-				else {
-					// We have a surrogate start followed by a
-					// regular character.  Technically, this is
-					// invalid, but we have to do something.
-					// We write out the surrogate start and then
-					// re-visit the current character again.
-					if (posn > length - 3) {
-						throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-					}
-					bytes [posn++] = (byte) (0xE0 | (pair >> 12));
-					bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
-					bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
-					pair = '\0';
-					continue;
-				}
-				pair = '\0';
+				// We have a surrogate start followed by a
+				// regular character.  Technically, this is
+				// invalid, but we have to do something.
+				// We write out the surrogate start and then
+				// re-visit the current character again.
+				pair = (uint)left;
+				left = 0;
+				--charIndex;
+				++charCount;
 			}
-			++charIndex;
-			--charCount;
 
 			// Encode the character pair value.
-			if (code < 0x0080) {
-				if (posn >= length)
+			if (pair < (uint)0x0080) {
+				if (posn >= length) {
 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-				bytes [posn++] = (byte)code;
-			} else if (code < 0x0800) {
-				if ((posn + 2) > length)
+				}
+				bytes[posn++] = (byte)pair;
+			} else if (pair < (uint)0x0800) {
+				if ((posn + 2) > length) {
 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-				bytes [posn++] = (byte) (0xC0 | (code >> 6));
-				bytes [posn++] = (byte) (0x80 | (code & 0x3F));
-			} else if (code < 0x10000) {
-				if (posn > length - 3)
+				}
+				bytes[posn++] = (byte)(0xC0 | (pair >> 6));
+				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
+			} else if (pair < (uint)0x10000) {
+				if ((posn + 3) > length) {
 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-				bytes [posn++] = (byte) (0xE0 | (code >> 12));
-				bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
-				bytes [posn++] = (byte) (0x80 | (code & 0x3F));
+				}
+				bytes[posn++] = (byte)(0xE0 | (pair >> 12));
+				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
+				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
 			} else {
-				if (posn > length - 4)
+				if ((posn + 4) > length) {
 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-				bytes [posn++] = (byte) (0xF0 | (code >> 18));
-				bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
-				bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
-				bytes [posn++] = (byte) (0x80 | (code & 0x3F));
+				}
+				bytes[posn++] = (byte)(0xF0 | (pair >> 18));
+				bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
+				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
+				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
 			}
 		}
-
-		if (flush) {
-			if (pair != '\0') {
-				// Flush the left-over incomplete surrogate.
-				if (posn > length - 3) {
-					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-				}
-				bytes [posn++] = (byte) (0xE0 | (pair >> 12));
-				bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
-				bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
+		if (flush && left != 0) {
+			// Flush the left-over surrogate pair start.
+			if ((posn + 3) > length) {
+				throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
 			}
-			leftOver = '\0';
+			bytes[posn++] = (byte)(0xE0 | (left >> 12));
+			bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
+			bytes[posn++] = (byte)(0x80 | (left & 0x3F));
+			left = 0;
 		}
-		else
-			leftOver = pair;
-Char.IsLetterOrDigit (pair);
+		leftOver = left;
 
 		// Return the final count to the caller.
 		return posn - byteIndex;
 	}
 
-	private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
-	{
-		throw new NotImplementedException ();
-	}
-
 	// Get the bytes that result from encoding a character buffer.
 	public override int GetBytes (char[] chars, int charIndex, int charCount,
 								 byte[] bytes, int byteIndex)
 	{
-		char leftOver = '\0';
+		uint leftOver = 0;  // one-shot conversion: start with no pending surrogate; the value written back is discarded
 		return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
 	}
 
@@ -417,31 +309,70 @@ Char.IsLetterOrDigit (pair);
 			throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
 		}
 
-		if (charIndex == s.Length)
-			return 0;
+		// Convert the characters into bytes.
+		char ch;
+		int length = bytes.Length;
+		uint pair;
+		int posn = byteIndex;
+		while (charCount > 0) {
+			// Fetch the next UTF-16 character pair value.
+			ch = s[charIndex++];
+			if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
+				// This may be the start of a surrogate pair.
+				pair = (uint)(s[charIndex]);
+				if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
+					pair = (pair - (uint)0xDC00) +
+						   ((((uint)ch) - (uint)0xD800) << 10) +
+						   (uint)0x10000;
+					++charIndex;
+					--charCount;
+				} else {
+					pair = (uint)ch;
+				}
+			} else {
+				pair = (uint)ch;
+			}
+			--charCount;
 
-		unsafe {
-			fixed (char* cptr = s) {
-				char dummy = '\0';
-				if (bytes.Length == byteIndex)
-					return InternalGetBytes (
-						cptr + charIndex, charCount,
-						null, 0, ref dummy, true);
-				fixed (byte *bptr = bytes) {
-					return InternalGetBytes (
-						cptr + charIndex, charCount,
-						bptr + byteIndex, bytes.Length - byteIndex,
-						ref dummy, true);
+			// Encode the character pair value.
+			if (pair < (uint)0x0080) {
+				if (posn >= length) {
+					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
 				}
+				bytes[posn++] = (byte)pair;
+			} else if (pair < (uint)0x0800) {
+				if ((posn + 2) > length) {
+					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+				}
+				bytes[posn++] = (byte)(0xC0 | (pair >> 6));
+				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
+			} else if (pair < (uint)0x10000) {
+				if ((posn + 3) > length) {
+					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+				}
+				bytes[posn++] = (byte)(0xE0 | (pair >> 12));
+				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
+				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
+			} else {
+				if ((posn + 4) > length) {
+					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+				}
+				bytes[posn++] = (byte)(0xF0 | (pair >> 18));
+				bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
+				bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
+				bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
 			}
 		}
-	}
 
-	#endregion
+		// Return the final count to the caller.
+		return posn - byteIndex;
+	}
 
 	// Internal version of "GetCharCount" which can handle a rolling
 	// state between multiple calls to this method.
 #if NET_2_0
+	// NET_2_0 signature: "provider" supplies the fallback buffer that the
+	// Fallback() helper consults for bytes that cannot be decoded.
 	private static int InternalGetCharCount (
 		byte[] bytes, int index, int count, uint leftOverBits,
 		uint leftOverCount, object provider,
@@ -463,20 +394,9 @@ Char.IsLetterOrDigit (pair);
 			throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
 		}
 
-		int length = 0;
-
-		if (leftOverCount == 0) {
-			int end = index + count;
-			for (; index < end; index++, count--) {
-				if (bytes [index] < 0x80)
-					length++;
-				else
-					break;
-			}
-		}
-
 		// Determine the number of characters that we have.
 		uint ch;
+		int length = 0;
 		uint leftBits = leftOverBits;
 		uint leftSoFar = (leftOverCount & (uint)0x0F);
 		uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
@@ -588,7 +508,7 @@ Char.IsLetterOrDigit (pair);
 			// We had left-over bytes that didn't make up
 			// a complete UTF-8 character sequence.
 #if NET_2_0
-			length += Fallback (provider, ref fallbackBuffer, bytes, index);
+			length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
 #else
 			if (throwOnInvalid)
 				throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -610,7 +530,7 @@ Char.IsLetterOrDigit (pair);
 			else
 				buffer = ((Decoder) provider).FallbackBuffer;
 		}
-		buffer.Fallback (bytes, index);
+		buffer.Fallback (bytes, index - 1);
 		return buffer.Remaining;
 	}
 
@@ -625,7 +545,7 @@ Char.IsLetterOrDigit (pair);
 			else
 				buffer = ((Decoder) provider).FallbackBuffer;
 		}
-		buffer.Fallback (bytes, byteIndex);
+		buffer.Fallback (bytes, byteIndex - 1);
 		while (buffer.Remaining > 0)
 			chars [charIndex++] = buffer.GetNextChar ();
 	}
@@ -676,21 +596,10 @@ Char.IsLetterOrDigit (pair);
 		if (charIndex == chars.Length)
 			return 0;
 
-		int posn = charIndex;
-
-		if (leftOverCount == 0) {
-			int end = byteIndex + byteCount;
-			for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
-				if (bytes [byteIndex] < 0x80)
-					chars [posn] = (char) bytes [byteIndex];
-				else
-					break;
-			}
-		}
-
 		// Convert the bytes into the output buffer.
 		uint ch;
 		int length = chars.Length;
+		int posn = charIndex;
 		uint leftBits = leftOverBits;
 		uint leftSoFar = (leftOverCount & (uint)0x0F);
 		uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
@@ -1005,48 +914,30 @@ Char.IsLetterOrDigit (pair);
 	private class UTF8Encoder : Encoder
 	{
 		private bool emitIdentifier;
-		private char leftOverForCount;
-		private char leftOverForConv;
+		private uint leftOver;  // surrogate start left pending by the last GetBytes call; 0 = none
 
 		// Constructor.
 		public UTF8Encoder (bool emitIdentifier)
 		{
 			this.emitIdentifier = emitIdentifier;
-			leftOverForCount = '\0';
-			leftOverForConv = '\0';
+			leftOver = 0;  // no surrogate pending on a fresh encoder
 		}
 
 		// Override inherited methods.
 		public override int GetByteCount (char[] chars, int index,
 					 int count, bool flush)
 		{
-			return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
+			return InternalGetByteCount (chars, index, count, leftOver, flush);  // passed by value: counting never alters the encoder state
 		}
 		public override int GetBytes (char[] chars, int charIndex,
 					 int charCount, byte[] bytes, int byteIndex, bool flush)
 		{
 			int result;
-			result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
+			result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, flush);  // byteIndex is an offset into bytes, not a capacity; leftOver is updated for the next call
 			emitIdentifier = false;
 			return result;
 		}
 
-#if NET_2_0
-		public unsafe override int GetByteCount (char* chars, int count, bool flush)
-		{
-			return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
-		}
-
-		public unsafe override int GetBytes (char* chars, int charCount,
-			byte* bytes, int byteCount, bool flush)
-		{
-			int result;
-			result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
-			emitIdentifier = false;
-			return result;
-		}
-#endif
-
 	} // class UTF8Encoder
 
 }; // class UTF8Encoding