diff options
author | Calvin <calvin@cmpct.info> | 2018-06-03 05:03:12 +0300 |
---|---|---|
committer | Marek Safar <marek.safar@gmail.com> | 2018-06-03 09:01:57 +0300 |
commit | 3a3f1bc99878a0bfb82aea366805f8ec00c38e17 (patch) | |
tree | 9023fb21444d49568adc475981062061bd4515f6 | |
parent | 957ba72461bc1a6b69fb7e120b906b5cd98cfb35 (diff) |
Try to fix issues with Unicode on big endian OSes (dotnet/coreclr#18254)
Mono had some patches in its fork of referencesource to resolve
issues on big-endian platforms. Essentially, compile-time endianness
handling does not work for big-endian platforms, because they have to
consume a Monolite intended for all platforms in order to bootstrap.
As a result, all big-endian platforms are now consuming a bootstrap tarball
made on little-endian systems, so the endianness baked in at compile time is wrong for them, and bad things happen as a result.
This makes it so that endianness is checked at runtime by using
System.BitConverter.IsLittleEndian, not by a compiler definition.
This integrates mono/mono@1f9b218 and mono/mono@92cec46, which won't
apply cleanly to current CoreFX.
mono/mono#8679 may get fixed by this, but as the comments say,
there could be more patches missing. This just integrates the
patches known to me on UnicodeEncoding and UTF8Encoding.
Signed-off-by: dotnet-bot-corefx-mirror <dotnet-bot@microsoft.com>
-rw-r--r-- | src/Common/src/CoreLib/System/Text/UTF8Encoding.cs | 190 | ||||
-rw-r--r-- | src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs | 58 |
2 files changed, 124 insertions, 124 deletions
diff --git a/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs b/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs index b9e08f9040..5446058721 100644 --- a/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs +++ b/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs @@ -719,12 +719,15 @@ namespace System.Text break; LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (char)ch; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + ch = (char)ch; + } + else + { + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); + } pSrc++; if (ch <= 0x7F) @@ -1146,31 +1149,37 @@ namespace System.Text } // Unfortunately, this is endianess sensitive -#if BIGENDIAN - *pTarget = (byte)(ch>>16); - *(pTarget+1) = (byte)ch; - pSrc += 4; - *(pTarget+2) = (byte)(chc>>16); - *(pTarget+3) = (byte)chc; - pTarget += 4; -#else // BIGENDIAN - *pTarget = (byte)ch; - *(pTarget + 1) = (byte)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (byte)chc; - *(pTarget + 3) = (byte)(chc >> 16); - pTarget += 4; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + *pTarget = (byte)ch; + *(pTarget + 1) = (byte)(ch >> 16); + pSrc += 4; + *(pTarget + 2) = (byte)chc; + *(pTarget + 3) = (byte)(chc >> 16); + pTarget += 4; + } + else + { + *pTarget = (byte)(ch>>16); + *(pTarget+1) = (byte)ch; + pSrc += 4; + *(pTarget+2) = (byte)(chc>>16); + *(pTarget+3) = (byte)chc; + pTarget += 4; + } } continue; LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (char)ch; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + ch = (char)ch; + } + else + { + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); + } pSrc++; if (ch > 0x7F) @@ -1568,17 +1577,26 @@ namespace System.Text } break; -#if BIGENDIAN - LongCodeWithMask32: - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - 
LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN LongCodeWithMask32: + if (BitConverter.IsLittleEndian) + { + ch &= 0xFF; + } + else + { + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); + } LongCodeWithMask16: - ch &= 0xFF; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + ch &= 0xFF; + } + else + { + ch = (int)(((uint)ch) >> 8); + } + pSrc++; if (ch <= 0x7F) { @@ -2052,17 +2070,20 @@ namespace System.Text } // Unfortunately, this is endianess sensitive -#if BIGENDIAN - *pTarget = (char)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget+1) = (char)(ch & 0x7F); - pTarget += 2; -#else // BIGENDIAN - *pTarget = (char)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (char)((ch >> 8) & 0x7F); - pTarget += 2; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + *pTarget = (char)(ch & 0x7F); + pSrc += 2; + *(pTarget + 1) = (char)((ch >> 8) & 0x7F); + pTarget += 2; + } + else + { + *pTarget = (char)((ch >> 8) & 0x7F); + pSrc += 2; + *(pTarget+1) = (char)(ch & 0x7F); + pTarget += 2; + } } // Run 8 characters at a time! 
@@ -2076,43 +2097,54 @@ namespace System.Text } // Unfortunately, this is endianess sensitive -#if BIGENDIAN - *pTarget = (char)((ch >> 24) & 0x7F); - *(pTarget+1) = (char)((ch >> 16) & 0x7F); - *(pTarget+2) = (char)((ch >> 8) & 0x7F); - *(pTarget+3) = (char)(ch & 0x7F); - pSrc += 8; - *(pTarget+4) = (char)((chb >> 24) & 0x7F); - *(pTarget+5) = (char)((chb >> 16) & 0x7F); - *(pTarget+6) = (char)((chb >> 8) & 0x7F); - *(pTarget+7) = (char)(chb & 0x7F); - pTarget += 8; -#else // BIGENDIAN - *pTarget = (char)(ch & 0x7F); - *(pTarget + 1) = (char)((ch >> 8) & 0x7F); - *(pTarget + 2) = (char)((ch >> 16) & 0x7F); - *(pTarget + 3) = (char)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (char)(chb & 0x7F); - *(pTarget + 5) = (char)((chb >> 8) & 0x7F); - *(pTarget + 6) = (char)((chb >> 16) & 0x7F); - *(pTarget + 7) = (char)((chb >> 24) & 0x7F); - pTarget += 8; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + *pTarget = (char)(ch & 0x7F); + *(pTarget + 1) = (char)((ch >> 8) & 0x7F); + *(pTarget + 2) = (char)((ch >> 16) & 0x7F); + *(pTarget + 3) = (char)((ch >> 24) & 0x7F); + pSrc += 8; + *(pTarget + 4) = (char)(chb & 0x7F); + *(pTarget + 5) = (char)((chb >> 8) & 0x7F); + *(pTarget + 6) = (char)((chb >> 16) & 0x7F); + *(pTarget + 7) = (char)((chb >> 24) & 0x7F); + pTarget += 8; + } + else + { + *pTarget = (char)((ch >> 24) & 0x7F); + *(pTarget+1) = (char)((ch >> 16) & 0x7F); + *(pTarget+2) = (char)((ch >> 8) & 0x7F); + *(pTarget+3) = (char)(ch & 0x7F); + pSrc += 8; + *(pTarget+4) = (char)((chb >> 24) & 0x7F); + *(pTarget+5) = (char)((chb >> 16) & 0x7F); + *(pTarget+6) = (char)((chb >> 8) & 0x7F); + *(pTarget+7) = (char)(chb & 0x7F); + pTarget += 8; + } } break; -#if BIGENDIAN - LongCodeWithMask32: - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN LongCodeWithMask32: + if (BitConverter.IsLittleEndian) + { + ch &= 0xFF; + } + else + { + // be careful about the sign 
extension + ch = (int)(((uint)ch) >> 16); + } LongCodeWithMask16: - ch &= 0xFF; -#endif // BIGENDIAN + if (BitConverter.IsLittleEndian) + { + ch &= 0xFF; + } + else + { + ch = (int)(((uint)ch) >> 8); + } pSrc++; if (ch <= 0x7F) { diff --git a/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs b/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs index 4895a21280..049e1decdc 100644 --- a/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs +++ b/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs @@ -35,6 +35,8 @@ namespace System.Text // Unicode version 2.0 character size in bytes public const int CharSize = 2; + // endianness-based bit pattern mask. + static readonly ulong highLowPatternMask = ((ulong) 0xd800d800d800d800 | (BitConverter.IsLittleEndian ? (ulong) 0x0400000004000000 : (ulong) 0x0000040000000400)); public UnicodeEncoding() : this(false, true) @@ -415,11 +417,8 @@ namespace System.Text { // No fallback, maybe we can do it fast #if !NO_FAST_UNICODE_LOOP -#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards. - if ( bigEndian && -#else - if (!bigEndian && -#endif // BIGENDIAN + // If endianess is backwards then each pair of bytes would be backwards. + if ( (bigEndian ^ BitConverter.IsLittleEndian) && #if BIT64 // 64 bit CPU needs to be long aligned for this to work. charLeftOver == 0 && (unchecked((long)chars) & 7) == 0) @@ -457,11 +456,7 @@ namespace System.Text // If they happen to be high/low/high/low, we may as well continue. Check the next // bit to see if its set (low) or not (high) in the right pattern -#if BIGENDIAN - if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0) -#else - if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0) -#endif + if (((0xfc00fc00fc00fc00 & *longChars) ^ highLowPatternMask) != 0) { // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. 
@@ -705,11 +700,8 @@ namespace System.Text { // No fallback, maybe we can do it fast #if !NO_FAST_UNICODE_LOOP -#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards. - if ( bigEndian && -#else - if (!bigEndian && -#endif // BIGENDIAN + // If endianess is backwards then each pair of bytes would be backwards. + if ( (bigEndian ^ BitConverter.IsLittleEndian) && #if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 && #else @@ -756,11 +748,7 @@ namespace System.Text // If they happen to be high/low/high/low, we may as well continue. Check the next // bit to see if its set (low) or not (high) in the right pattern -#if BIGENDIAN - if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0) -#else - if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0) -#endif + if (((0xfc00fc00fc00fc00 & *longChars) ^ highLowPatternMask) != 0) { // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. @@ -790,11 +778,7 @@ namespace System.Text // Also somehow this optimizes the above loop? It seems to cause something above // to get enregistered, but I haven't figured out how to make that happen without this loop. 
else if ((charLeftOver == 0) && -#if BIGENDIAN - bigEndian && -#else - !bigEndian && -#endif // BIGENDIAN + (bigEndian ^ BitConverter.IsLittleEndian) && #if BIT64 (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time @@ -1131,11 +1115,7 @@ namespace System.Text // If we're aligned then maybe we can do it fast // That'll hurt if we're unaligned because we'll always test but never be aligned #if !NO_FAST_UNICODE_LOOP -#if BIGENDIAN - if (bigEndian && -#else // BIGENDIAN - if (!bigEndian && -#endif // BIGENDIAN + if ((bigEndian ^ BitConverter.IsLittleEndian) && #if BIT64 // win64 has to be long aligned (unchecked((long)bytes) & 7) == 0 && #else @@ -1173,11 +1153,7 @@ namespace System.Text // If they happen to be high/low/high/low, we may as well continue. Check the next // bit to see if its set (low) or not (high) in the right pattern -#if BIGENDIAN - if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0) -#else - if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0) -#endif + if (((0xfc00fc00fc00fc00 & *longBytes) ^ highLowPatternMask) != 0) { // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. @@ -1454,11 +1430,7 @@ namespace System.Text // If we're aligned then maybe we can do it fast // That'll hurt if we're unaligned because we'll always test but never be aligned #if !NO_FAST_UNICODE_LOOP -#if BIGENDIAN - if (bigEndian && -#else // BIGENDIAN - if (!bigEndian && -#endif // BIGENDIAN + if ((bigEndian ^ BitConverter.IsLittleEndian) && #if BIT64 // win64 has to be long aligned (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 && #else @@ -1505,11 +1477,7 @@ namespace System.Text // If they happen to be high/low/high/low, we may as well continue. 
Check the next // bit to see if its set (low) or not (high) in the right pattern -#if BIGENDIAN - if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0) -#else - if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0) -#endif + if (((0xfc00fc00fc00fc00 & *longBytes) ^ highLowPatternMask) != 0) { // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. |