Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/corefx.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Calvin <calvin@cmpct.info>, 2018-06-03 05:03:12 +0300
committer: Marek Safar <marek.safar@gmail.com>, 2018-06-03 09:01:57 +0300
commit: 3a3f1bc99878a0bfb82aea366805f8ec00c38e17 (patch)
tree: 9023fb21444d49568adc475981062061bd4515f6
parent: 957ba72461bc1a6b69fb7e120b906b5cd98cfb35 (diff)
Try to fix issues with Unicode on big endian OSes (dotnet/coreclr#18254)
Mono had some patches in their fork of referencesource to resolve issues on big endian.

Essentially, compile-time endianness handling is no good for big endian platforms, because they have to consume a Monolite intended for all platforms in order to bootstrap. So then, all big endian platforms are now consuming a bootstrap tarball made on little endian systems, and bad things happen as a result.

This makes it so that endianness is checked at runtime by using System.BitConverter.IsLittleEndian, not by a compiler definition.

This integrates mono/mono@1f9b218 and mono/mono@92cec46, which won't apply cleanly to current CoreFX.

mono/mono#8679 may get fixed by this, but as the comments say, there could be more patches missing. This just integrates the patches known to me on UnicodeEncoding and UTF8Encoding.

Signed-off-by: dotnet-bot-corefx-mirror <dotnet-bot@microsoft.com>
-rw-r--r-- src/Common/src/CoreLib/System/Text/UTF8Encoding.cs 190
-rw-r--r-- src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs 58
2 files changed, 124 insertions, 124 deletions
diff --git a/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs b/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs
index b9e08f9040..5446058721 100644
--- a/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs
+++ b/src/Common/src/CoreLib/System/Text/UTF8Encoding.cs
@@ -719,12 +719,15 @@ namespace System.Text
break;
LongCodeWithMask:
-#if BIGENDIAN
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
-#else // BIGENDIAN
- ch = (char)ch;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ ch = (char)ch;
+ }
+ else
+ {
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+ }
pSrc++;
if (ch <= 0x7F)
@@ -1146,31 +1149,37 @@ namespace System.Text
}
// Unfortunately, this is endianess sensitive
-#if BIGENDIAN
- *pTarget = (byte)(ch>>16);
- *(pTarget+1) = (byte)ch;
- pSrc += 4;
- *(pTarget+2) = (byte)(chc>>16);
- *(pTarget+3) = (byte)chc;
- pTarget += 4;
-#else // BIGENDIAN
- *pTarget = (byte)ch;
- *(pTarget + 1) = (byte)(ch >> 16);
- pSrc += 4;
- *(pTarget + 2) = (byte)chc;
- *(pTarget + 3) = (byte)(chc >> 16);
- pTarget += 4;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ *pTarget = (byte)ch;
+ *(pTarget + 1) = (byte)(ch >> 16);
+ pSrc += 4;
+ *(pTarget + 2) = (byte)chc;
+ *(pTarget + 3) = (byte)(chc >> 16);
+ pTarget += 4;
+ }
+ else
+ {
+ *pTarget = (byte)(ch>>16);
+ *(pTarget+1) = (byte)ch;
+ pSrc += 4;
+ *(pTarget+2) = (byte)(chc>>16);
+ *(pTarget+3) = (byte)chc;
+ pTarget += 4;
+ }
}
continue;
LongCodeWithMask:
-#if BIGENDIAN
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
-#else // BIGENDIAN
- ch = (char)ch;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ ch = (char)ch;
+ }
+ else
+ {
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+ }
pSrc++;
if (ch > 0x7F)
@@ -1568,17 +1577,26 @@ namespace System.Text
}
break;
-#if BIGENDIAN
- LongCodeWithMask32:
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- LongCodeWithMask16:
- ch = (int)(((uint)ch) >> 8);
-#else // BIGENDIAN
LongCodeWithMask32:
+ if (BitConverter.IsLittleEndian)
+ {
+ ch &= 0xFF;
+ }
+ else
+ {
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+ }
LongCodeWithMask16:
- ch &= 0xFF;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ ch &= 0xFF;
+ }
+ else
+ {
+ ch = (int)(((uint)ch) >> 8);
+ }
+
pSrc++;
if (ch <= 0x7F)
{
@@ -2052,17 +2070,20 @@ namespace System.Text
}
// Unfortunately, this is endianess sensitive
-#if BIGENDIAN
- *pTarget = (char)((ch >> 8) & 0x7F);
- pSrc += 2;
- *(pTarget+1) = (char)(ch & 0x7F);
- pTarget += 2;
-#else // BIGENDIAN
- *pTarget = (char)(ch & 0x7F);
- pSrc += 2;
- *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
- pTarget += 2;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ *pTarget = (char)(ch & 0x7F);
+ pSrc += 2;
+ *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
+ pTarget += 2;
+ }
+ else
+ {
+ *pTarget = (char)((ch >> 8) & 0x7F);
+ pSrc += 2;
+ *(pTarget+1) = (char)(ch & 0x7F);
+ pTarget += 2;
+ }
}
// Run 8 characters at a time!
@@ -2076,43 +2097,54 @@ namespace System.Text
}
// Unfortunately, this is endianess sensitive
-#if BIGENDIAN
- *pTarget = (char)((ch >> 24) & 0x7F);
- *(pTarget+1) = (char)((ch >> 16) & 0x7F);
- *(pTarget+2) = (char)((ch >> 8) & 0x7F);
- *(pTarget+3) = (char)(ch & 0x7F);
- pSrc += 8;
- *(pTarget+4) = (char)((chb >> 24) & 0x7F);
- *(pTarget+5) = (char)((chb >> 16) & 0x7F);
- *(pTarget+6) = (char)((chb >> 8) & 0x7F);
- *(pTarget+7) = (char)(chb & 0x7F);
- pTarget += 8;
-#else // BIGENDIAN
- *pTarget = (char)(ch & 0x7F);
- *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
- *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
- *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
- pSrc += 8;
- *(pTarget + 4) = (char)(chb & 0x7F);
- *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
- *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
- *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
- pTarget += 8;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ *pTarget = (char)(ch & 0x7F);
+ *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
+ *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
+ *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
+ pSrc += 8;
+ *(pTarget + 4) = (char)(chb & 0x7F);
+ *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
+ *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
+ *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
+ pTarget += 8;
+ }
+ else
+ {
+ *pTarget = (char)((ch >> 24) & 0x7F);
+ *(pTarget+1) = (char)((ch >> 16) & 0x7F);
+ *(pTarget+2) = (char)((ch >> 8) & 0x7F);
+ *(pTarget+3) = (char)(ch & 0x7F);
+ pSrc += 8;
+ *(pTarget+4) = (char)((chb >> 24) & 0x7F);
+ *(pTarget+5) = (char)((chb >> 16) & 0x7F);
+ *(pTarget+6) = (char)((chb >> 8) & 0x7F);
+ *(pTarget+7) = (char)(chb & 0x7F);
+ pTarget += 8;
+ }
}
break;
-#if BIGENDIAN
- LongCodeWithMask32:
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- LongCodeWithMask16:
- ch = (int)(((uint)ch) >> 8);
-#else // BIGENDIAN
LongCodeWithMask32:
+ if (BitConverter.IsLittleEndian)
+ {
+ ch &= 0xFF;
+ }
+ else
+ {
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+ }
LongCodeWithMask16:
- ch &= 0xFF;
-#endif // BIGENDIAN
+ if (BitConverter.IsLittleEndian)
+ {
+ ch &= 0xFF;
+ }
+ else
+ {
+ ch = (int)(((uint)ch) >> 8);
+ }
pSrc++;
if (ch <= 0x7F)
{
diff --git a/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs b/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs
index 4895a21280..049e1decdc 100644
--- a/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs
+++ b/src/Common/src/CoreLib/System/Text/UnicodeEncoding.cs
@@ -35,6 +35,8 @@ namespace System.Text
// Unicode version 2.0 character size in bytes
public const int CharSize = 2;
+ // endianness-based bit pattern mask.
+ static readonly ulong highLowPatternMask = ((ulong) 0xd800d800d800d800 | (BitConverter.IsLittleEndian ? (ulong) 0x0400000004000000 : (ulong) 0x0000040000000400));
public UnicodeEncoding()
: this(false, true)
@@ -415,11 +417,8 @@ namespace System.Text
{
// No fallback, maybe we can do it fast
#if !NO_FAST_UNICODE_LOOP
-#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards.
- if ( bigEndian &&
-#else
- if (!bigEndian &&
-#endif // BIGENDIAN
+ // If endianess is backwards then each pair of bytes would be backwards.
+ if ( (bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64 // 64 bit CPU needs to be long aligned for this to work.
charLeftOver == 0 && (unchecked((long)chars) & 7) == 0)
@@ -457,11 +456,7 @@ namespace System.Text
// If they happen to be high/low/high/low, we may as well continue. Check the next
// bit to see if its set (low) or not (high) in the right pattern
-#if BIGENDIAN
- if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
-#else
- if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
-#endif
+ if (((0xfc00fc00fc00fc00 & *longChars) ^ highLowPatternMask) != 0)
{
// Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
// was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -705,11 +700,8 @@ namespace System.Text
{
// No fallback, maybe we can do it fast
#if !NO_FAST_UNICODE_LOOP
-#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards.
- if ( bigEndian &&
-#else
- if (!bigEndian &&
-#endif // BIGENDIAN
+ // If endianess is backwards then each pair of bytes would be backwards.
+ if ( (bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned
(unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
#else
@@ -756,11 +748,7 @@ namespace System.Text
// If they happen to be high/low/high/low, we may as well continue. Check the next
// bit to see if its set (low) or not (high) in the right pattern
-#if BIGENDIAN
- if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
-#else
- if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
-#endif
+ if (((0xfc00fc00fc00fc00 & *longChars) ^ highLowPatternMask) != 0)
{
// Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
// was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -790,11 +778,7 @@ namespace System.Text
// Also somehow this optimizes the above loop? It seems to cause something above
// to get enregistered, but I haven't figured out how to make that happen without this loop.
else if ((charLeftOver == 0) &&
-#if BIGENDIAN
- bigEndian &&
-#else
- !bigEndian &&
-#endif // BIGENDIAN
+ (bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64
(unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time
@@ -1131,11 +1115,7 @@ namespace System.Text
// If we're aligned then maybe we can do it fast
// That'll hurt if we're unaligned because we'll always test but never be aligned
#if !NO_FAST_UNICODE_LOOP
-#if BIGENDIAN
- if (bigEndian &&
-#else // BIGENDIAN
- if (!bigEndian &&
-#endif // BIGENDIAN
+ if ((bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64 // win64 has to be long aligned
(unchecked((long)bytes) & 7) == 0 &&
#else
@@ -1173,11 +1153,7 @@ namespace System.Text
// If they happen to be high/low/high/low, we may as well continue. Check the next
// bit to see if its set (low) or not (high) in the right pattern
-#if BIGENDIAN
- if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
-#else
- if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
-#endif
+ if (((0xfc00fc00fc00fc00 & *longBytes) ^ highLowPatternMask) != 0)
{
// Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
// was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -1454,11 +1430,7 @@ namespace System.Text
// If we're aligned then maybe we can do it fast
// That'll hurt if we're unaligned because we'll always test but never be aligned
#if !NO_FAST_UNICODE_LOOP
-#if BIGENDIAN
- if (bigEndian &&
-#else // BIGENDIAN
- if (!bigEndian &&
-#endif // BIGENDIAN
+ if ((bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64 // win64 has to be long aligned
(unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
#else
@@ -1505,11 +1477,7 @@ namespace System.Text
// If they happen to be high/low/high/low, we may as well continue. Check the next
// bit to see if its set (low) or not (high) in the right pattern
-#if BIGENDIAN
- if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
-#else
- if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
-#endif
+ if (((0xfc00fc00fc00fc00 & *longBytes) ^ highLowPatternMask) != 0)
{
// Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
// was hoped for or the 0x0400 bit wasn't set where a low was hoped for.