2005-06-28 Atsushi Enomoto <atsushi@ximian.com>

* create-mscompat-collation-table.cs : Fixed 0x8 category characters. Added hack for need-to-be-fixed characters to fall into 0xA category. * create-collation-element-table.cs : previous checkin seem failed :( * README: updated a bit. svn path=/trunk/mcs/; revision=46624
author: Atsushi Eno <atsushieno@gmail.com> 2005-06-28 11:23:06 +0400
committer: Atsushi Eno <atsushieno@gmail.com> 2005-06-28 11:23:06 +0400
commit: dd9256ba20952bebea867eef89ad855d6d7fb802 (patch)
tree: 96dab187514b86df30a4983535e3706a7c22f838 /mcs/class/corlib/Mono.Globalization.Unicode
parent: 8d1f1e6c468c236a911ccfaf5702b5149b33fe96 (diff)
4 files changed, 105 insertions, 30 deletions
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
index dbaee2fe603..e78193f8409 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,10 @@
+2005-06-28  Atsushi Enomoto  <atsushi@ximian.com>
+
+	* create-mscompat-collation-table.cs : Fixed 0x8 category characters.
+	  Added hack for need-to-be-fixed characters to fall into 0xA category.
+	* create-collation-element-table.cs : previous checkin seem failed :(
+	* README: updated a bit.
+
 2005-06-24  Atsushi Enomoto  <atsushi@ximian.com>
 
 	* CodePointIndexer.cs :
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/README b/mcs/class/corlib/Mono.Globalization.Unicode/README
index be5168547f0..8a00092f51b 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/README
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/README
@@ -1,13 +1,20 @@
 
 This directory contains support for Unicode normalization and collation.
 
-This branch contains Normalization.cs and CollationElementTable.cs which
-can be built from generator executables and Unicode Character Database found
-at unicode.org (see Makefile for details).
+This directory contains several files to be autogenerated:
+
+	- MSCompatUnicodeTable.cs : support for CompareInfo.
+	- Normalization.cs : support for String.Normalize()
+	- CollationElementTable.cs : used by the code generators that generate
+	  the above two sources.
+
+Targets in the Makefile often download dependency files, such as Unicode
+Character Database files. They are fed to source generators, which are
+written in C#.
 
 Many of the files in this directory are subject to change (for example,
-being rewritten to generate C header like char-conversions.h and
-culture-info-table.h).
+being rewritten to generate resource files and possibly C headers like
+char-conversions.h and culture-info-table.h at a later stage).
 
 Atsushi Eno <atsushi@ximian.com>
 
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs
index 3e8694c6836..4bb4134d506 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs
@@ -56,13 +56,13 @@ namespace Mono.Globalization.Unicode
 
 		private void Serialize ()
 		{
-			Console.WriteLine ("static readonly short [] collElem = new short [] {");
+			Console.WriteLine ("static readonly int [] collElem = new int [] {");
 			DumpArray (collElem, CollationElementTableUtil.Indexer.TotalCount, true);
 			Console.WriteLine ("};");
 			Console.WriteLine ("static readonly SortKeyValue [] keyValues = new SortKeyValue [] {");
 			for (int i = 0; i < keyCount; i++) {
 				SortKeyValue s = keyValues [i];
-				Console.WriteLine ("	new SortKeyValue ({0}, {1}, {2}, {3}, {4}),",
+				Console.WriteLine ("	new SortKeyValue ({0}, 0x{1:X04}, 0x{2:X04}, 0x{3:X04}, 0x{4:X04}),",
 					s.Alt ? "true" : "false", s.Primary, s.Secondary,
 					s.Thirtiary, s.Quarternary);
 			}
@@ -87,7 +87,7 @@ namespace Mono.Globalization.Unicode
 
 		private void Parse ()
 		{
-			int [] v = new int [4];
+			ushort [] v = new ushort [4];
 
 			TextReader reader = Console.In;
 			while (reader.Peek () != -1) {
@@ -111,20 +111,29 @@ namespace Mono.Globalization.Unicode
 				line = line.Substring (line.IndexOf (';') + 1).Trim ();
 				// count entries in a line
 				int entryPerLine = 0;
-				for (int e = 0; (e = line.IndexOf ('[', e + 1)) >= 0;)
+				for (int e = 0; (e = line.IndexOf ('[', e) + 1) > 0;)
 					entryPerLine++;
 
 				int start = 0;
 				for (int e = 0; e < entryPerLine; e++) {
 					start = line.IndexOf ('[', start) + 1;
-					string s = line.Substring (start, line.IndexOf (']', start) - start - 1);
+					string s = line.Substring (start, line.IndexOf (']', start) - start);
 
 					bool alt = false;
 					if (s [0] == '*')
 						alt = true;
 					string [] vslist = s.Substring (1).Split ('.');
-					for (int i = 0; i < 4; i++)
-						v [i] = int.Parse (vslist [i], NumberStyles.HexNumber);
+					bool skip = false;
+					for (int i = 0; i < 4; i++) {
+						if (vslist [i].Length > 4)
+							skip = true;
+						else
+							v [i] = ushort.Parse (vslist [i], NumberStyles.HexNumber);
+					}
+					if (skip) {
+//						Console.Error.WriteLine ("WARNING: skipped entry {0:X}", cp);
+						continue;
+					}
 					idx = keyCount;
 					if (entryPerLine == 1) {
 						// idx = 0 means "no matching entry", so here we start from 1
@@ -151,7 +160,7 @@ namespace Mono.Globalization.Unicode
 			reader.Close ();
 		}
 
-		private void AddEntry (bool alt, int [] v)
+		private void AddEntry (bool alt, ushort [] v)
 		{
 			if (keyCount == keyValues.Length) {
 				SortKeyValue [] tmp = new SortKeyValue [keyCount * 2];
@@ -160,7 +169,7 @@ namespace Mono.Globalization.Unicode
 			}
 			keyValues [keyCount] =
 				new SortKeyValue (alt,
-				v [0], v [1], v [2], v [3]);
+				v [0], (byte) v [1], (byte) v [2], v [3]);
 			keyCount++;
 		}
 	}
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
index 1a42cf15752..6c2ba5615c2 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
@@ -1450,7 +1450,7 @@ sw.Close ();
 			AddCharMap ('\u2423', 0x7, 1, 0); // open box
 			#endregion
 
-			// FIXME: 09 should be more complete.
+			// category 09 - continued symbols from 08
 			fillIndex [0x9] = 2;
 			// misc tech mark
 			for (int cp = 0x2300; cp <= 0x237A; cp++)
@@ -1649,6 +1649,9 @@ sw.Close ();
 				//   but inside a-to-z range.
 				// 3.there are some expanded characters that
 				//   are not part of Unicode Standard NFKD.
+				// 4. some characters are letters according to
+				//   IsLetter but not in sortkeys (maybe a Unicode
+				//   version difference caused it).
 				switch (i) {
 				// 1. skipping them does not make sense
 //				case 0xD0: case 0xF0: case 0x131: case 0x138:
@@ -1666,11 +1669,12 @@ sw.Close ();
 				case 0xFE: // Icelandic Thorn
 				case 0xDF: // German ss
 				case 0xFF: // German ss
+				// 4.
+				case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
 				// not classified yet
 //				case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
 //				case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
 //				case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
-//				case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
 //				case 0x1DD:
 					continue;
 				}
@@ -1848,8 +1852,8 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 			AddCharMap ('\u0BD7', 0x19, 0);
 			fillIndex [0x19] = 0xA;
 			// vowels
-			for (int i = 0x0BD7; i < 0x0B94; i++)
-				if (Char.IsLetter ((char) i))
+			for (int i = 0x0B82; i < 0x0B94; i++)
+				if (!IsIgnorable ((char) i))
 					AddCharMap ((char) i, 0x19, 2);
 			// special vowel
 			fillIndex [0x19] = 0x24;
@@ -2242,6 +2246,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 				switch (i) {
 				case 0xAB: // 08
 				case 0xB7: // 0A
+				case 0xBB: // 08
 				case 0x2329: // 09
 				case 0x232A: // 09
 					continue;
@@ -2272,14 +2277,6 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 
 			// FIXME: for 07 xx we need more love.
 
-			// FIXME: 08 should be more complete.
-			fillIndex [0x8] = 2;
-			for (int cp = 0; cp < char.MaxValue; cp++)
-				if (!map [cp].Defined &&
-					Char.GetUnicodeCategory ((char) cp) ==
-					UnicodeCategory.MathSymbol)
-					AddCharMapGroup ((char) cp, 0x8, 1, 0);
-
 			// Characters w/ diacritical marks (NFKD)
 			for (int i = 0; i <= char.MaxValue; i++) {
 				if (map [i].Defined || IsIgnorable (i))
@@ -2318,6 +2315,54 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 				
 			}
 
+			// category 08 - symbols
+			fillIndex [0x8] = 2;
+			// The Windows mapping here is not straightforward. It
+			// does not seem computed; it looks manually sorted.
+			AddCharMapGroup ('+', 0x8, 1, 0); // plus
+			AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
+			AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
+			AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
+			AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
+			AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
+			AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
+			AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
+			AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
+			AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
+			AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
+			AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
+			AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
+
+			for (int cp = 0; cp < 0x2300; cp++) {
+				if (cp == 0x200)
+					cp = 0x2200; // skip to 2200
+				if (cp == 0xAC) // SPECIAL CASE: skip
+					continue;
+				if (!map [cp].Defined &&
+//					Char.GetUnicodeCategory ((char) cp) ==
+//					UnicodeCategory.MathSymbol)
+					Char.IsSymbol ((char) cp))
+					AddCharMapGroup ((char) cp, 0x8, 1, 0);
+				// SPECIAL CASES: no idea why Windows sorts as such
+				switch (cp) {
+				case 0x3E:
+					AddCharMap ('\u227B', 0x8, 1, 0);
+					AddCharMap ('\u22B1', 0x8, 1, 0);
+					break;
+				case 0xB1:
+					AddCharMapGroup ('\u00AB', 0x8, 1, 0);
+					AddCharMapGroup ('\u226A', 0x8, 1, 0);
+					AddCharMapGroup ('\u00BB', 0x8, 1, 0);
+					AddCharMapGroup ('\u226B', 0x8, 1, 0);
+					break;
+				case 0xF7:
+					AddCharMap ('\u01C0', 0x8, 1, 0);
+					AddCharMap ('\u01C1', 0x8, 1, 0);
+					AddCharMap ('\u01C2', 0x8, 1, 0);
+					break;
+				}
+			}
+
 			#region Level2 adjustment
 			// Arabic Hamzah
 			diacritical [0x624] = 0x5;
@@ -2328,7 +2373,6 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 			diacritical [0x649] = 0x5; // 'alif maqs.uurah
 			diacritical [0x64A] = 0x7; // Yaa'
 
-
 			for (int i = 0; i < char.MaxValue; i++) {
 				byte mod = 0;
 				byte cat = map [i].Category;
@@ -2350,15 +2394,23 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 			}
 			#endregion
 
-			// FIXME: this is hack but those which are 
-			// NonSpacingMark characters and still undefined
-			// are likely to be nonspacing.
+			// FIXME: this is a hack, but NonSpacingMark
+			// characters that are still undefined are likely
+			// to be nonspacing.
 			for (int i = 0; i < char.MaxValue; i++)
 				if (!map [i].Defined &&
 					!IsIgnorable (i) &&
 					Char.GetUnicodeCategory ((char) i) ==
 					UnicodeCategory.NonSpacingMark)
 					AddCharMap ((char) i, 1, 1);
+
+			// FIXME: this is a hack, but still-undefined Symbol
+			// characters are likely to fall into the 0xA category.
+			for (int i = 0; i < char.MaxValue; i++)
+				if (!map [i].Defined &&
+					!IsIgnorable (i) &&
+					Char.IsSymbol ((char) i))
+					AddCharMap ((char) i, 0xA, 1);
 		}
 
 		private void IncrementSequentialIndex (ref byte hangulCat)
author	Atsushi Eno <atsushieno@gmail.com>	2005-06-28 11:23:06 +0400
committer	Atsushi Eno <atsushieno@gmail.com>	2005-06-28 11:23:06 +0400
commit	dd9256ba20952bebea867eef89ad855d6d7fb802 (patch)
tree	96dab187514b86df30a4983535e3706a7c22f838 /mcs/class/corlib/Mono.Globalization.Unicode
parent	8d1f1e6c468c236a911ccfaf5702b5149b33fe96 (diff)