From dd9256ba20952bebea867eef89ad855d6d7fb802 Mon Sep 17 00:00:00 2001 From: Atsushi Eno Date: Tue, 28 Jun 2005 07:23:06 +0000 Subject: 2005-06-28 Atsushi Enomoto * create-mscompat-collation-table.cs : Fixed 0x8 category characters. Added hack for need-to-be-fixed characters to fall into 0xA category. * create-collation-element-table.cs : previous checkin seem failed :( * README: updated a bit. svn path=/trunk/mcs/; revision=46624 --- .../corlib/Mono.Globalization.Unicode/ChangeLog | 7 ++ mcs/class/corlib/Mono.Globalization.Unicode/README | 17 +++-- .../create-collation-element-table.cs | 27 ++++--- .../create-mscompat-collation-table.cs | 84 +++++++++++++++++----- 4 files changed, 105 insertions(+), 30 deletions(-) (limited to 'mcs/class/corlib/Mono.Globalization.Unicode') diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog index dbaee2fe603..e78193f8409 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog +++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog @@ -1,3 +1,10 @@ +2005-06-28 Atsushi Enomoto + + * create-mscompat-collation-table.cs : Fixed 0x8 category characters. + Added hack for need-to-be-fixed characters to fall into 0xA category. + * create-collation-element-table.cs : previous checkin seem failed :( + * README: updated a bit. + 2005-06-24 Atsushi Enomoto * CodePointIndexer.cs : diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/README b/mcs/class/corlib/Mono.Globalization.Unicode/README index be5168547f0..8a00092f51b 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/README +++ b/mcs/class/corlib/Mono.Globalization.Unicode/README @@ -1,13 +1,20 @@ This directory contains support for Unicode normalization and collation. -This branch contains Normalization.cs and CollationElementTable.cs which -can be built from generator executables and Unicode Character Database found -at unicode.org (see Makefile for details). 
+This directory contains several files to be autogenerated: + + - MSCompatUnicodeTable.cs : support for CompareInfo. + - Normalization.cs : support for String.Normalize() + - CollationElementTable.cs : used in code generators which generate + above two sources. + +Targets in Makefile often download dependency files, such as Unicode +Character Database files. They are fed to source generators which are written +in C#. Many of the files in this directory are subject to change (for example, -being rewritten to generate C header like char-conversions.h and -culture-info-table.h). +being rewritten to generate resource files and possibly C header like +char-conversions.h and culture-info-table.h in the later stage). Atsushi Eno diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs index 3e8694c6836..4bb4134d506 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-collation-element-table.cs @@ -56,13 +56,13 @@ namespace Mono.Globalization.Unicode private void Serialize () { - Console.WriteLine ("static readonly short [] collElem = new short [] {"); + Console.WriteLine ("static readonly int [] collElem = new int [] {"); DumpArray (collElem, CollationElementTableUtil.Indexer.TotalCount, true); Console.WriteLine ("};"); Console.WriteLine ("static readonly SortKeyValue [] keyValues = new SortKeyValue [] {"); for (int i = 0; i < keyCount; i++) { SortKeyValue s = keyValues [i]; - Console.WriteLine (" new SortKeyValue ({0}, {1}, {2}, {3}, {4}),", + Console.WriteLine (" new SortKeyValue ({0}, 0x{1:X04}, 0x{2:X04}, 0x{3:X04}, 0x{4:X04}),", s.Alt ? 
"true" : "false", s.Primary, s.Secondary, s.Thirtiary, s.Quarternary); } @@ -87,7 +87,7 @@ namespace Mono.Globalization.Unicode private void Parse () { - int [] v = new int [4]; + ushort [] v = new ushort [4]; TextReader reader = Console.In; while (reader.Peek () != -1) { @@ -111,20 +111,29 @@ namespace Mono.Globalization.Unicode line = line.Substring (line.IndexOf (';') + 1).Trim (); // count entries in a line int entryPerLine = 0; - for (int e = 0; (e = line.IndexOf ('[', e + 1)) >= 0;) + for (int e = 0; (e = line.IndexOf ('[', e) + 1) > 0;) entryPerLine++; int start = 0; for (int e = 0; e < entryPerLine; e++) { start = line.IndexOf ('[', start) + 1; - string s = line.Substring (start, line.IndexOf (']', start) - start - 1); + string s = line.Substring (start, line.IndexOf (']', start) - start); bool alt = false; if (s [0] == '*') alt = true; string [] vslist = s.Substring (1).Split ('.'); - for (int i = 0; i < 4; i++) - v [i] = int.Parse (vslist [i], NumberStyles.HexNumber); + bool skip = false; + for (int i = 0; i < 4; i++) { + if (vslist [i].Length > 4) + skip = true; + else + v [i] = ushort.Parse (vslist [i], NumberStyles.HexNumber); + } + if (skip) { +// Console.Error.WriteLine ("WARNING: skipped entry {0:X}", cp); + continue; + } idx = keyCount; if (entryPerLine == 1) { // idx = 0 means "no matching entry", so here we start from 1 @@ -151,7 +160,7 @@ namespace Mono.Globalization.Unicode reader.Close (); } - private void AddEntry (bool alt, int [] v) + private void AddEntry (bool alt, ushort [] v) { if (keyCount == keyValues.Length) { SortKeyValue [] tmp = new SortKeyValue [keyCount * 2]; @@ -160,7 +169,7 @@ namespace Mono.Globalization.Unicode } keyValues [keyCount] = new SortKeyValue (alt, - v [0], v [1], v [2], v [3]); + v [0], (byte) v [1], (byte) v [2], v [3]); keyCount++; } } diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs index 
1a42cf15752..6c2ba5615c2 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs @@ -1450,7 +1450,7 @@ sw.Close (); AddCharMap ('\u2423', 0x7, 1, 0); // open box #endregion - // FIXME: 09 should be more complete. + // category 09 - continued symbols from 08 fillIndex [0x9] = 2; // misc tech mark for (int cp = 0x2300; cp <= 0x237A; cp++) @@ -1649,6 +1649,9 @@ sw.Close (); // but inside a-to-z range. // 3.there are some expanded characters that // are not part of Unicode Standard NFKD. + // 4. some characters are letter in IsLetter + // but not in sortkeys (maybe unicode version + // difference caused it). switch (i) { // 1. skipping them does not make sense // case 0xD0: case 0xF0: case 0x131: case 0x138: @@ -1666,11 +1669,12 @@ sw.Close (); case 0xFE: // Icelandic Thorn case 0xDF: // German ss case 0xFF: // German ss + // 4. + case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // not classified yet // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9: // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8: // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF: -// case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // case 0x1DD: continue; } @@ -1848,8 +1852,8 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); AddCharMap ('\u0BD7', 0x19, 0); fillIndex [0x19] = 0xA; // vowels - for (int i = 0x0BD7; i < 0x0B94; i++) - if (Char.IsLetter ((char) i)) + for (int i = 0x0B82; i < 0x0B94; i++) + if (!IsIgnorable ((char) i)) AddCharMap ((char) i, 0x19, 2); // special vowel fillIndex [0x19] = 0x24; @@ -2242,6 +2246,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); switch (i) { case 0xAB: // 08 case 0xB7: // 0A + case 0xBB: // 08 case 0x2329: // 09 case 0x232A: // 09 continue; @@ -2272,14 +2277,6 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // FIXME: for 07 xx we need more love. 
- // FIXME: 08 should be more complete. - fillIndex [0x8] = 2; - for (int cp = 0; cp < char.MaxValue; cp++) - if (!map [cp].Defined && - Char.GetUnicodeCategory ((char) cp) == - UnicodeCategory.MathSymbol) - AddCharMapGroup ((char) cp, 0x8, 1, 0); - // Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) @@ -2318,6 +2315,54 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); } + // category 08 - symbols + fillIndex [0x8] = 2; + // Here Windows mapping is not straightforward. It is + // not based on computation but seems manual sorting. + AddCharMapGroup ('+', 0x8, 1, 0); // plus + AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus + AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus + AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul + AddCharMapGroup ('\u2044', 0x8, 1, 0); // div + AddCharMapGroup ('\u2215', 0x8, 1, 0); // div + AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul + AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring + AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet + AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus + AddCharMapGroup ('\u003C', 0x8, 1, 0); // < + AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation + AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation + + for (int cp = 0; cp < 0x2300; cp++) { + if (cp == 0x200) + cp = 0x2200; // skip to 2200 + if (cp == 0xAC) // SPECIAL CASE: skip + continue; + if (!map [cp].Defined && +// Char.GetUnicodeCategory ((char) cp) == +// UnicodeCategory.MathSymbol) + Char.IsSymbol ((char) cp)) + AddCharMapGroup ((char) cp, 0x8, 1, 0); + // SPECIAL CASES: no idea why Windows sorts as such + switch (cp) { + case 0x3E: + AddCharMap ('\u227B', 0x8, 1, 0); + AddCharMap ('\u22B1', 0x8, 1, 0); + break; + case 0xB1: + AddCharMapGroup ('\u00AB', 0x8, 1, 0); + AddCharMapGroup ('\u226A', 0x8, 1, 0); + AddCharMapGroup ('\u00BB', 0x8, 1, 0); + AddCharMapGroup ('\u226B', 0x8, 1, 0); + break; + case 0xF7: + AddCharMap 
('\u01C0', 0x8, 1, 0); + AddCharMap ('\u01C1', 0x8, 1, 0); + AddCharMap ('\u01C2', 0x8, 1, 0); + break; + } + } + #region Level2 adjustment // Arabic Hamzah diacritical [0x624] = 0x5; @@ -2328,7 +2373,6 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); diacritical [0x649] = 0x5; // 'alif maqs.uurah diacritical [0x64A] = 0x7; // Yaa' - for (int i = 0; i < char.MaxValue; i++) { byte mod = 0; byte cat = map [i].Category; @@ -2350,15 +2394,23 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); } #endregion - // FIXME: this is hack but those which are - // NonSpacingMark characters and still undefined - // are likely to be nonspacing. + // FIXME: this is hack but those NonSpacingMark + // characters and still undefined are likely to + // be nonspacing. for (int i = 0; i < char.MaxValue; i++) if (!map [i].Defined && !IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); + + // FIXME: this is hack but those Symbol characters + // are likely to fall into 0xA category. + for (int i = 0; i < char.MaxValue; i++) + if (!map [i].Defined && + !IsIgnorable (i) && + Char.IsSymbol ((char) i)) + AddCharMap ((char) i, 0xA, 1); } private void IncrementSequentialIndex (ref byte hangulCat) -- cgit v1.2.3