Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/mono.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Miguel de Icaza <miguel@gnome.org>, 2010-04-23 01:25:04 +0400
committer: Miguel de Icaza <miguel@gnome.org>, 2010-04-23 01:25:04 +0400
commit: fe997730cc77ddfc7f16ce36df2bf30173a1ba7f (patch)
tree: b1b81e4192b31c2b5309638f28e5856ee8f532f3 /mcs/class/corlib/Mono.Globalization.Unicode
parent: 08d5d47c72761a08d8b8c079bc15ef85df2084f2 (diff)
Second patch from: Damien Diederen <dd@crosstwine.com>
For bug: https://bugzilla.novell.com/show_bug.cgi?id=480152

* Normalization.cs: Follow the spec when checking composition pairs.

Figure 7 in section 1.3 of http://unicode.org/reports/tr15/ shows
how when doing composition, one has to examine the successive
(starter, candidate) pairs, and combine if a matching canonical
decomposition exists.

The original algorithm was, instead, iterating on canonical
decompositions, and, for each one, trying to match a sequence
of (starter, non-starter, ...). This, however, does not produce
the same results as it is violating some implicit ordering
constraints in the Unicode tables.

E.g., when composing the following sequence of codepoints, the
original algorithm was picking:

    03B7 0313 0300 0345
    ^^^^      ^^^^
    1F74 0313 0345
    ^^^^      ^^^^
    1FC2 0313

and would stop at 1FC2 0313 as there is no decomposition matching
it. The new algorithm, which follows the guidance of the pretty
figure 7, ends up doing:

    03B7 0313 0300 0345
    ^^^^ ^^^^
    1F20 0300 0345
    ^^^^ ^^^^
    1F22 0345
    ^^^^ ^^^^
    1F92

resulting in the correct 1F92.

svn path=/trunk/mcs/; revision=155963
Diffstat (limited to 'mcs/class/corlib/Mono.Globalization.Unicode')
-rw-r--r-- mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog | 38
-rw-r--r-- mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs | 212
2 files changed, 127 insertions, 123 deletions
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
index df42abb38cb..84d1258842d 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,41 @@
+2010-04-20 Damien Diederen <dd@crosstwine.com>
+
+ * Normalization.cs: Follow the spec when checking composition pairs.
+
+ Figure 7 in section 1.3 of http://unicode.org/reports/tr15/ shows
+ how when doing composition, one has to examine the successive
+ (starter, candidate) pairs, and combine if a matching canonical
+ decomposition exists.
+
+ The original algorithm was, instead, iterating on canonical
+ decompositions, and, for each one, trying to match a sequence
+ of (starter, non-starter, ...). This, however, does not produce
+ the same results as it is violating some implicit ordering
+ constraints in the Unicode tables.
+
+ E.g., when composing the following sequence of codepoints, the
+ original algorithm was picking:
+
+ 03B7 0313 0300 0345
+ ^^^^      ^^^^
+ 1F74 0313 0345
+ ^^^^      ^^^^
+ 1FC2 0313
+
+ and would stop at 1FC2 0313 as there is no decomposition matching
+ it. The new algorithm, which follows the guidance of the pretty
+ figure 7, ends up doing:
+
+ 03B7 0313 0300 0345
+ ^^^^ ^^^^
+ 1F20 0300 0345
+ ^^^^ ^^^^
+ 1F22 0345
+ ^^^^ ^^^^
+ 1F92
+
+ resulting in the correct 1F92.
+
2010-04-19 Damien Diederen <dd@crosstwine.com>
* Normalization.cs: Recursively apply the Unicode decomposition mapping.
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs b/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
index 74d25334a96..01cf944980d 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
@@ -38,15 +38,6 @@ namespace Mono.Globalization.Unicode
return charMapIndex [NUtil.MapIdx (cp)];
}
- static int GetNormalizedStringLength (int ch)
- {
- int start = charMapIndex [NUtil.MapIdx (ch)];
- int i = start;
- while (mappedChars [i] != 0)
- i++;
- return i - start;
- }
-
static byte GetCombiningClass (int c)
{
return combiningClass [NUtil.Combining.ToIndex (c)];
@@ -62,66 +53,6 @@ namespace Mono.Globalization.Unicode
return helperIndex [NUtil.Helper.ToIndex (cp)];
}
- static int GetPrimaryCompositeCharIndex (object chars, int start)
- {
- string s = chars as string;
- StringBuilder sb = chars as StringBuilder;
- char startCh = s != null ? s [start] : sb [start];
- int charsLength = sb != null ? sb.Length : s.Length;
-
- int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
- if (idx == 0)
- return 0;
- while (mappedChars [idx] == startCh) {
- int prevCB = 0;
- int combiningClass = 0;
- for (int i = 1, j = 1; ; i++, j++) {
- prevCB = combiningClass;
-
- if (mappedChars [idx + i] == 0)
- // matched
- return idx;
- if (start + i >= charsLength)
- return 0; // didn't match
-
- // handle blocked characters here.
- char curCh;
- bool match = false;
- do {
- curCh = s != null ?
- s [start + j] :
- sb [start + j];
- combiningClass = GetCombiningClass (curCh);
- if (mappedChars [idx + i] == curCh) {
- match = true;
- break;
- }
- if (combiningClass < prevCB) // blocked. Give up this map entry.
- break;
- if (++j + start >= charsLength || combiningClass == 0)
- break;
- } while (true);
-
- if (match)
- continue; // check next character in the current map entry string.
- if (prevCB < combiningClass) {
- j--;
- if (mappedChars [idx + i] == curCh)
- continue;
- //if (mappedChars [idx + i] > curCh)
- // return 0; // no match
- }
- // otherwise move idx to next item
- while (mappedChars [i] != 0)
- i++;
- idx += i + 1;
- break;
- }
- }
- // reached to end of entries
- return 0;
- }
-
private static string Compose (string source, int checkType)
{
StringBuilder sb = null;
@@ -155,57 +86,98 @@ namespace Mono.Globalization.Unicode
return (PropValue (i) & IsUnsafe) != 0;
}
*/
- private static void Combine (StringBuilder sb, int start, int checkType)
+ private static void Combine (StringBuilder sb, int i, int checkType)
{
- for (int i = start; i < sb.Length; i++) {
- if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes)
+ while (i < sb.Length) {
+ if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
+ i++;
continue;
-
- int cur = i;
- // FIXME: It should check "blocked" too
- for (;i > 0; i--) // this loop does not check sb[0], but regardless of the condition below it should not go under 0.
- if (GetCombiningClass ((int) sb [i]) == 0)
- break;
-
- int idx = 0; // index to mappedChars
- for (; i < cur; i++) {
- idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
- if (idx > 0)
- break;
}
- if (idx == 0) {
- i = cur;
+
+ i = TryComposeWithPreviousStarter (sb, null, i);
+ }
+ }
+
+ static int Fetch (StringBuilder sb, string s, int i)
+ {
+ return (int) (sb != null ? sb [i] : s [i]);
+ }
+
+ // Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
+ static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
+ {
+ // Backtrack to previous starter.
+ int i = current - 1;
+ if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
+ if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
+ return current + 1;
+ } else {
+ while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
+ i--;
+ if (i < 0)
+ return current + 1;
+ }
+
+ int starter = Fetch (sb, s, i);
+
+ // The various decompositions involving starter follow this index.
+ int comp_idx = GetPrimaryCompositeHelperIndex (starter);
+ if (comp_idx == 0)
+ return current + 1;
+
+ int length = (sb != null ? sb.Length : s.Length);
+ int prevCombiningClass = -1;
+ for (int j = i + 1; j < length; j++) {
+ int candidate = Fetch (sb, s, j);
+
+ int combiningClass = GetCombiningClass (candidate);
+ if (combiningClass == prevCombiningClass)
+ // We skipped over a guy with the same class, without
+ // combining. Skip this one, too.
continue;
- }
- int prim = GetPrimaryCompositeFromMapIndex (idx);
- int len = GetNormalizedStringLength (prim);
- if (prim == 0 || len == 0)
- throw new SystemException ("Internal error: should not happen. Input: " + sb);
- int removed = 0;
- sb.Insert (i++, (char) prim); // always single character
-
- // handle blocked characters here.
- while (removed < len) {
- if (sb [i] == mappedChars [idx + removed]) {
- sb.Remove (i, 1);
- removed++;
- // otherwise, skip it.
- }
- else
- i++;
+ int composed = TryCompose (comp_idx, starter, candidate);
+ if (composed != 0) {
+ if (sb == null)
+ // Not normalized, and we are only checking.
+ return -1;
+
+ // Full Unicode warning: This will break when the underlying
+ // tables are extended.
+ sb [i] = (char) composed;
+ sb.Remove (j, 1);
+
+ return current;
}
- i = cur - 1;
+
+ // Gray box. We're done.
+ if (combiningClass == 0)
+ return j + 1;
+
+ prevCombiningClass = combiningClass;
}
+
+ return length;
}
- static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
+ static int TryCompose (int i, int starter, int candidate)
{
- if ((PropValue (cur) & FullCompositionExclusion) != 0)
- return 0;
- if (GetCombiningClass (cur) != 0)
- return 0; // not a starter
- return GetPrimaryCompositeCharIndex (o, bufferPos);
+ while (mappedChars [i] == starter) {
+ if (mappedChars [i + 1] == candidate &&
+ mappedChars [i + 2] == 0) {
+ int composed = GetPrimaryCompositeFromMapIndex (i);
+
+ if ((PropValue (composed) & FullCompositionExclusion) == 0)
+ return composed;
+ }
+
+ // Skip this entry.
+ while (mappedChars [i] != 0)
+ i++;
+ i++;
+ }
+
+ return 0;
}
static string Decompose (string source, int checkType)
@@ -394,13 +366,15 @@ namespace Mono.Globalization.Unicode
public static bool IsNormalized (string source, int type)
{
int prevCC = -1;
- for (int i = 0; i < source.Length; i++) {
+ for (int i = 0; i < source.Length; ) {
int cc = GetCombiningClass (source [i]);
if (cc != 0 && cc < prevCC)
return false;
prevCC = cc;
+
switch (QuickCheck (source [i], type)) {
case NormalizationCheck.Yes:
+ i++;
break;
case NormalizationCheck.No:
return false;
@@ -412,18 +386,10 @@ namespace Mono.Globalization.Unicode
return source == Normalize (source, type);
}
// go on...
-
- // partly copied from Combine()
- int cur = i;
- for (;i > 0; i--) // this loop does not check sb[0], but regardless of the condition below it should not go under 0.
- if (GetCombiningClass ((int) source [i]) == 0)
- break;
- //i++;
- // Now i is the "starter"
- for (; i < cur; i++) {
- if (GetPrimaryCompositeCharIndex (source, i) != 0)
- return false;
- }
+
+ i = TryComposeWithPreviousStarter (null, source, i);
+ if (i < 0)
+ return false;
break;
}
}