diff options
Diffstat (limited to 'src/core/Analysis')
49 files changed, 11040 insertions, 0 deletions
diff --git a/src/core/Analysis/ASCIIFoldingFilter.cs b/src/core/Analysis/ASCIIFoldingFilter.cs new file mode 100644 index 0000000..6133870 --- /dev/null +++ b/src/core/Analysis/ASCIIFoldingFilter.cs @@ -0,0 +1,3285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using Lucene.Net.Analysis.Tokenattributes; +using ArrayUtil = Lucene.Net.Util.ArrayUtil; + +namespace Lucene.Net.Analysis +{ + + /// <summary> This class converts alphabetic, numeric, and symbolic Unicode characters + /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode + /// block) into their ASCII equivalents, if one exists. 
+ /// + /// Characters from the following Unicode blocks are converted; however, only + /// those characters with reasonable ASCII alternatives are converted: + /// + /// <list type="bullet"> + /// <item>C1 Controls and Latin-1 Supplement: <a href="http://www.unicode.org/charts/PDF/U0080.pdf">http://www.unicode.org/charts/PDF/U0080.pdf</a></item> + /// <item>Latin Extended-A: <a href="http://www.unicode.org/charts/PDF/U0100.pdf">http://www.unicode.org/charts/PDF/U0100.pdf</a></item> + /// <item>Latin Extended-B: <a href="http://www.unicode.org/charts/PDF/U0180.pdf">http://www.unicode.org/charts/PDF/U0180.pdf</a></item> + /// <item>Latin Extended Additional: <a href="http://www.unicode.org/charts/PDF/U1E00.pdf">http://www.unicode.org/charts/PDF/U1E00.pdf</a></item> + /// <item>Latin Extended-C: <a href="http://www.unicode.org/charts/PDF/U2C60.pdf">http://www.unicode.org/charts/PDF/U2C60.pdf</a></item> + /// <item>Latin Extended-D: <a href="http://www.unicode.org/charts/PDF/UA720.pdf">http://www.unicode.org/charts/PDF/UA720.pdf</a></item> + /// <item>IPA Extensions: <a href="http://www.unicode.org/charts/PDF/U0250.pdf">http://www.unicode.org/charts/PDF/U0250.pdf</a></item> + /// <item>Phonetic Extensions: <a href="http://www.unicode.org/charts/PDF/U1D00.pdf">http://www.unicode.org/charts/PDF/U1D00.pdf</a></item> + /// <item>Phonetic Extensions Supplement: <a href="http://www.unicode.org/charts/PDF/U1D80.pdf">http://www.unicode.org/charts/PDF/U1D80.pdf</a></item> + /// <item>General Punctuation: <a href="http://www.unicode.org/charts/PDF/U2000.pdf">http://www.unicode.org/charts/PDF/U2000.pdf</a></item> + /// <item>Superscripts and Subscripts: <a href="http://www.unicode.org/charts/PDF/U2070.pdf">http://www.unicode.org/charts/PDF/U2070.pdf</a></item> + /// <item>Enclosed Alphanumerics: <a href="http://www.unicode.org/charts/PDF/U2460.pdf">http://www.unicode.org/charts/PDF/U2460.pdf</a></item> + /// <item>Dingbats: <a 
href="http://www.unicode.org/charts/PDF/U2700.pdf">http://www.unicode.org/charts/PDF/U2700.pdf</a></item> + /// <item>Supplemental Punctuation: <a href="http://www.unicode.org/charts/PDF/U2E00.pdf">http://www.unicode.org/charts/PDF/U2E00.pdf</a></item> + /// <item>Alphabetic Presentation Forms: <a href="http://www.unicode.org/charts/PDF/UFB00.pdf">http://www.unicode.org/charts/PDF/UFB00.pdf</a></item> + /// <item>Halfwidth and Fullwidth Forms: <a href="http://www.unicode.org/charts/PDF/UFF00.pdf">http://www.unicode.org/charts/PDF/UFF00.pdf</a></item> + /// </list> + /// + /// See: <a href="http://en.wikipedia.org/wiki/Latin_characters_in_Unicode">http://en.wikipedia.org/wiki/Latin_characters_in_Unicode</a> + /// + /// The set of character conversions supported by this class is a superset of + /// those supported by Lucene's <see cref="ISOLatin1AccentFilter" /> which strips + /// accents from Latin1 characters. For example, 'À' will be replaced by + /// 'a'. + /// </summary> + public sealed class ASCIIFoldingFilter : TokenFilter + { + public ASCIIFoldingFilter(TokenStream input):base(input) + { + termAtt = AddAttribute<ITermAttribute>(); + } + + private char[] output = new char[512]; + private int outputPos; + private ITermAttribute termAtt; + + public override bool IncrementToken() + { + if (input.IncrementToken()) + { + char[] buffer = termAtt.TermBuffer(); + int length = termAtt.TermLength(); + + // If no characters actually require rewriting then we + // just return token as-is: + for (int i = 0; i < length; ++i) + { + char c = buffer[i]; + if (c >= '\u0080') + { + FoldToASCII(buffer, length); + termAtt.SetTermBuffer(output, 0, outputPos); + break; + } + } + return true; + } + else + { + return false; + } + } + + /// <summary> Converts characters above ASCII to their ASCII equivalents. For example, + /// accents are removed from accented characters. 
+ /// </summary> + /// <param name="input">The string to fold + /// </param> + /// <param name="length">The number of characters in the input string + /// </param> + public void FoldToASCII(char[] input, int length) + { + // Worst-case length required: + int maxSizeNeeded = 4 * length; + if (output.Length < maxSizeNeeded) + { + output = new char[ArrayUtil.GetNextSize(maxSizeNeeded)]; + } + + outputPos = 0; + + for (int pos = 0; pos < length; ++pos) + { + char c = input[pos]; + + // Quick test: if it's not in range then just keep current character + if (c < '\u0080') + { + output[outputPos++] = c; + } + else + { + switch (c) + { + + case '\u00C0': + // À [LATIN CAPITAL LETTER A WITH GRAVE] + case '\u00C1': + // � [LATIN CAPITAL LETTER A WITH ACUTE] + case '\u00C2': + //  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX] + case '\u00C3': + // à [LATIN CAPITAL LETTER A WITH TILDE] + case '\u00C4': + // Ä [LATIN CAPITAL LETTER A WITH DIAERESIS] + case '\u00C5': + // Ã… [LATIN CAPITAL LETTER A WITH RING ABOVE] + case '\u0100': + // Ä€ [LATIN CAPITAL LETTER A WITH MACRON] + case '\u0102': + // Ä‚ [LATIN CAPITAL LETTER A WITH BREVE] + case '\u0104': + // Ä„ [LATIN CAPITAL LETTER A WITH OGONEK] + case '\u018F': + // � http://en.wikipedia.org/wiki/Schwa [LATIN CAPITAL LETTER SCHWA] + case '\u01CD': + // � [LATIN CAPITAL LETTER A WITH CARON] + case '\u01DE': + // Çž [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON] + case '\u01E0': + // Ç [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON] + case '\u01FA': + // Ǻ [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE] + case '\u0200': + // È€ [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE] + case '\u0202': + // È‚ [LATIN CAPITAL LETTER A WITH INVERTED BREVE] + case '\u0226': + // Ȧ [LATIN CAPITAL LETTER A WITH DOT ABOVE] + case '\u023A': + // Ⱥ [LATIN CAPITAL LETTER A WITH STROKE] + case '\u1D00': + // á´€ [LATIN LETTER SMALL CAPITAL A] + case '\u1E00': + // Ḁ [LATIN CAPITAL LETTER A WITH RING BELOW] + case '\u1EA0': + // Ạ[LATIN CAPITAL 
LETTER A WITH DOT BELOW] + case '\u1EA2': + // Ả [LATIN CAPITAL LETTER A WITH HOOK ABOVE] + case '\u1EA4': + // Ấ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE] + case '\u1EA6': + // Ầ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE] + case '\u1EA8': + // Ẩ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE] + case '\u1EAA': + // Ẫ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE] + case '\u1EAC': + // Ậ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW] + case '\u1EAE': + // Ắ [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE] + case '\u1EB0': + // Ằ [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE] + case '\u1EB2': + // Ẳ [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE] + case '\u1EB4': + // Ẵ [LATIN CAPITAL LETTER A WITH BREVE AND TILDE] + case '\u1EB6': + // Ặ [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW] + case '\u24B6': + // â’¶ [CIRCLED LATIN CAPITAL LETTER A] + case '\uFF21': // A [FULLWIDTH LATIN CAPITAL LETTER A] + output[outputPos++] = 'A'; + break; + + case '\u00E0': + // à[LATIN SMALL LETTER A WITH GRAVE] + case '\u00E1': + // á [LATIN SMALL LETTER A WITH ACUTE] + case '\u00E2': + // â [LATIN SMALL LETTER A WITH CIRCUMFLEX] + case '\u00E3': + // ã [LATIN SMALL LETTER A WITH TILDE] + case '\u00E4': + // ä [LATIN SMALL LETTER A WITH DIAERESIS] + case '\u00E5': + // Ã¥ [LATIN SMALL LETTER A WITH RING ABOVE] + case '\u0101': + // � [LATIN SMALL LETTER A WITH MACRON] + case '\u0103': + // ă [LATIN SMALL LETTER A WITH BREVE] + case '\u0105': + // Ä… [LATIN SMALL LETTER A WITH OGONEK] + case '\u01CE': + // ÇŽ [LATIN SMALL LETTER A WITH CARON] + case '\u01DF': + // ÇŸ [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON] + case '\u01E1': + // Ç¡ [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON] + case '\u01FB': + // Ç» [LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE] + case '\u0201': + // � [LATIN SMALL LETTER A WITH DOUBLE GRAVE] + case '\u0203': + // ȃ [LATIN SMALL LETTER A WITH INVERTED BREVE] + case '\u0227': + // ȧ [LATIN SMALL LETTER A WITH DOT 
ABOVE] + case '\u0250': + // � [LATIN SMALL LETTER TURNED A] + case '\u0259': + // É™ [LATIN SMALL LETTER SCHWA] + case '\u025A': + // Éš [LATIN SMALL LETTER SCHWA WITH HOOK] + case '\u1D8F': + // � [LATIN SMALL LETTER A WITH RETROFLEX HOOK] + case '\u1D95': + // ᶕ [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK] + case '\u1E01': + // ạ [LATIN SMALL LETTER A WITH RING BELOW] + case '\u1E9A': + // ả [LATIN SMALL LETTER A WITH RIGHT HALF RING] + case '\u1EA1': + // ạ [LATIN SMALL LETTER A WITH DOT BELOW] + case '\u1EA3': + // ả [LATIN SMALL LETTER A WITH HOOK ABOVE] + case '\u1EA5': + // ấ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE] + case '\u1EA7': + // ầ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE] + case '\u1EA9': + // ẩ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE] + case '\u1EAB': + // ẫ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE] + case '\u1EAD': + // Ạ[LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW] + case '\u1EAF': + // ắ [LATIN SMALL LETTER A WITH BREVE AND ACUTE] + case '\u1EB1': + // ằ [LATIN SMALL LETTER A WITH BREVE AND GRAVE] + case '\u1EB3': + // ẳ [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE] + case '\u1EB5': + // ẵ [LATIN SMALL LETTER A WITH BREVE AND TILDE] + case '\u1EB7': + // ặ [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW] + case '\u2090': + // � [LATIN SUBSCRIPT SMALL LETTER A] + case '\u2094': + // �? 
[LATIN SUBSCRIPT SMALL LETTER SCHWA] + case '\u24D0': + // � [CIRCLED LATIN SMALL LETTER A] + case '\u2C65': + // â±¥ [LATIN SMALL LETTER A WITH STROKE] + case '\u2C6F': + // Ɐ [LATIN CAPITAL LETTER TURNED A] + case '\uFF41': // � [FULLWIDTH LATIN SMALL LETTER A] + output[outputPos++] = 'a'; + break; + + case '\uA732': // Ꜳ [LATIN CAPITAL LETTER AA] + output[outputPos++] = 'A'; + output[outputPos++] = 'A'; + break; + + case '\u00C6': + // Æ[LATIN CAPITAL LETTER AE] + case '\u01E2': + // Ç¢ [LATIN CAPITAL LETTER AE WITH MACRON] + case '\u01FC': + // Ǽ [LATIN CAPITAL LETTER AE WITH ACUTE] + case '\u1D01': // á´� [LATIN LETTER SMALL CAPITAL AE] + output[outputPos++] = 'A'; + output[outputPos++] = 'E'; + break; + + case '\uA734': // Ꜵ [LATIN CAPITAL LETTER AO] + output[outputPos++] = 'A'; + output[outputPos++] = 'O'; + break; + + case '\uA736': // Ꜷ [LATIN CAPITAL LETTER AU] + output[outputPos++] = 'A'; + output[outputPos++] = 'U'; + break; + + case '\uA738': + // Ꜹ [LATIN CAPITAL LETTER AV] + case '\uA73A': // Ꜻ [LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR] + output[outputPos++] = 'A'; + output[outputPos++] = 'V'; + break; + + case '\uA73C': // Ꜽ [LATIN CAPITAL LETTER AY] + output[outputPos++] = 'A'; + output[outputPos++] = 'Y'; + break; + + case '\u249C': // â’œ [PARENTHESIZED LATIN SMALL LETTER A] + output[outputPos++] = '('; + output[outputPos++] = 'a'; + output[outputPos++] = ')'; + break; + + case '\uA733': // ꜳ [LATIN SMALL LETTER AA] + output[outputPos++] = 'a'; + output[outputPos++] = 'a'; + break; + + case '\u00E6': + // æ [LATIN SMALL LETTER AE] + case '\u01E3': + // Ç£ [LATIN SMALL LETTER AE WITH MACRON] + case '\u01FD': + // ǽ [LATIN SMALL LETTER AE WITH ACUTE] + case '\u1D02': // á´‚ [LATIN SMALL LETTER TURNED AE] + output[outputPos++] = 'a'; + output[outputPos++] = 'e'; + break; + + case '\uA735': // ꜵ [LATIN SMALL LETTER AO] + output[outputPos++] = 'a'; + output[outputPos++] = 'o'; + break; + + case '\uA737': // ꜷ [LATIN SMALL LETTER AU] + 
output[outputPos++] = 'a'; + output[outputPos++] = 'u'; + break; + + case '\uA739': + // ꜹ [LATIN SMALL LETTER AV] + case '\uA73B': // ꜻ [LATIN SMALL LETTER AV WITH HORIZONTAL BAR] + output[outputPos++] = 'a'; + output[outputPos++] = 'v'; + break; + + case '\uA73D': // ꜽ [LATIN SMALL LETTER AY] + output[outputPos++] = 'a'; + output[outputPos++] = 'y'; + break; + + case '\u0181': + // � [LATIN CAPITAL LETTER B WITH HOOK] + case '\u0182': + // Æ‚ [LATIN CAPITAL LETTER B WITH TOPBAR] + case '\u0243': + // Ƀ [LATIN CAPITAL LETTER B WITH STROKE] + case '\u0299': + // Ê™ [LATIN LETTER SMALL CAPITAL B] + case '\u1D03': + // á´ƒ [LATIN LETTER SMALL CAPITAL BARRED B] + case '\u1E02': + // Ḃ [LATIN CAPITAL LETTER B WITH DOT ABOVE] + case '\u1E04': + // Ḅ [LATIN CAPITAL LETTER B WITH DOT BELOW] + case '\u1E06': + // Ḇ[LATIN CAPITAL LETTER B WITH LINE BELOW] + case '\u24B7': + // â’· [CIRCLED LATIN CAPITAL LETTER B] + case '\uFF22': // ï¼¢ [FULLWIDTH LATIN CAPITAL LETTER B] + output[outputPos++] = 'B'; + break; + + case '\u0180': + // Æ€ [LATIN SMALL LETTER B WITH STROKE] + case '\u0183': + // ƃ [LATIN SMALL LETTER B WITH TOPBAR] + case '\u0253': + // É“ [LATIN SMALL LETTER B WITH HOOK] + case '\u1D6C': + // ᵬ [LATIN SMALL LETTER B WITH MIDDLE TILDE] + case '\u1D80': + // ᶀ [LATIN SMALL LETTER B WITH PALATAL HOOK] + case '\u1E03': + // ḃ [LATIN SMALL LETTER B WITH DOT ABOVE] + case '\u1E05': + // ḅ [LATIN SMALL LETTER B WITH DOT BELOW] + case '\u1E07': + // ḇ [LATIN SMALL LETTER B WITH LINE BELOW] + case '\u24D1': + // â“‘ [CIRCLED LATIN SMALL LETTER B] + case '\uFF42': // b [FULLWIDTH LATIN SMALL LETTER B] + output[outputPos++] = 'b'; + break; + + case '\u249D': // â’� [PARENTHESIZED LATIN SMALL LETTER B] + output[outputPos++] = '('; + output[outputPos++] = 'b'; + output[outputPos++] = ')'; + break; + + case '\u00C7': + // Ç [LATIN CAPITAL LETTER C WITH CEDILLA] + case '\u0106': + // Ć[LATIN CAPITAL LETTER C WITH ACUTE] + case '\u0108': + // Ĉ [LATIN CAPITAL LETTER C WITH 
CIRCUMFLEX] + case '\u010A': + // ÄŠ[LATIN CAPITAL LETTER C WITH DOT ABOVE] + case '\u010C': + // ÄŒ [LATIN CAPITAL LETTER C WITH CARON] + case '\u0187': + // Ƈ [LATIN CAPITAL LETTER C WITH HOOK] + case '\u023B': + // È» [LATIN CAPITAL LETTER C WITH STROKE] + case '\u0297': + // Ê— [LATIN LETTER STRETCHED C] + case '\u1D04': + // á´„ [LATIN LETTER SMALL CAPITAL C] + case '\u1E08': + // Ḉ [LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE] + case '\u24B8': + // â’¸ [CIRCLED LATIN CAPITAL LETTER C] + case '\uFF23': // ï¼£ [FULLWIDTH LATIN CAPITAL LETTER C] + output[outputPos++] = 'C'; + break; + + case '\u00E7': + // ç [LATIN SMALL LETTER C WITH CEDILLA] + case '\u0107': + // ć [LATIN SMALL LETTER C WITH ACUTE] + case '\u0109': + // ĉ [LATIN SMALL LETTER C WITH CIRCUMFLEX] + case '\u010B': + // Ä‹ [LATIN SMALL LETTER C WITH DOT ABOVE] + case '\u010D': + // � [LATIN SMALL LETTER C WITH CARON] + case '\u0188': + // ƈ [LATIN SMALL LETTER C WITH HOOK] + case '\u023C': + // ȼ [LATIN SMALL LETTER C WITH STROKE] + case '\u0255': + // É• [LATIN SMALL LETTER C WITH CURL] + case '\u1E09': + // ḉ [LATIN SMALL LETTER C WITH CEDILLA AND ACUTE] + case '\u2184': + // ↄ [LATIN SMALL LETTER REVERSED C] + case '\u24D2': + // â“’ [CIRCLED LATIN SMALL LETTER C] + case '\uA73E': + // Ꜿ [LATIN CAPITAL LETTER REVERSED C WITH DOT] + case '\uA73F': + // ꜿ [LATIN SMALL LETTER REVERSED C WITH DOT] + case '\uFF43': // c [FULLWIDTH LATIN SMALL LETTER C] + output[outputPos++] = 'c'; + break; + + case '\u249E': // â’ž [PARENTHESIZED LATIN SMALL LETTER C] + output[outputPos++] = '('; + output[outputPos++] = 'c'; + output[outputPos++] = ')'; + break; + + case '\u00D0': + // � [LATIN CAPITAL LETTER ETH] + case '\u010E': + // ÄŽ [LATIN CAPITAL LETTER D WITH CARON] + case '\u0110': + // � [LATIN CAPITAL LETTER D WITH STROKE] + case '\u0189': + // Ɖ [LATIN CAPITAL LETTER AFRICAN D] + case '\u018A': + // ÆŠ[LATIN CAPITAL LETTER D WITH HOOK] + case '\u018B': + // Æ‹ [LATIN CAPITAL LETTER D WITH TOPBAR] + case 
'\u1D05': + // á´… [LATIN LETTER SMALL CAPITAL D] + case '\u1D06': + // á´†[LATIN LETTER SMALL CAPITAL ETH] + case '\u1E0A': + // Ḋ[LATIN CAPITAL LETTER D WITH DOT ABOVE] + case '\u1E0C': + // Ḍ [LATIN CAPITAL LETTER D WITH DOT BELOW] + case '\u1E0E': + // Ḏ [LATIN CAPITAL LETTER D WITH LINE BELOW] + case '\u1E10': + // � [LATIN CAPITAL LETTER D WITH CEDILLA] + case '\u1E12': + // Ḓ [LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW] + case '\u24B9': + // â’¹ [CIRCLED LATIN CAPITAL LETTER D] + case '\uA779': + // � [LATIN CAPITAL LETTER INSULAR D] + case '\uFF24': // D [FULLWIDTH LATIN CAPITAL LETTER D] + output[outputPos++] = 'D'; + break; + + case '\u00F0': + // ð [LATIN SMALL LETTER ETH] + case '\u010F': + // � [LATIN SMALL LETTER D WITH CARON] + case '\u0111': + // Ä‘ [LATIN SMALL LETTER D WITH STROKE] + case '\u018C': + // ÆŒ [LATIN SMALL LETTER D WITH TOPBAR] + case '\u0221': + // È¡ [LATIN SMALL LETTER D WITH CURL] + case '\u0256': + // É– [LATIN SMALL LETTER D WITH TAIL] + case '\u0257': + // É— [LATIN SMALL LETTER D WITH HOOK] + case '\u1D6D': + // áµ [LATIN SMALL LETTER D WITH MIDDLE TILDE] + case '\u1D81': + // � [LATIN SMALL LETTER D WITH PALATAL HOOK] + case '\u1D91': + // ᶑ [LATIN SMALL LETTER D WITH HOOK AND TAIL] + case '\u1E0B': + // ḋ [LATIN SMALL LETTER D WITH DOT ABOVE] + case '\u1E0D': + // � [LATIN SMALL LETTER D WITH DOT BELOW] + case '\u1E0F': + // � [LATIN SMALL LETTER D WITH LINE BELOW] + case '\u1E11': + // ḑ [LATIN SMALL LETTER D WITH CEDILLA] + case '\u1E13': + // ḓ [LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW] + case '\u24D3': + // â““ [CIRCLED LATIN SMALL LETTER D] + case '\uA77A': + // � [LATIN SMALL LETTER INSULAR D] + case '\uFF44': // d [FULLWIDTH LATIN SMALL LETTER D] + output[outputPos++] = 'd'; + break; + + case '\u01C4': + // Ç„ [LATIN CAPITAL LETTER DZ WITH CARON] + case '\u01F1': // DZ [LATIN CAPITAL LETTER DZ] + output[outputPos++] = 'D'; + output[outputPos++] = 'Z'; + break; + + case '\u01C5': + // Ç… [LATIN CAPITAL LETTER D 
WITH SMALL LETTER Z WITH CARON] + case '\u01F2': // Dz [LATIN CAPITAL LETTER D WITH SMALL LETTER Z] + output[outputPos++] = 'D'; + output[outputPos++] = 'z'; + break; + + case '\u249F': // â’Ÿ [PARENTHESIZED LATIN SMALL LETTER D] + output[outputPos++] = '('; + output[outputPos++] = 'd'; + output[outputPos++] = ')'; + break; + + case '\u0238': // ȸ [LATIN SMALL LETTER DB DIGRAPH] + output[outputPos++] = 'd'; + output[outputPos++] = 'b'; + break; + + case '\u01C6': + // dž[LATIN SMALL LETTER DZ WITH CARON] + case '\u01F3': + // dz [LATIN SMALL LETTER DZ] + case '\u02A3': + // Ê£ [LATIN SMALL LETTER DZ DIGRAPH] + case '\u02A5': // Ê¥ [LATIN SMALL LETTER DZ DIGRAPH WITH CURL] + output[outputPos++] = 'd'; + output[outputPos++] = 'z'; + break; + + case '\u00C8': + // È [LATIN CAPITAL LETTER E WITH GRAVE] + case '\u00C9': + // É [LATIN CAPITAL LETTER E WITH ACUTE] + case '\u00CA': + // Ê[LATIN CAPITAL LETTER E WITH CIRCUMFLEX] + case '\u00CB': + // Ë [LATIN CAPITAL LETTER E WITH DIAERESIS] + case '\u0112': + // Ä’ [LATIN CAPITAL LETTER E WITH MACRON] + case '\u0114': + // �? [LATIN CAPITAL LETTER E WITH BREVE] + case '\u0116': + // Ä– [LATIN CAPITAL LETTER E WITH DOT ABOVE] + case '\u0118': + // Ę [LATIN CAPITAL LETTER E WITH OGONEK] + case '\u011A': + // Äš [LATIN CAPITAL LETTER E WITH CARON] + case '\u018E': + // ÆŽ [LATIN CAPITAL LETTER REVERSED E] + case '\u0190': + // � [LATIN CAPITAL LETTER OPEN E] + case '\u0204': + // È„ [LATIN CAPITAL LETTER E WITH DOUBLE GRAVE] + case '\u0206': + // Ȇ[LATIN CAPITAL LETTER E WITH INVERTED BREVE] + case '\u0228': + // Ȩ [LATIN CAPITAL LETTER E WITH CEDILLA] + case '\u0246': + // Ɇ[LATIN CAPITAL LETTER E WITH STROKE] + case '\u1D07': + // á´‡ [LATIN LETTER SMALL CAPITAL E] + case '\u1E14': + // �? 
[LATIN CAPITAL LETTER E WITH MACRON AND GRAVE] + case '\u1E16': + // Ḗ [LATIN CAPITAL LETTER E WITH MACRON AND ACUTE] + case '\u1E18': + // Ḙ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW] + case '\u1E1A': + // Ḛ [LATIN CAPITAL LETTER E WITH TILDE BELOW] + case '\u1E1C': + // Ḝ [LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE] + case '\u1EB8': + // Ẹ [LATIN CAPITAL LETTER E WITH DOT BELOW] + case '\u1EBA': + // Ẻ [LATIN CAPITAL LETTER E WITH HOOK ABOVE] + case '\u1EBC': + // Ẽ [LATIN CAPITAL LETTER E WITH TILDE] + case '\u1EBE': + // Ế [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE] + case '\u1EC0': + // Ề [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE] + case '\u1EC2': + // Ể [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE] + case '\u1EC4': + // Ễ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE] + case '\u1EC6': + // Ệ[LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW] + case '\u24BA': + // â’º [CIRCLED LATIN CAPITAL LETTER E] + case '\u2C7B': + // â±» [LATIN LETTER SMALL CAPITAL TURNED E] + case '\uFF25': // ï¼¥ [FULLWIDTH LATIN CAPITAL LETTER E] + output[outputPos++] = 'E'; + break; + + case '\u00E8': + // è [LATIN SMALL LETTER E WITH GRAVE] + case '\u00E9': + // é [LATIN SMALL LETTER E WITH ACUTE] + case '\u00EA': + // ê [LATIN SMALL LETTER E WITH CIRCUMFLEX] + case '\u00EB': + // ë [LATIN SMALL LETTER E WITH DIAERESIS] + case '\u0113': + // Ä“ [LATIN SMALL LETTER E WITH MACRON] + case '\u0115': + // Ä• [LATIN SMALL LETTER E WITH BREVE] + case '\u0117': + // Ä— [LATIN SMALL LETTER E WITH DOT ABOVE] + case '\u0119': + // Ä™ [LATIN SMALL LETTER E WITH OGONEK] + case '\u011B': + // Ä› [LATIN SMALL LETTER E WITH CARON] + case '\u01DD': + // � [LATIN SMALL LETTER TURNED E] + case '\u0205': + // È… [LATIN SMALL LETTER E WITH DOUBLE GRAVE] + case '\u0207': + // ȇ [LATIN SMALL LETTER E WITH INVERTED BREVE] + case '\u0229': + // È© [LATIN SMALL LETTER E WITH CEDILLA] + case '\u0247': + // ɇ [LATIN SMALL LETTER E WITH STROKE] + case '\u0258': + // ɘ 
[LATIN SMALL LETTER REVERSED E] + case '\u025B': + // É› [LATIN SMALL LETTER OPEN E] + case '\u025C': + // Éœ [LATIN SMALL LETTER REVERSED OPEN E] + case '\u025D': + // � [LATIN SMALL LETTER REVERSED OPEN E WITH HOOK] + case '\u025E': + // Éž [LATIN SMALL LETTER CLOSED REVERSED OPEN E] + case '\u029A': + // Êš [LATIN SMALL LETTER CLOSED OPEN E] + case '\u1D08': + // á´ˆ [LATIN SMALL LETTER TURNED OPEN E] + case '\u1D92': + // ᶒ [LATIN SMALL LETTER E WITH RETROFLEX HOOK] + case '\u1D93': + // ᶓ [LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK] + case '\u1D94': + // �? [LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK] + case '\u1E15': + // ḕ [LATIN SMALL LETTER E WITH MACRON AND GRAVE] + case '\u1E17': + // ḗ [LATIN SMALL LETTER E WITH MACRON AND ACUTE] + case '\u1E19': + // ḙ [LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW] + case '\u1E1B': + // ḛ [LATIN SMALL LETTER E WITH TILDE BELOW] + case '\u1E1D': + // � [LATIN SMALL LETTER E WITH CEDILLA AND BREVE] + case '\u1EB9': + // ẹ [LATIN SMALL LETTER E WITH DOT BELOW] + case '\u1EBB': + // ẻ [LATIN SMALL LETTER E WITH HOOK ABOVE] + case '\u1EBD': + // ẽ [LATIN SMALL LETTER E WITH TILDE] + case '\u1EBF': + // ế [LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE] + case '\u1EC1': + // � [LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE] + case '\u1EC3': + // ể [LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE] + case '\u1EC5': + // á»… [LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE] + case '\u1EC7': + // ệ [LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW] + case '\u2091': + // â‚‘ [LATIN SUBSCRIPT SMALL LETTER E] + case '\u24D4': + // �? 
[CIRCLED LATIN SMALL LETTER E] + case '\u2C78': + // ⱸ [LATIN SMALL LETTER E WITH NOTCH] + case '\uFF45': // ï½… [FULLWIDTH LATIN SMALL LETTER E] + output[outputPos++] = 'e'; + break; + + case '\u24A0': // â’ [PARENTHESIZED LATIN SMALL LETTER E] + output[outputPos++] = '('; + output[outputPos++] = 'e'; + output[outputPos++] = ')'; + break; + + case '\u0191': + // Æ‘ [LATIN CAPITAL LETTER F WITH HOOK] + case '\u1E1E': + // Ḟ [LATIN CAPITAL LETTER F WITH DOT ABOVE] + case '\u24BB': + // â’» [CIRCLED LATIN CAPITAL LETTER F] + case '\uA730': + // ꜰ [LATIN LETTER SMALL CAPITAL F] + case '\uA77B': + // � [LATIN CAPITAL LETTER INSULAR F] + case '\uA7FB': + // ꟻ [LATIN EPIGRAPHIC LETTER REVERSED F] + case '\uFF26': // F [FULLWIDTH LATIN CAPITAL LETTER F] + output[outputPos++] = 'F'; + break; + + case '\u0192': + // Æ’ [LATIN SMALL LETTER F WITH HOOK] + case '\u1D6E': + // áµ® [LATIN SMALL LETTER F WITH MIDDLE TILDE] + case '\u1D82': + // ᶂ [LATIN SMALL LETTER F WITH PALATAL HOOK] + case '\u1E1F': + // ḟ [LATIN SMALL LETTER F WITH DOT ABOVE] + case '\u1E9B': + // ẛ [LATIN SMALL LETTER LONG S WITH DOT ABOVE] + case '\u24D5': + // â“• [CIRCLED LATIN SMALL LETTER F] + case '\uA77C': + // � [LATIN SMALL LETTER INSULAR F] + case '\uFF46': // f[FULLWIDTH LATIN SMALL LETTER F] + output[outputPos++] = 'f'; + break; + + case '\u24A1': // â’¡ [PARENTHESIZED LATIN SMALL LETTER F] + output[outputPos++] = '('; + output[outputPos++] = 'f'; + output[outputPos++] = ')'; + break; + + case '\uFB00': // ff [LATIN SMALL LIGATURE FF] + output[outputPos++] = 'f'; + output[outputPos++] = 'f'; + break; + + case '\uFB03': // ffi [LATIN SMALL LIGATURE FFI] + output[outputPos++] = 'f'; + output[outputPos++] = 'f'; + output[outputPos++] = 'i'; + break; + + case '\uFB04': // ffl [LATIN SMALL LIGATURE FFL] + output[outputPos++] = 'f'; + output[outputPos++] = 'f'; + output[outputPos++] = 'l'; + break; + + case '\uFB01': // � [LATIN SMALL LIGATURE FI] + output[outputPos++] = 'f'; + output[outputPos++] = 
'i'; + break; + + case '\uFB02': // fl [LATIN SMALL LIGATURE FL] + output[outputPos++] = 'f'; + output[outputPos++] = 'l'; + break; + + case '\u011C': + // Äœ [LATIN CAPITAL LETTER G WITH CIRCUMFLEX] + case '\u011E': + // Äž [LATIN CAPITAL LETTER G WITH BREVE] + case '\u0120': + // Ä [LATIN CAPITAL LETTER G WITH DOT ABOVE] + case '\u0122': + // Ä¢ [LATIN CAPITAL LETTER G WITH CEDILLA] + case '\u0193': + // Æ“ [LATIN CAPITAL LETTER G WITH HOOK] + case '\u01E4': + // Ǥ [LATIN CAPITAL LETTER G WITH STROKE] + case '\u01E5': + // Ç¥ [LATIN SMALL LETTER G WITH STROKE] + case '\u01E6': + // Ǧ [LATIN CAPITAL LETTER G WITH CARON] + case '\u01E7': + // ǧ [LATIN SMALL LETTER G WITH CARON] + case '\u01F4': + // Ç´ [LATIN CAPITAL LETTER G WITH ACUTE] + case '\u0262': + // É¢ [LATIN LETTER SMALL CAPITAL G] + case '\u029B': + // Ê› [LATIN LETTER SMALL CAPITAL G WITH HOOK] + case '\u1E20': + // Ḡ[LATIN CAPITAL LETTER G WITH MACRON] + case '\u24BC': + // â’¼ [CIRCLED LATIN CAPITAL LETTER G] + case '\uA77D': + // � [LATIN CAPITAL LETTER INSULAR G] + case '\uA77E': + // � [LATIN CAPITAL LETTER TURNED INSULAR G] + case '\uFF27': // G [FULLWIDTH LATIN CAPITAL LETTER G] + output[outputPos++] = 'G'; + break; + + case '\u011D': + // � [LATIN SMALL LETTER G WITH CIRCUMFLEX] + case '\u011F': + // ÄŸ [LATIN SMALL LETTER G WITH BREVE] + case '\u0121': + // Ä¡ [LATIN SMALL LETTER G WITH DOT ABOVE] + case '\u0123': + // Ä£ [LATIN SMALL LETTER G WITH CEDILLA] + case '\u01F5': + // ǵ [LATIN SMALL LETTER G WITH ACUTE] + case '\u0260': + // É [LATIN SMALL LETTER G WITH HOOK] + case '\u0261': + // É¡ [LATIN SMALL LETTER SCRIPT G] + case '\u1D77': + // áµ· [LATIN SMALL LETTER TURNED G] + case '\u1D79': + // áµ¹ [LATIN SMALL LETTER INSULAR G] + case '\u1D83': + // ᶃ [LATIN SMALL LETTER G WITH PALATAL HOOK] + case '\u1E21': + // ḡ [LATIN SMALL LETTER G WITH MACRON] + case '\u24D6': + // â“– [CIRCLED LATIN SMALL LETTER G] + case '\uA77F': + // � [LATIN SMALL LETTER TURNED INSULAR G] + case '\uFF47': // 
g [FULLWIDTH LATIN SMALL LETTER G] + output[outputPos++] = 'g'; + break; + + case '\u24A2': // â’¢ [PARENTHESIZED LATIN SMALL LETTER G] + output[outputPos++] = '('; + output[outputPos++] = 'g'; + output[outputPos++] = ')'; + break; + + case '\u0124': + // Ĥ [LATIN CAPITAL LETTER H WITH CIRCUMFLEX] + case '\u0126': + // Ħ [LATIN CAPITAL LETTER H WITH STROKE] + case '\u021E': + // Èž [LATIN CAPITAL LETTER H WITH CARON] + case '\u029C': + // Êœ [LATIN LETTER SMALL CAPITAL H] + case '\u1E22': + // Ḣ [LATIN CAPITAL LETTER H WITH DOT ABOVE] + case '\u1E24': + // Ḥ [LATIN CAPITAL LETTER H WITH DOT BELOW] + case '\u1E26': + // Ḧ [LATIN CAPITAL LETTER H WITH DIAERESIS] + case '\u1E28': + // Ḩ [LATIN CAPITAL LETTER H WITH CEDILLA] + case '\u1E2A': + // Ḫ [LATIN CAPITAL LETTER H WITH BREVE BELOW] + case '\u24BD': + // â’½ [CIRCLED LATIN CAPITAL LETTER H] + case '\u2C67': + // Ⱨ [LATIN CAPITAL LETTER H WITH DESCENDER] + case '\u2C75': + // â±µ [LATIN CAPITAL LETTER HALF H] + case '\uFF28': // H [FULLWIDTH LATIN CAPITAL LETTER H] + output[outputPos++] = 'H'; + break; + + case '\u0125': + // Ä¥ [LATIN SMALL LETTER H WITH CIRCUMFLEX] + case '\u0127': + // ħ [LATIN SMALL LETTER H WITH STROKE] + case '\u021F': + // ÈŸ [LATIN SMALL LETTER H WITH CARON] + case '\u0265': + // É¥ [LATIN SMALL LETTER TURNED H] + case '\u0266': + // ɦ [LATIN SMALL LETTER H WITH HOOK] + case '\u02AE': + // Ê® [LATIN SMALL LETTER TURNED H WITH FISHHOOK] + case '\u02AF': + // ʯ [LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL] + case '\u1E23': + // ḣ [LATIN SMALL LETTER H WITH DOT ABOVE] + case '\u1E25': + // ḥ [LATIN SMALL LETTER H WITH DOT BELOW] + case '\u1E27': + // ḧ [LATIN SMALL LETTER H WITH DIAERESIS] + case '\u1E29': + // ḩ [LATIN SMALL LETTER H WITH CEDILLA] + case '\u1E2B': + // ḫ [LATIN SMALL LETTER H WITH BREVE BELOW] + case '\u1E96': + // ẖ [LATIN SMALL LETTER H WITH LINE BELOW] + case '\u24D7': + // â“— [CIRCLED LATIN SMALL LETTER H] + case '\u2C68': + // ⱨ [LATIN SMALL LETTER H WITH 
DESCENDER] + case '\u2C76': + // ⱶ [LATIN SMALL LETTER HALF H] + case '\uFF48': // h [FULLWIDTH LATIN SMALL LETTER H] + output[outputPos++] = 'h'; + break; + + case '\u01F6': // Ƕ http://en.wikipedia.org/wiki/Hwair [LATIN CAPITAL LETTER HWAIR] + output[outputPos++] = 'H'; + output[outputPos++] = 'V'; + break; + + case '\u24A3': // â’£ [PARENTHESIZED LATIN SMALL LETTER H] + output[outputPos++] = '('; + output[outputPos++] = 'h'; + output[outputPos++] = ')'; + break; + + case '\u0195': // Æ• [LATIN SMALL LETTER HV] + output[outputPos++] = 'h'; + output[outputPos++] = 'v'; + break; + + case '\u00CC': + // ÃŒ [LATIN CAPITAL LETTER I WITH GRAVE] + case '\u00CD': + // � [LATIN CAPITAL LETTER I WITH ACUTE] + case '\u00CE': + // ÃŽ [LATIN CAPITAL LETTER I WITH CIRCUMFLEX] + case '\u00CF': + // � [LATIN CAPITAL LETTER I WITH DIAERESIS] + case '\u0128': + // Ĩ [LATIN CAPITAL LETTER I WITH TILDE] + case '\u012A': + // Ī [LATIN CAPITAL LETTER I WITH MACRON] + case '\u012C': + // Ĭ [LATIN CAPITAL LETTER I WITH BREVE] + case '\u012E': + // Ä® [LATIN CAPITAL LETTER I WITH OGONEK] + case '\u0130': + // Ä° [LATIN CAPITAL LETTER I WITH DOT ABOVE] + case '\u0196': + // Æ– [LATIN CAPITAL LETTER IOTA] + case '\u0197': + // Æ— [LATIN CAPITAL LETTER I WITH STROKE] + case '\u01CF': + // � [LATIN CAPITAL LETTER I WITH CARON] + case '\u0208': + // Ȉ [LATIN CAPITAL LETTER I WITH DOUBLE GRAVE] + case '\u020A': + // ÈŠ[LATIN CAPITAL LETTER I WITH INVERTED BREVE] + case '\u026A': + // ɪ [LATIN LETTER SMALL CAPITAL I] + case '\u1D7B': + // áµ» [LATIN SMALL CAPITAL LETTER I WITH STROKE] + case '\u1E2C': + // Ḭ [LATIN CAPITAL LETTER I WITH TILDE BELOW] + case '\u1E2E': + // Ḯ [LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE] + case '\u1EC8': + // Ỉ [LATIN CAPITAL LETTER I WITH HOOK ABOVE] + case '\u1ECA': + // Ị[LATIN CAPITAL LETTER I WITH DOT BELOW] + case '\u24BE': + // â’¾ [CIRCLED LATIN CAPITAL LETTER I] + case '\uA7FE': + // ꟾ [LATIN EPIGRAPHIC LETTER I LONGA] + case '\uFF29': // I [FULLWIDTH 
LATIN CAPITAL LETTER I] + output[outputPos++] = 'I'; + break; + + case '\u00EC': + // ì [LATIN SMALL LETTER I WITH GRAVE] + case '\u00ED': + // à[LATIN SMALL LETTER I WITH ACUTE] + case '\u00EE': + // î [LATIN SMALL LETTER I WITH CIRCUMFLEX] + case '\u00EF': + // ï [LATIN SMALL LETTER I WITH DIAERESIS] + case '\u0129': + // Ä© [LATIN SMALL LETTER I WITH TILDE] + case '\u012B': + // Ä« [LATIN SMALL LETTER I WITH MACRON] + case '\u012D': + // Ä [LATIN SMALL LETTER I WITH BREVE] + case '\u012F': + // į [LATIN SMALL LETTER I WITH OGONEK] + case '\u0131': + // ı [LATIN SMALL LETTER DOTLESS I] + case '\u01D0': + // � [LATIN SMALL LETTER I WITH CARON] + case '\u0209': + // ȉ [LATIN SMALL LETTER I WITH DOUBLE GRAVE] + case '\u020B': + // È‹ [LATIN SMALL LETTER I WITH INVERTED BREVE] + case '\u0268': + // ɨ [LATIN SMALL LETTER I WITH STROKE] + case '\u1D09': + // á´‰ [LATIN SMALL LETTER TURNED I] + case '\u1D62': + // áµ¢ [LATIN SUBSCRIPT SMALL LETTER I] + case '\u1D7C': + // áµ¼ [LATIN SMALL LETTER IOTA WITH STROKE] + case '\u1D96': + // ᶖ [LATIN SMALL LETTER I WITH RETROFLEX HOOK] + case '\u1E2D': + // Ḡ[LATIN SMALL LETTER I WITH TILDE BELOW] + case '\u1E2F': + // ḯ [LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE] + case '\u1EC9': + // ỉ [LATIN SMALL LETTER I WITH HOOK ABOVE] + case '\u1ECB': + // ị [LATIN SMALL LETTER I WITH DOT BELOW] + case '\u2071': + // � [SUPERSCRIPT LATIN SMALL LETTER I] + case '\u24D8': + // ⓘ [CIRCLED LATIN SMALL LETTER I] + case '\uFF49': // i [FULLWIDTH LATIN SMALL LETTER I] + output[outputPos++] = 'i'; + break; + + case '\u0132': // IJ [LATIN CAPITAL LIGATURE IJ] + output[outputPos++] = 'I'; + output[outputPos++] = 'J'; + break; + + case '\u24A4': // â’¤ [PARENTHESIZED LATIN SMALL LETTER I] + output[outputPos++] = '('; + output[outputPos++] = 'i'; + output[outputPos++] = ')'; + break; + + case '\u0133': // ij [LATIN SMALL LIGATURE IJ] + output[outputPos++] = 'i'; + output[outputPos++] = 'j'; + break; + + case '\u0134': + // Ä´ [LATIN CAPITAL 
LETTER J WITH CIRCUMFLEX] + case '\u0248': + // Ɉ [LATIN CAPITAL LETTER J WITH STROKE] + case '\u1D0A': + // á´Š[LATIN LETTER SMALL CAPITAL J] + case '\u24BF': + // â’¿ [CIRCLED LATIN CAPITAL LETTER J] + case '\uFF2A': // J [FULLWIDTH LATIN CAPITAL LETTER J] + output[outputPos++] = 'J'; + break; + + case '\u0135': + // ĵ [LATIN SMALL LETTER J WITH CIRCUMFLEX] + case '\u01F0': + // Ç° [LATIN SMALL LETTER J WITH CARON] + case '\u0237': + // È· [LATIN SMALL LETTER DOTLESS J] + case '\u0249': + // ɉ [LATIN SMALL LETTER J WITH STROKE] + case '\u025F': + // ÉŸ [LATIN SMALL LETTER DOTLESS J WITH STROKE] + case '\u0284': + // Ê„ [LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK] + case '\u029D': + // � [LATIN SMALL LETTER J WITH CROSSED-TAIL] + case '\u24D9': + // â“™ [CIRCLED LATIN SMALL LETTER J] + case '\u2C7C': + // â±¼ [LATIN SUBSCRIPT SMALL LETTER J] + case '\uFF4A': // j[FULLWIDTH LATIN SMALL LETTER J] + output[outputPos++] = 'j'; + break; + + case '\u24A5': // â’¥ [PARENTHESIZED LATIN SMALL LETTER J] + output[outputPos++] = '('; + output[outputPos++] = 'j'; + output[outputPos++] = ')'; + break; + + case '\u0136': + // Ķ [LATIN CAPITAL LETTER K WITH CEDILLA] + case '\u0198': + // Ƙ [LATIN CAPITAL LETTER K WITH HOOK] + case '\u01E8': + // Ǩ [LATIN CAPITAL LETTER K WITH CARON] + case '\u1D0B': + // á´‹ [LATIN LETTER SMALL CAPITAL K] + case '\u1E30': + // Ḱ [LATIN CAPITAL LETTER K WITH ACUTE] + case '\u1E32': + // Ḳ [LATIN CAPITAL LETTER K WITH DOT BELOW] + case '\u1E34': + // Ḵ [LATIN CAPITAL LETTER K WITH LINE BELOW] + case '\u24C0': + // â“€ [CIRCLED LATIN CAPITAL LETTER K] + case '\u2C69': + // Ⱪ [LATIN CAPITAL LETTER K WITH DESCENDER] + case '\uA740': + // � [LATIN CAPITAL LETTER K WITH STROKE] + case '\uA742': + // � [LATIN CAPITAL LETTER K WITH DIAGONAL STROKE] + case '\uA744': + // � [LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE] + case '\uFF2B': // K [FULLWIDTH LATIN CAPITAL LETTER K] + output[outputPos++] = 'K'; + break; + + case '\u0137': + // Ä· 
[LATIN SMALL LETTER K WITH CEDILLA] + case '\u0199': + // Æ™ [LATIN SMALL LETTER K WITH HOOK] + case '\u01E9': + // Ç© [LATIN SMALL LETTER K WITH CARON] + case '\u029E': + // Êž [LATIN SMALL LETTER TURNED K] + case '\u1D84': + // ᶄ [LATIN SMALL LETTER K WITH PALATAL HOOK] + case '\u1E31': + // ḱ [LATIN SMALL LETTER K WITH ACUTE] + case '\u1E33': + // ḳ [LATIN SMALL LETTER K WITH DOT BELOW] + case '\u1E35': + // ḵ [LATIN SMALL LETTER K WITH LINE BELOW] + case '\u24DA': + // â“š [CIRCLED LATIN SMALL LETTER K] + case '\u2C6A': + // ⱪ [LATIN SMALL LETTER K WITH DESCENDER] + case '\uA741': + // � [LATIN SMALL LETTER K WITH STROKE] + case '\uA743': + // � [LATIN SMALL LETTER K WITH DIAGONAL STROKE] + case '\uA745': + // � [LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE] + case '\uFF4B': // k [FULLWIDTH LATIN SMALL LETTER K] + output[outputPos++] = 'k'; + break; + + case '\u24A6': // â’¦ [PARENTHESIZED LATIN SMALL LETTER K] + output[outputPos++] = '('; + output[outputPos++] = 'k'; + output[outputPos++] = ')'; + break; + + case '\u0139': + // Ĺ [LATIN CAPITAL LETTER L WITH ACUTE] + case '\u013B': + // Ä» [LATIN CAPITAL LETTER L WITH CEDILLA] + case '\u013D': + // Ľ [LATIN CAPITAL LETTER L WITH CARON] + case '\u013F': + // Ä¿ [LATIN CAPITAL LETTER L WITH MIDDLE DOT] + case '\u0141': + // � [LATIN CAPITAL LETTER L WITH STROKE] + case '\u023D': + // Ƚ [LATIN CAPITAL LETTER L WITH BAR] + case '\u029F': + // ÊŸ [LATIN LETTER SMALL CAPITAL L] + case '\u1D0C': + // á´Œ [LATIN LETTER SMALL CAPITAL L WITH STROKE] + case '\u1E36': + // Ḷ [LATIN CAPITAL LETTER L WITH DOT BELOW] + case '\u1E38': + // Ḹ [LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON] + case '\u1E3A': + // Ḻ [LATIN CAPITAL LETTER L WITH LINE BELOW] + case '\u1E3C': + // Ḽ [LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW] + case '\u24C1': + // � [CIRCLED LATIN CAPITAL LETTER L] + case '\u2C60': + // â± [LATIN CAPITAL LETTER L WITH DOUBLE BAR] + case '\u2C62': + // â±¢ [LATIN CAPITAL LETTER L WITH MIDDLE TILDE] + 
case '\uA746': + // �[LATIN CAPITAL LETTER BROKEN L] + case '\uA748': + // � [LATIN CAPITAL LETTER L WITH HIGH STROKE] + case '\uA780': + // Ꞁ [LATIN CAPITAL LETTER TURNED L] + case '\uFF2C': // L [FULLWIDTH LATIN CAPITAL LETTER L] + output[outputPos++] = 'L'; + break; + + case '\u013A': + // ĺ [LATIN SMALL LETTER L WITH ACUTE] + case '\u013C': + // ļ [LATIN SMALL LETTER L WITH CEDILLA] + case '\u013E': + // ľ [LATIN SMALL LETTER L WITH CARON] + case '\u0140': + // Å€ [LATIN SMALL LETTER L WITH MIDDLE DOT] + case '\u0142': + // Å‚ [LATIN SMALL LETTER L WITH STROKE] + case '\u019A': + // Æš [LATIN SMALL LETTER L WITH BAR] + case '\u0234': + // È´ [LATIN SMALL LETTER L WITH CURL] + case '\u026B': + // É« [LATIN SMALL LETTER L WITH MIDDLE TILDE] + case '\u026C': + // ɬ [LATIN SMALL LETTER L WITH BELT] + case '\u026D': + // É [LATIN SMALL LETTER L WITH RETROFLEX HOOK] + case '\u1D85': + // ᶅ [LATIN SMALL LETTER L WITH PALATAL HOOK] + case '\u1E37': + // ḷ [LATIN SMALL LETTER L WITH DOT BELOW] + case '\u1E39': + // ḹ [LATIN SMALL LETTER L WITH DOT BELOW AND MACRON] + case '\u1E3B': + // ḻ [LATIN SMALL LETTER L WITH LINE BELOW] + case '\u1E3D': + // ḽ [LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW] + case '\u24DB': + // â“› [CIRCLED LATIN SMALL LETTER L] + case '\u2C61': + // ⱡ [LATIN SMALL LETTER L WITH DOUBLE BAR] + case '\uA747': + // � [LATIN SMALL LETTER BROKEN L] + case '\uA749': + // � [LATIN SMALL LETTER L WITH HIGH STROKE] + case '\uA781': + // � [LATIN SMALL LETTER TURNED L] + case '\uFF4C': // l [FULLWIDTH LATIN SMALL LETTER L] + output[outputPos++] = 'l'; + break; + + case '\u01C7': // LJ [LATIN CAPITAL LETTER LJ] + output[outputPos++] = 'L'; + output[outputPos++] = 'J'; + break; + + case '\u1EFA': // Ỻ [LATIN CAPITAL LETTER MIDDLE-WELSH LL] + output[outputPos++] = 'L'; + output[outputPos++] = 'L'; + break; + + case '\u01C8': // Lj [LATIN CAPITAL LETTER L WITH SMALL LETTER J] + output[outputPos++] = 'L'; + output[outputPos++] = 'j'; + break; + + case '\u24A7': 
// â’§ [PARENTHESIZED LATIN SMALL LETTER L] + output[outputPos++] = '('; + output[outputPos++] = 'l'; + output[outputPos++] = ')'; + break; + + case '\u01C9': // lj [LATIN SMALL LETTER LJ] + output[outputPos++] = 'l'; + output[outputPos++] = 'j'; + break; + + case '\u1EFB': // á»» [LATIN SMALL LETTER MIDDLE-WELSH LL] + output[outputPos++] = 'l'; + output[outputPos++] = 'l'; + break; + + case '\u02AA': // ʪ [LATIN SMALL LETTER LS DIGRAPH] + output[outputPos++] = 'l'; + output[outputPos++] = 's'; + break; + + case '\u02AB': // Ê« [LATIN SMALL LETTER LZ DIGRAPH] + output[outputPos++] = 'l'; + output[outputPos++] = 'z'; + break; + + case '\u019C': + // Æœ [LATIN CAPITAL LETTER TURNED M] + case '\u1D0D': + // á´� [LATIN LETTER SMALL CAPITAL M] + case '\u1E3E': + // Ḿ [LATIN CAPITAL LETTER M WITH ACUTE] + case '\u1E40': + // á¹€ [LATIN CAPITAL LETTER M WITH DOT ABOVE] + case '\u1E42': + // Ṃ [LATIN CAPITAL LETTER M WITH DOT BELOW] + case '\u24C2': + // â“‚ [CIRCLED LATIN CAPITAL LETTER M] + case '\u2C6E': + // â±® [LATIN CAPITAL LETTER M WITH HOOK] + case '\uA7FD': + // ꟽ [LATIN EPIGRAPHIC LETTER INVERTED M] + case '\uA7FF': + // ꟿ [LATIN EPIGRAPHIC LETTER ARCHAIC M] + case '\uFF2D': // ï¼ [FULLWIDTH LATIN CAPITAL LETTER M] + output[outputPos++] = 'M'; + break; + + case '\u026F': + // ɯ [LATIN SMALL LETTER TURNED M] + case '\u0270': + // É° [LATIN SMALL LETTER TURNED M WITH LONG LEG] + case '\u0271': + // ɱ [LATIN SMALL LETTER M WITH HOOK] + case '\u1D6F': + // ᵯ [LATIN SMALL LETTER M WITH MIDDLE TILDE] + case '\u1D86': + // ᶆ[LATIN SMALL LETTER M WITH PALATAL HOOK] + case '\u1E3F': + // ḿ [LATIN SMALL LETTER M WITH ACUTE] + case '\u1E41': + // � [LATIN SMALL LETTER M WITH DOT ABOVE] + case '\u1E43': + // ṃ [LATIN SMALL LETTER M WITH DOT BELOW] + case '\u24DC': + // â“œ [CIRCLED LATIN SMALL LETTER M] + case '\uFF4D': // � [FULLWIDTH LATIN SMALL LETTER M] + output[outputPos++] = 'm'; + break; + + case '\u24A8': // â’¨ [PARENTHESIZED LATIN SMALL LETTER M] + 
output[outputPos++] = '('; + output[outputPos++] = 'm'; + output[outputPos++] = ')'; + break; + + case '\u00D1': + // Ñ [LATIN CAPITAL LETTER N WITH TILDE] + case '\u0143': + // Ã…Æ’ [LATIN CAPITAL LETTER N WITH ACUTE] + case '\u0145': + // Å… [LATIN CAPITAL LETTER N WITH CEDILLA] + case '\u0147': + // Ň [LATIN CAPITAL LETTER N WITH CARON] + case '\u014A': + // Ã…Å http://en.wikipedia.org/wiki/Eng_(letter) [LATIN CAPITAL LETTER ENG] + case '\u019D': + // � [LATIN CAPITAL LETTER N WITH LEFT HOOK] + case '\u01F8': + // Ǹ [LATIN CAPITAL LETTER N WITH GRAVE] + case '\u0220': + // È [LATIN CAPITAL LETTER N WITH LONG RIGHT LEG] + case '\u0274': + // É´ [LATIN LETTER SMALL CAPITAL N] + case '\u1D0E': + // á´Ž [LATIN LETTER SMALL CAPITAL REVERSED N] + case '\u1E44': + // Ṅ [LATIN CAPITAL LETTER N WITH DOT ABOVE] + case '\u1E46': + // Ṇ[LATIN CAPITAL LETTER N WITH DOT BELOW] + case '\u1E48': + // Ṉ [LATIN CAPITAL LETTER N WITH LINE BELOW] + case '\u1E4A': + // Ṋ[LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW] + case '\u24C3': + // Ⓝ [CIRCLED LATIN CAPITAL LETTER N] + case '\uFF2E': // ï¼® [FULLWIDTH LATIN CAPITAL LETTER N] + output[outputPos++] = 'N'; + break; + + case '\u00F1': + // ñ [LATIN SMALL LETTER N WITH TILDE] + case '\u0144': + // Å„ [LATIN SMALL LETTER N WITH ACUTE] + case '\u0146': + // ņ[LATIN SMALL LETTER N WITH CEDILLA] + case '\u0148': + // ň [LATIN SMALL LETTER N WITH CARON] + case '\u0149': + // ʼn [LATIN SMALL LETTER N PRECEDED BY APOSTROPHE] + case '\u014B': + // Å‹ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN SMALL LETTER ENG] + case '\u019E': + // Æž [LATIN SMALL LETTER N WITH LONG RIGHT LEG] + case '\u01F9': + // ǹ [LATIN SMALL LETTER N WITH GRAVE] + case '\u0235': + // ȵ [LATIN SMALL LETTER N WITH CURL] + case '\u0272': + // ɲ [LATIN SMALL LETTER N WITH LEFT HOOK] + case '\u0273': + // ɳ [LATIN SMALL LETTER N WITH RETROFLEX HOOK] + case '\u1D70': + // áµ° [LATIN SMALL LETTER N WITH MIDDLE TILDE] + case '\u1D87': + // ᶇ [LATIN SMALL LETTER N WITH 
PALATAL HOOK] + case '\u1E45': + // á¹… [LATIN SMALL LETTER N WITH DOT ABOVE] + case '\u1E47': + // ṇ [LATIN SMALL LETTER N WITH DOT BELOW] + case '\u1E49': + // ṉ [LATIN SMALL LETTER N WITH LINE BELOW] + case '\u1E4B': + // ṋ [LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW] + case '\u207F': + // � [SUPERSCRIPT LATIN SMALL LETTER N] + case '\u24DD': + // � [CIRCLED LATIN SMALL LETTER N] + case '\uFF4E': // n [FULLWIDTH LATIN SMALL LETTER N] + output[outputPos++] = 'n'; + break; + + case '\u01CA': // ÇŠ[LATIN CAPITAL LETTER NJ] + output[outputPos++] = 'N'; + output[outputPos++] = 'J'; + break; + + case '\u01CB': // Ç‹ [LATIN CAPITAL LETTER N WITH SMALL LETTER J] + output[outputPos++] = 'N'; + output[outputPos++] = 'j'; + break; + + case '\u24A9': // â’© [PARENTHESIZED LATIN SMALL LETTER N] + output[outputPos++] = '('; + output[outputPos++] = 'n'; + output[outputPos++] = ')'; + break; + + case '\u01CC': // ÇŒ [LATIN SMALL LETTER NJ] + output[outputPos++] = 'n'; + output[outputPos++] = 'j'; + break; + + case '\u00D2': + // Ã’ [LATIN CAPITAL LETTER O WITH GRAVE] + case '\u00D3': + // Ó [LATIN CAPITAL LETTER O WITH ACUTE] + case '\u00D4': + // �? 
[LATIN CAPITAL LETTER O WITH CIRCUMFLEX] + case '\u00D5': + // Õ [LATIN CAPITAL LETTER O WITH TILDE] + case '\u00D6': + // Ö [LATIN CAPITAL LETTER O WITH DIAERESIS] + case '\u00D8': + // Ø [LATIN CAPITAL LETTER O WITH STROKE] + case '\u014C': + // Ã…Å’ [LATIN CAPITAL LETTER O WITH MACRON] + case '\u014E': + // ÅŽ [LATIN CAPITAL LETTER O WITH BREVE] + case '\u0150': + // � [LATIN CAPITAL LETTER O WITH DOUBLE ACUTE] + case '\u0186': + // Ɔ[LATIN CAPITAL LETTER OPEN O] + case '\u019F': + // ÆŸ [LATIN CAPITAL LETTER O WITH MIDDLE TILDE] + case '\u01A0': + // Æ [LATIN CAPITAL LETTER O WITH HORN] + case '\u01D1': + // Ç‘ [LATIN CAPITAL LETTER O WITH CARON] + case '\u01EA': + // Ǫ [LATIN CAPITAL LETTER O WITH OGONEK] + case '\u01EC': + // Ǭ [LATIN CAPITAL LETTER O WITH OGONEK AND MACRON] + case '\u01FE': + // Ǿ [LATIN CAPITAL LETTER O WITH STROKE AND ACUTE] + case '\u020C': + // ÈŒ [LATIN CAPITAL LETTER O WITH DOUBLE GRAVE] + case '\u020E': + // ÈŽ [LATIN CAPITAL LETTER O WITH INVERTED BREVE] + case '\u022A': + // Ȫ [LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON] + case '\u022C': + // Ȭ [LATIN CAPITAL LETTER O WITH TILDE AND MACRON] + case '\u022E': + // È® [LATIN CAPITAL LETTER O WITH DOT ABOVE] + case '\u0230': + // È° [LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON] + case '\u1D0F': + // á´� [LATIN LETTER SMALL CAPITAL O] + case '\u1D10': + // á´� [LATIN LETTER SMALL CAPITAL OPEN O] + case '\u1E4C': + // Ṍ [LATIN CAPITAL LETTER O WITH TILDE AND ACUTE] + case '\u1E4E': + // Ṏ [LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS] + case '\u1E50': + // � [LATIN CAPITAL LETTER O WITH MACRON AND GRAVE] + case '\u1E52': + // á¹’ [LATIN CAPITAL LETTER O WITH MACRON AND ACUTE] + case '\u1ECC': + // Ọ [LATIN CAPITAL LETTER O WITH DOT BELOW] + case '\u1ECE': + // Ỏ [LATIN CAPITAL LETTER O WITH HOOK ABOVE] + case '\u1ED0': + // � [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE] + case '\u1ED2': + // á»’ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE] + case '\u1ED4': + // 
�? [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE] + case '\u1ED6': + // á»– [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE] + case '\u1ED8': + // Ộ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW] + case '\u1EDA': + // Ớ [LATIN CAPITAL LETTER O WITH HORN AND ACUTE] + case '\u1EDC': + // Ờ [LATIN CAPITAL LETTER O WITH HORN AND GRAVE] + case '\u1EDE': + // Ở [LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE] + case '\u1EE0': + // á» [LATIN CAPITAL LETTER O WITH HORN AND TILDE] + case '\u1EE2': + // Ợ [LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW] + case '\u24C4': + // â“„ [CIRCLED LATIN CAPITAL LETTER O] + case '\uA74A': + // �[LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY] + case '\uA74C': + // � [LATIN CAPITAL LETTER O WITH LOOP] + case '\uFF2F': // O [FULLWIDTH LATIN CAPITAL LETTER O] + output[outputPos++] = 'O'; + break; + + case '\u00F2': + // ò [LATIN SMALL LETTER O WITH GRAVE] + case '\u00F3': + // ó [LATIN SMALL LETTER O WITH ACUTE] + case '\u00F4': + // ô [LATIN SMALL LETTER O WITH CIRCUMFLEX] + case '\u00F5': + // õ [LATIN SMALL LETTER O WITH TILDE] + case '\u00F6': + // ö [LATIN SMALL LETTER O WITH DIAERESIS] + case '\u00F8': + // ø [LATIN SMALL LETTER O WITH STROKE] + case '\u014D': + // � [LATIN SMALL LETTER O WITH MACRON] + case '\u014F': + // � [LATIN SMALL LETTER O WITH BREVE] + case '\u0151': + // Å‘ [LATIN SMALL LETTER O WITH DOUBLE ACUTE] + case '\u01A1': + // Æ¡ [LATIN SMALL LETTER O WITH HORN] + case '\u01D2': + // Ç’ [LATIN SMALL LETTER O WITH CARON] + case '\u01EB': + // Ç« [LATIN SMALL LETTER O WITH OGONEK] + case '\u01ED': + // Ç [LATIN SMALL LETTER O WITH OGONEK AND MACRON] + case '\u01FF': + // Ç¿ [LATIN SMALL LETTER O WITH STROKE AND ACUTE] + case '\u020D': + // � [LATIN SMALL LETTER O WITH DOUBLE GRAVE] + case '\u020F': + // � [LATIN SMALL LETTER O WITH INVERTED BREVE] + case '\u022B': + // È« [LATIN SMALL LETTER O WITH DIAERESIS AND MACRON] + case '\u022D': + // È [LATIN SMALL LETTER O WITH TILDE AND MACRON] + case 
'\u022F': + // ȯ [LATIN SMALL LETTER O WITH DOT ABOVE] + case '\u0231': + // ȱ [LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON] + case '\u0254': + // �? [LATIN SMALL LETTER OPEN O] + case '\u0275': + // ɵ [LATIN SMALL LETTER BARRED O] + case '\u1D16': + // á´– [LATIN SMALL LETTER TOP HALF O] + case '\u1D17': + // á´— [LATIN SMALL LETTER BOTTOM HALF O] + case '\u1D97': + // ᶗ [LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK] + case '\u1E4D': + // � [LATIN SMALL LETTER O WITH TILDE AND ACUTE] + case '\u1E4F': + // � [LATIN SMALL LETTER O WITH TILDE AND DIAERESIS] + case '\u1E51': + // ṑ [LATIN SMALL LETTER O WITH MACRON AND GRAVE] + case '\u1E53': + // ṓ [LATIN SMALL LETTER O WITH MACRON AND ACUTE] + case '\u1ECD': + // � [LATIN SMALL LETTER O WITH DOT BELOW] + case '\u1ECF': + // � [LATIN SMALL LETTER O WITH HOOK ABOVE] + case '\u1ED1': + // ố [LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE] + case '\u1ED3': + // ồ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE] + case '\u1ED5': + // ổ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE] + case '\u1ED7': + // á»— [LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE] + case '\u1ED9': + // á»™ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW] + case '\u1EDB': + // á»› [LATIN SMALL LETTER O WITH HORN AND ACUTE] + case '\u1EDD': + // � [LATIN SMALL LETTER O WITH HORN AND GRAVE] + case '\u1EDF': + // ở [LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE] + case '\u1EE1': + // ỡ [LATIN SMALL LETTER O WITH HORN AND TILDE] + case '\u1EE3': + // ợ [LATIN SMALL LETTER O WITH HORN AND DOT BELOW] + case '\u2092': + // â‚’ [LATIN SUBSCRIPT SMALL LETTER O] + case '\u24DE': + // â“ž [CIRCLED LATIN SMALL LETTER O] + case '\u2C7A': + // ⱺ [LATIN SMALL LETTER O WITH LOW RING INSIDE] + case '\uA74B': + // � [LATIN SMALL LETTER O WITH LONG STROKE OVERLAY] + case '\uA74D': + // � [LATIN SMALL LETTER O WITH LOOP] + case '\uFF4F': // � [FULLWIDTH LATIN SMALL LETTER O] + output[outputPos++] = 'o'; + break; + + case '\u0152': + // Å’ [LATIN 
CAPITAL LIGATURE OE] + case '\u0276': // ɶ [LATIN LETTER SMALL CAPITAL OE] + output[outputPos++] = 'O'; + output[outputPos++] = 'E'; + break; + + case '\uA74E': // � [LATIN CAPITAL LETTER OO] + output[outputPos++] = 'O'; + output[outputPos++] = 'O'; + break; + + case '\u0222': + // È¢ http://en.wikipedia.org/wiki/OU [LATIN CAPITAL LETTER OU] + case '\u1D15': // á´• [LATIN LETTER SMALL CAPITAL OU] + output[outputPos++] = 'O'; + output[outputPos++] = 'U'; + break; + + case '\u24AA': // â’ª [PARENTHESIZED LATIN SMALL LETTER O] + output[outputPos++] = '('; + output[outputPos++] = 'o'; + output[outputPos++] = ')'; + break; + + case '\u0153': + // Å“ [LATIN SMALL LIGATURE OE] + case '\u1D14': // á´�? [LATIN SMALL LETTER TURNED OE] + output[outputPos++] = 'o'; + output[outputPos++] = 'e'; + break; + + case '\uA74F': // � [LATIN SMALL LETTER OO] + output[outputPos++] = 'o'; + output[outputPos++] = 'o'; + break; + + case '\u0223': // È£ http://en.wikipedia.org/wiki/OU [LATIN SMALL LETTER OU] + output[outputPos++] = 'o'; + output[outputPos++] = 'u'; + break; + + case '\u01A4': + // Ƥ [LATIN CAPITAL LETTER P WITH HOOK] + case '\u1D18': + // á´˜ [LATIN LETTER SMALL CAPITAL P] + case '\u1E54': + // �? [LATIN CAPITAL LETTER P WITH ACUTE] + case '\u1E56': + // á¹– [LATIN CAPITAL LETTER P WITH DOT ABOVE] + case '\u24C5': + // â“… [CIRCLED LATIN CAPITAL LETTER P] + case '\u2C63': + // â±£ [LATIN CAPITAL LETTER P WITH STROKE] + case '\uA750': + // � [LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER] + case '\uA752': + // � [LATIN CAPITAL LETTER P WITH FLOURISH] + case '\uA754': + // �? 
[LATIN CAPITAL LETTER P WITH SQUIRREL TAIL] + case '\uFF30': // ï¼° [FULLWIDTH LATIN CAPITAL LETTER P] + output[outputPos++] = 'P'; + break; + + case '\u01A5': + // Æ¥ [LATIN SMALL LETTER P WITH HOOK] + case '\u1D71': + // áµ± [LATIN SMALL LETTER P WITH MIDDLE TILDE] + case '\u1D7D': + // áµ½ [LATIN SMALL LETTER P WITH STROKE] + case '\u1D88': + // ᶈ [LATIN SMALL LETTER P WITH PALATAL HOOK] + case '\u1E55': + // ṕ [LATIN SMALL LETTER P WITH ACUTE] + case '\u1E57': + // á¹— [LATIN SMALL LETTER P WITH DOT ABOVE] + case '\u24DF': + // â“Ÿ [CIRCLED LATIN SMALL LETTER P] + case '\uA751': + // � [LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER] + case '\uA753': + // � [LATIN SMALL LETTER P WITH FLOURISH] + case '\uA755': + // � [LATIN SMALL LETTER P WITH SQUIRREL TAIL] + case '\uA7FC': + // ꟼ [LATIN EPIGRAPHIC LETTER REVERSED P] + case '\uFF50': // � [FULLWIDTH LATIN SMALL LETTER P] + output[outputPos++] = 'p'; + break; + + case '\u24AB': // â’« [PARENTHESIZED LATIN SMALL LETTER P] + output[outputPos++] = '('; + output[outputPos++] = 'p'; + output[outputPos++] = ')'; + break; + + case '\u024A': + // ÉŠ[LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL] + case '\u24C6': + // Ⓠ[CIRCLED LATIN CAPITAL LETTER Q] + case '\uA756': + // � [LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER] + case '\uA758': + // � [LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE] + case '\uFF31': // ï¼± [FULLWIDTH LATIN CAPITAL LETTER Q] + output[outputPos++] = 'Q'; + break; + + case '\u0138': + // ĸ http://en.wikipedia.org/wiki/Kra_(letter) [LATIN SMALL LETTER KRA] + case '\u024B': + // É‹ [LATIN SMALL LETTER Q WITH HOOK TAIL] + case '\u02A0': + // Ê [LATIN SMALL LETTER Q WITH HOOK] + case '\u24E0': + // â“ [CIRCLED LATIN SMALL LETTER Q] + case '\uA757': + // � [LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER] + case '\uA759': + // � [LATIN SMALL LETTER Q WITH DIAGONAL STROKE] + case '\uFF51': // q [FULLWIDTH LATIN SMALL LETTER Q] + output[outputPos++] = 'q'; + break; + + case '\u24AC': // â’¬ 
[PARENTHESIZED LATIN SMALL LETTER Q] + output[outputPos++] = '('; + output[outputPos++] = 'q'; + output[outputPos++] = ')'; + break; + + case '\u0239': // ȹ [LATIN SMALL LETTER QP DIGRAPH] + output[outputPos++] = 'q'; + output[outputPos++] = 'p'; + break; + + case '\u0154': + // �? [LATIN CAPITAL LETTER R WITH ACUTE] + case '\u0156': + // Å– [LATIN CAPITAL LETTER R WITH CEDILLA] + case '\u0158': + // Ã…Ëœ [LATIN CAPITAL LETTER R WITH CARON] + case '\u0210': + // È’ [LATIN CAPITAL LETTER R WITH DOUBLE GRAVE] + case '\u0212': + // È’ [LATIN CAPITAL LETTER R WITH INVERTED BREVE] + case '\u024C': + // ÉŒ [LATIN CAPITAL LETTER R WITH STROKE] + case '\u0280': + // Ê€ [LATIN LETTER SMALL CAPITAL R] + case '\u0281': + // � [LATIN LETTER SMALL CAPITAL INVERTED R] + case '\u1D19': + // á´™ [LATIN LETTER SMALL CAPITAL REVERSED R] + case '\u1D1A': + // á´š [LATIN LETTER SMALL CAPITAL TURNED R] + case '\u1E58': + // Ṙ [LATIN CAPITAL LETTER R WITH DOT ABOVE] + case '\u1E5A': + // Ṛ [LATIN CAPITAL LETTER R WITH DOT BELOW] + case '\u1E5C': + // Ṝ [LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON] + case '\u1E5E': + // Ṟ [LATIN CAPITAL LETTER R WITH LINE BELOW] + case '\u24C7': + // Ⓡ [CIRCLED LATIN CAPITAL LETTER R] + case '\u2C64': + // Ɽ [LATIN CAPITAL LETTER R WITH TAIL] + case '\uA75A': + // � [LATIN CAPITAL LETTER R ROTUNDA] + case '\uA782': + // êž‚ [LATIN CAPITAL LETTER INSULAR R] + case '\uFF32': // ï¼² [FULLWIDTH LATIN CAPITAL LETTER R] + output[outputPos++] = 'R'; + break; + + case '\u0155': + // Å• [LATIN SMALL LETTER R WITH ACUTE] + case '\u0157': + // Å— [LATIN SMALL LETTER R WITH CEDILLA] + case '\u0159': + // Ã…â„¢ [LATIN SMALL LETTER R WITH CARON] + case '\u0211': + // È‘ [LATIN SMALL LETTER R WITH DOUBLE GRAVE] + case '\u0213': + // È“ [LATIN SMALL LETTER R WITH INVERTED BREVE] + case '\u024D': + // � [LATIN SMALL LETTER R WITH STROKE] + case '\u027C': + // ɼ [LATIN SMALL LETTER R WITH LONG LEG] + case '\u027D': + // ɽ [LATIN SMALL LETTER R WITH TAIL] + case 
'\u027E': + // ɾ [LATIN SMALL LETTER R WITH FISHHOOK] + case '\u027F': + // É¿ [LATIN SMALL LETTER REVERSED R WITH FISHHOOK] + case '\u1D63': + // áµ£ [LATIN SUBSCRIPT SMALL LETTER R] + case '\u1D72': + // áµ² [LATIN SMALL LETTER R WITH MIDDLE TILDE] + case '\u1D73': + // áµ³ [LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE] + case '\u1D89': + // ᶉ [LATIN SMALL LETTER R WITH PALATAL HOOK] + case '\u1E59': + // á¹™ [LATIN SMALL LETTER R WITH DOT ABOVE] + case '\u1E5B': + // á¹› [LATIN SMALL LETTER R WITH DOT BELOW] + case '\u1E5D': + // � [LATIN SMALL LETTER R WITH DOT BELOW AND MACRON] + case '\u1E5F': + // ṟ [LATIN SMALL LETTER R WITH LINE BELOW] + case '\u24E1': + // â“¡ [CIRCLED LATIN SMALL LETTER R] + case '\uA75B': + // � [LATIN SMALL LETTER R ROTUNDA] + case '\uA783': + // ꞃ [LATIN SMALL LETTER INSULAR R] + case '\uFF52': // ï½’ [FULLWIDTH LATIN SMALL LETTER R] + output[outputPos++] = 'r'; + break; + + case '\u24AD': // â’ [PARENTHESIZED LATIN SMALL LETTER R] + output[outputPos++] = '('; + output[outputPos++] = 'r'; + output[outputPos++] = ')'; + break; + + case '\u015A': + // Ã…Å¡ [LATIN CAPITAL LETTER S WITH ACUTE] + case '\u015C': + // Ã…Å“ [LATIN CAPITAL LETTER S WITH CIRCUMFLEX] + case '\u015E': + // Åž [LATIN CAPITAL LETTER S WITH CEDILLA] + case '\u0160': + // Ã…Â [LATIN CAPITAL LETTER S WITH CARON] + case '\u0218': + // Ș [LATIN CAPITAL LETTER S WITH COMMA BELOW] + case '\u1E60': + // á¹ [LATIN CAPITAL LETTER S WITH DOT ABOVE] + case '\u1E62': + // á¹¢ [LATIN CAPITAL LETTER S WITH DOT BELOW] + case '\u1E64': + // Ṥ [LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE] + case '\u1E66': + // Ṧ [LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE] + case '\u1E68': + // Ṩ [LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE] + case '\u24C8': + // Ⓢ [CIRCLED LATIN CAPITAL LETTER S] + case '\uA731': + // ꜱ [LATIN LETTER SMALL CAPITAL S] + case '\uA785': + // êž… [LATIN SMALL LETTER INSULAR S] + case '\uFF33': // ï¼³ [FULLWIDTH LATIN CAPITAL LETTER S] + 
output[outputPos++] = 'S'; + break; + + case '\u015B': + // Å› [LATIN SMALL LETTER S WITH ACUTE] + case '\u015D': + // � [LATIN SMALL LETTER S WITH CIRCUMFLEX] + case '\u015F': + // ÅŸ [LATIN SMALL LETTER S WITH CEDILLA] + case '\u0161': + // Å¡ [LATIN SMALL LETTER S WITH CARON] + case '\u017F': + // Å¿ http://en.wikipedia.org/wiki/Long_S [LATIN SMALL LETTER LONG S] + case '\u0219': + // È™ [LATIN SMALL LETTER S WITH COMMA BELOW] + case '\u023F': + // È¿ [LATIN SMALL LETTER S WITH SWASH TAIL] + case '\u0282': + // Ê‚ [LATIN SMALL LETTER S WITH HOOK] + case '\u1D74': + // áµ´ [LATIN SMALL LETTER S WITH MIDDLE TILDE] + case '\u1D8A': + // ᶊ[LATIN SMALL LETTER S WITH PALATAL HOOK] + case '\u1E61': + // ṡ [LATIN SMALL LETTER S WITH DOT ABOVE] + case '\u1E63': + // á¹£ [LATIN SMALL LETTER S WITH DOT BELOW] + case '\u1E65': + // á¹¥ [LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE] + case '\u1E67': + // ṧ [LATIN SMALL LETTER S WITH CARON AND DOT ABOVE] + case '\u1E69': + // ṩ [LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE] + case '\u1E9C': + // ẜ [LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE] + case '\u1E9D': + // � [LATIN SMALL LETTER LONG S WITH HIGH STROKE] + case '\u24E2': + // â“¢ [CIRCLED LATIN SMALL LETTER S] + case '\uA784': + // êž„ [LATIN CAPITAL LETTER INSULAR S] + case '\uFF53': // s [FULLWIDTH LATIN SMALL LETTER S] + output[outputPos++] = 's'; + break; + + case '\u1E9E': // ẞ [LATIN CAPITAL LETTER SHARP S] + output[outputPos++] = 'S'; + output[outputPos++] = 'S'; + break; + + case '\u24AE': // â’® [PARENTHESIZED LATIN SMALL LETTER S] + output[outputPos++] = '('; + output[outputPos++] = 's'; + output[outputPos++] = ')'; + break; + + case '\u00DF': // ß [LATIN SMALL LETTER SHARP S] + output[outputPos++] = 's'; + output[outputPos++] = 's'; + break; + + case '\uFB06': // st[LATIN SMALL LIGATURE ST] + output[outputPos++] = 's'; + output[outputPos++] = 't'; + break; + + case '\u0162': + // Å¢ [LATIN CAPITAL LETTER T WITH CEDILLA] + case '\u0164': + // Ť [LATIN 
CAPITAL LETTER T WITH CARON] + case '\u0166': + // Ŧ [LATIN CAPITAL LETTER T WITH STROKE] + case '\u01AC': + // Ƭ [LATIN CAPITAL LETTER T WITH HOOK] + case '\u01AE': + // Æ® [LATIN CAPITAL LETTER T WITH RETROFLEX HOOK] + case '\u021A': + // Èš [LATIN CAPITAL LETTER T WITH COMMA BELOW] + case '\u023E': + // Ⱦ [LATIN CAPITAL LETTER T WITH DIAGONAL STROKE] + case '\u1D1B': + // á´› [LATIN LETTER SMALL CAPITAL T] + case '\u1E6A': + // Ṫ [LATIN CAPITAL LETTER T WITH DOT ABOVE] + case '\u1E6C': + // Ṭ [LATIN CAPITAL LETTER T WITH DOT BELOW] + case '\u1E6E': + // á¹® [LATIN CAPITAL LETTER T WITH LINE BELOW] + case '\u1E70': + // á¹° [LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW] + case '\u24C9': + // Ⓣ [CIRCLED LATIN CAPITAL LETTER T] + case '\uA786': + // Ꞇ[LATIN CAPITAL LETTER INSULAR T] + case '\uFF34': // ï¼´ [FULLWIDTH LATIN CAPITAL LETTER T] + output[outputPos++] = 'T'; + break; + + case '\u0163': + // Å£ [LATIN SMALL LETTER T WITH CEDILLA] + case '\u0165': + // Ã…Â¥ [LATIN SMALL LETTER T WITH CARON] + case '\u0167': + // ŧ [LATIN SMALL LETTER T WITH STROKE] + case '\u01AB': + // Æ« [LATIN SMALL LETTER T WITH PALATAL HOOK] + case '\u01AD': + // Æ [LATIN SMALL LETTER T WITH HOOK] + case '\u021B': + // È› [LATIN SMALL LETTER T WITH COMMA BELOW] + case '\u0236': + // ȶ [LATIN SMALL LETTER T WITH CURL] + case '\u0287': + // ʇ [LATIN SMALL LETTER TURNED T] + case '\u0288': + // ʈ [LATIN SMALL LETTER T WITH RETROFLEX HOOK] + case '\u1D75': + // áµµ [LATIN SMALL LETTER T WITH MIDDLE TILDE] + case '\u1E6B': + // ṫ [LATIN SMALL LETTER T WITH DOT ABOVE] + case '\u1E6D': + // á¹ [LATIN SMALL LETTER T WITH DOT BELOW] + case '\u1E6F': + // ṯ [LATIN SMALL LETTER T WITH LINE BELOW] + case '\u1E71': + // á¹± [LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW] + case '\u1E97': + // ẗ [LATIN SMALL LETTER T WITH DIAERESIS] + case '\u24E3': + // â“£ [CIRCLED LATIN SMALL LETTER T] + case '\u2C66': + // ⱦ [LATIN SMALL LETTER T WITH DIAGONAL STROKE] + case '\uFF54': // �? 
[FULLWIDTH LATIN SMALL LETTER T] + output[outputPos++] = 't'; + break; + + case '\u00DE': + // Þ [LATIN CAPITAL LETTER THORN] + case '\uA766': // � [LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER] + output[outputPos++] = 'T'; + output[outputPos++] = 'H'; + break; + + case '\uA728': // Ꜩ [LATIN CAPITAL LETTER TZ] + output[outputPos++] = 'T'; + output[outputPos++] = 'Z'; + break; + + case '\u24AF': // â’¯ [PARENTHESIZED LATIN SMALL LETTER T] + output[outputPos++] = '('; + output[outputPos++] = 't'; + output[outputPos++] = ')'; + break; + + case '\u02A8': // ʨ [LATIN SMALL LETTER TC DIGRAPH WITH CURL] + output[outputPos++] = 't'; + output[outputPos++] = 'c'; + break; + + case '\u00FE': + // þ [LATIN SMALL LETTER THORN] + case '\u1D7A': + // ᵺ [LATIN SMALL LETTER TH WITH STRIKETHROUGH] + case '\uA767': // � [LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER] + output[outputPos++] = 't'; + output[outputPos++] = 'h'; + break; + + case '\u02A6': // ʦ [LATIN SMALL LETTER TS DIGRAPH] + output[outputPos++] = 't'; + output[outputPos++] = 's'; + break; + + case '\uA729': // ꜩ [LATIN SMALL LETTER TZ] + output[outputPos++] = 't'; + output[outputPos++] = 'z'; + break; + + case '\u00D9': + // Ù [LATIN CAPITAL LETTER U WITH GRAVE] + case '\u00DA': + // Ú [LATIN CAPITAL LETTER U WITH ACUTE] + case '\u00DB': + // Û [LATIN CAPITAL LETTER U WITH CIRCUMFLEX] + case '\u00DC': + // Ãœ [LATIN CAPITAL LETTER U WITH DIAERESIS] + case '\u0168': + // Ũ [LATIN CAPITAL LETTER U WITH TILDE] + case '\u016A': + // Ū [LATIN CAPITAL LETTER U WITH MACRON] + case '\u016C': + // Ŭ [LATIN CAPITAL LETTER U WITH BREVE] + case '\u016E': + // Å® [LATIN CAPITAL LETTER U WITH RING ABOVE] + case '\u0170': + // Å° [LATIN CAPITAL LETTER U WITH DOUBLE ACUTE] + case '\u0172': + // Ų [LATIN CAPITAL LETTER U WITH OGONEK] + case '\u01AF': + // Ư [LATIN CAPITAL LETTER U WITH HORN] + case '\u01D3': + // Ç“ [LATIN CAPITAL LETTER U WITH CARON] + case '\u01D5': + // Ç• [LATIN CAPITAL LETTER U WITH DIAERESIS 
AND MACRON] + case '\u01D7': + // Ç— [LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE] + case '\u01D9': + // Ç™ [LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON] + case '\u01DB': + // Ç› [LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE] + case '\u0214': + // �? [LATIN CAPITAL LETTER U WITH DOUBLE GRAVE] + case '\u0216': + // È– [LATIN CAPITAL LETTER U WITH INVERTED BREVE] + case '\u0244': + // É„ [LATIN CAPITAL LETTER U BAR] + case '\u1D1C': + // á´œ [LATIN LETTER SMALL CAPITAL U] + case '\u1D7E': + // áµ¾ [LATIN SMALL CAPITAL LETTER U WITH STROKE] + case '\u1E72': + // á¹² [LATIN CAPITAL LETTER U WITH DIAERESIS BELOW] + case '\u1E74': + // á¹´ [LATIN CAPITAL LETTER U WITH TILDE BELOW] + case '\u1E76': + // Ṷ [LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW] + case '\u1E78': + // Ṹ [LATIN CAPITAL LETTER U WITH TILDE AND ACUTE] + case '\u1E7A': + // Ṻ [LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS] + case '\u1EE4': + // Ụ [LATIN CAPITAL LETTER U WITH DOT BELOW] + case '\u1EE6': + // Ủ [LATIN CAPITAL LETTER U WITH HOOK ABOVE] + case '\u1EE8': + // Ứ [LATIN CAPITAL LETTER U WITH HORN AND ACUTE] + case '\u1EEA': + // Ừ [LATIN CAPITAL LETTER U WITH HORN AND GRAVE] + case '\u1EEC': + // Ử [LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE] + case '\u1EEE': + // á»® [LATIN CAPITAL LETTER U WITH HORN AND TILDE] + case '\u1EF0': + // á»° [LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW] + case '\u24CA': + // â“Š[CIRCLED LATIN CAPITAL LETTER U] + case '\uFF35': // ï¼µ [FULLWIDTH LATIN CAPITAL LETTER U] + output[outputPos++] = 'U'; + break; + + case '\u00F9': + // ù [LATIN SMALL LETTER U WITH GRAVE] + case '\u00FA': + // ú [LATIN SMALL LETTER U WITH ACUTE] + case '\u00FB': + // û [LATIN SMALL LETTER U WITH CIRCUMFLEX] + case '\u00FC': + // ü [LATIN SMALL LETTER U WITH DIAERESIS] + case '\u0169': + // Å© [LATIN SMALL LETTER U WITH TILDE] + case '\u016B': + // Å« [LATIN SMALL LETTER U WITH MACRON] + case '\u016D': + // Ã…Â [LATIN SMALL LETTER U WITH BREVE] + case '\u016F': + // ů 
[LATIN SMALL LETTER U WITH RING ABOVE] + case '\u0171': + // ű [LATIN SMALL LETTER U WITH DOUBLE ACUTE] + case '\u0173': + // ų [LATIN SMALL LETTER U WITH OGONEK] + case '\u01B0': + // Æ° [LATIN SMALL LETTER U WITH HORN] + case '\u01D4': + // �? [LATIN SMALL LETTER U WITH CARON] + case '\u01D6': + // Ç– [LATIN SMALL LETTER U WITH DIAERESIS AND MACRON] + case '\u01D8': + // ǘ [LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE] + case '\u01DA': + // Çš [LATIN SMALL LETTER U WITH DIAERESIS AND CARON] + case '\u01DC': + // Çœ [LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE] + case '\u0215': + // È• [LATIN SMALL LETTER U WITH DOUBLE GRAVE] + case '\u0217': + // È— [LATIN SMALL LETTER U WITH INVERTED BREVE] + case '\u0289': + // ʉ [LATIN SMALL LETTER U BAR] + case '\u1D64': + // ᵤ [LATIN SUBSCRIPT SMALL LETTER U] + case '\u1D99': + // ᶙ [LATIN SMALL LETTER U WITH RETROFLEX HOOK] + case '\u1E73': + // á¹³ [LATIN SMALL LETTER U WITH DIAERESIS BELOW] + case '\u1E75': + // á¹µ [LATIN SMALL LETTER U WITH TILDE BELOW] + case '\u1E77': + // á¹· [LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW] + case '\u1E79': + // á¹¹ [LATIN SMALL LETTER U WITH TILDE AND ACUTE] + case '\u1E7B': + // á¹» [LATIN SMALL LETTER U WITH MACRON AND DIAERESIS] + case '\u1EE5': + // ụ [LATIN SMALL LETTER U WITH DOT BELOW] + case '\u1EE7': + // ủ [LATIN SMALL LETTER U WITH HOOK ABOVE] + case '\u1EE9': + // ứ [LATIN SMALL LETTER U WITH HORN AND ACUTE] + case '\u1EEB': + // ừ [LATIN SMALL LETTER U WITH HORN AND GRAVE] + case '\u1EED': + // á» [LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE] + case '\u1EEF': + // ữ [LATIN SMALL LETTER U WITH HORN AND TILDE] + case '\u1EF1': + // á»± [LATIN SMALL LETTER U WITH HORN AND DOT BELOW] + case '\u24E4': + // ⓤ [CIRCLED LATIN SMALL LETTER U] + case '\uFF55': // u [FULLWIDTH LATIN SMALL LETTER U] + output[outputPos++] = 'u'; + break; + + case '\u24B0': // â’° [PARENTHESIZED LATIN SMALL LETTER U] + output[outputPos++] = '('; + output[outputPos++] = 'u'; + output[outputPos++] = 
')'; + break; + + case '\u1D6B': // ᵫ [LATIN SMALL LETTER UE] + output[outputPos++] = 'u'; + output[outputPos++] = 'e'; + break; + + case '\u01B2': + // Ʋ [LATIN CAPITAL LETTER V WITH HOOK] + case '\u0245': + // É… [LATIN CAPITAL LETTER TURNED V] + case '\u1D20': + // á´ [LATIN LETTER SMALL CAPITAL V] + case '\u1E7C': + // á¹¼ [LATIN CAPITAL LETTER V WITH TILDE] + case '\u1E7E': + // á¹¾ [LATIN CAPITAL LETTER V WITH DOT BELOW] + case '\u1EFC': + // Ỽ [LATIN CAPITAL LETTER MIDDLE-WELSH V] + case '\u24CB': + // â“‹ [CIRCLED LATIN CAPITAL LETTER V] + case '\uA75E': + // � [LATIN CAPITAL LETTER V WITH DIAGONAL STROKE] + case '\uA768': + // � [LATIN CAPITAL LETTER VEND] + case '\uFF36': // V [FULLWIDTH LATIN CAPITAL LETTER V] + output[outputPos++] = 'V'; + break; + + case '\u028B': + // Ê‹ [LATIN SMALL LETTER V WITH HOOK] + case '\u028C': + // ÊŒ [LATIN SMALL LETTER TURNED V] + case '\u1D65': + // áµ¥ [LATIN SUBSCRIPT SMALL LETTER V] + case '\u1D8C': + // ᶌ [LATIN SMALL LETTER V WITH PALATAL HOOK] + case '\u1E7D': + // á¹½ [LATIN SMALL LETTER V WITH TILDE] + case '\u1E7F': + // ṿ [LATIN SMALL LETTER V WITH DOT BELOW] + case '\u24E5': + // â“¥ [CIRCLED LATIN SMALL LETTER V] + case '\u2C71': + // â±± [LATIN SMALL LETTER V WITH RIGHT HOOK] + case '\u2C74': + // â±´ [LATIN SMALL LETTER V WITH CURL] + case '\uA75F': + // � [LATIN SMALL LETTER V WITH DIAGONAL STROKE] + case '\uFF56': // ï½– [FULLWIDTH LATIN SMALL LETTER V] + output[outputPos++] = 'v'; + break; + + case '\uA760': // �[LATIN CAPITAL LETTER VY] + output[outputPos++] = 'V'; + output[outputPos++] = 'Y'; + break; + + case '\u24B1': // â’± [PARENTHESIZED LATIN SMALL LETTER V] + output[outputPos++] = '('; + output[outputPos++] = 'v'; + output[outputPos++] = ')'; + break; + + case '\uA761': // � [LATIN SMALL LETTER VY] + output[outputPos++] = 'v'; + output[outputPos++] = 'y'; + break; + + case '\u0174': + // Å´ [LATIN CAPITAL LETTER W WITH CIRCUMFLEX] + case '\u01F7': + // Ç· http://en.wikipedia.org/wiki/Wynn [LATIN 
CAPITAL LETTER WYNN] + case '\u1D21': + // á´¡ [LATIN LETTER SMALL CAPITAL W] + case '\u1E80': + // Ẁ [LATIN CAPITAL LETTER W WITH GRAVE] + case '\u1E82': + // Ẃ [LATIN CAPITAL LETTER W WITH ACUTE] + case '\u1E84': + // Ẅ [LATIN CAPITAL LETTER W WITH DIAERESIS] + case '\u1E86': + // Ẇ[LATIN CAPITAL LETTER W WITH DOT ABOVE] + case '\u1E88': + // Ẉ [LATIN CAPITAL LETTER W WITH DOT BELOW] + case '\u24CC': + // â“Œ [CIRCLED LATIN CAPITAL LETTER W] + case '\u2C72': + // â±² [LATIN CAPITAL LETTER W WITH HOOK] + case '\uFF37': // ï¼· [FULLWIDTH LATIN CAPITAL LETTER W] + output[outputPos++] = 'W'; + break; + + case '\u0175': + // ŵ [LATIN SMALL LETTER W WITH CIRCUMFLEX] + case '\u01BF': + // Æ¿ http://en.wikipedia.org/wiki/Wynn [LATIN LETTER WYNN] + case '\u028D': + // � [LATIN SMALL LETTER TURNED W] + case '\u1E81': + // � [LATIN SMALL LETTER W WITH GRAVE] + case '\u1E83': + // ẃ [LATIN SMALL LETTER W WITH ACUTE] + case '\u1E85': + // ẅ [LATIN SMALL LETTER W WITH DIAERESIS] + case '\u1E87': + // ẇ [LATIN SMALL LETTER W WITH DOT ABOVE] + case '\u1E89': + // ẉ [LATIN SMALL LETTER W WITH DOT BELOW] + case '\u1E98': + // ẘ [LATIN SMALL LETTER W WITH RING ABOVE] + case '\u24E6': + // ⓦ [CIRCLED LATIN SMALL LETTER W] + case '\u2C73': + // â±³ [LATIN SMALL LETTER W WITH HOOK] + case '\uFF57': // ï½— [FULLWIDTH LATIN SMALL LETTER W] + output[outputPos++] = 'w'; + break; + + case '\u24B2': // â’² [PARENTHESIZED LATIN SMALL LETTER W] + output[outputPos++] = '('; + output[outputPos++] = 'w'; + output[outputPos++] = ')'; + break; + + case '\u1E8A': + // Ẋ[LATIN CAPITAL LETTER X WITH DOT ABOVE] + case '\u1E8C': + // Ẍ [LATIN CAPITAL LETTER X WITH DIAERESIS] + case '\u24CD': + // � [CIRCLED LATIN CAPITAL LETTER X] + case '\uFF38': // X [FULLWIDTH LATIN CAPITAL LETTER X] + output[outputPos++] = 'X'; + break; + + case '\u1D8D': + // � [LATIN SMALL LETTER X WITH PALATAL HOOK] + case '\u1E8B': + // ẋ [LATIN SMALL LETTER X WITH DOT ABOVE] + case '\u1E8D': + // � [LATIN SMALL LETTER X WITH 
DIAERESIS] + case '\u2093': + // â‚“ [LATIN SUBSCRIPT SMALL LETTER X] + case '\u24E7': + // ⓧ [CIRCLED LATIN SMALL LETTER X] + case '\uFF58': // x [FULLWIDTH LATIN SMALL LETTER X] + output[outputPos++] = 'x'; + break; + + case '\u24B3': // â’³ [PARENTHESIZED LATIN SMALL LETTER X] + output[outputPos++] = '('; + output[outputPos++] = 'x'; + output[outputPos++] = ')'; + break; + + case '\u00DD': + // � [LATIN CAPITAL LETTER Y WITH ACUTE] + case '\u0176': + // Ŷ [LATIN CAPITAL LETTER Y WITH CIRCUMFLEX] + case '\u0178': + // Ÿ [LATIN CAPITAL LETTER Y WITH DIAERESIS] + case '\u01B3': + // Ƴ [LATIN CAPITAL LETTER Y WITH HOOK] + case '\u0232': + // Ȳ [LATIN CAPITAL LETTER Y WITH MACRON] + case '\u024E': + // ÉŽ [LATIN CAPITAL LETTER Y WITH STROKE] + case '\u028F': + // � [LATIN LETTER SMALL CAPITAL Y] + case '\u1E8E': + // Ẏ [LATIN CAPITAL LETTER Y WITH DOT ABOVE] + case '\u1EF2': + // Ỳ [LATIN CAPITAL LETTER Y WITH GRAVE] + case '\u1EF4': + // á»´ [LATIN CAPITAL LETTER Y WITH DOT BELOW] + case '\u1EF6': + // Ỷ [LATIN CAPITAL LETTER Y WITH HOOK ABOVE] + case '\u1EF8': + // Ỹ [LATIN CAPITAL LETTER Y WITH TILDE] + case '\u1EFE': + // Ỿ [LATIN CAPITAL LETTER Y WITH LOOP] + case '\u24CE': + // â“Ž [CIRCLED LATIN CAPITAL LETTER Y] + case '\uFF39': // ï¼¹ [FULLWIDTH LATIN CAPITAL LETTER Y] + output[outputPos++] = 'Y'; + break; + + case '\u00FD': + // ý [LATIN SMALL LETTER Y WITH ACUTE] + case '\u00FF': + // ÿ [LATIN SMALL LETTER Y WITH DIAERESIS] + case '\u0177': + // Å· [LATIN SMALL LETTER Y WITH CIRCUMFLEX] + case '\u01B4': + // Æ´ [LATIN SMALL LETTER Y WITH HOOK] + case '\u0233': + // ȳ [LATIN SMALL LETTER Y WITH MACRON] + case '\u024F': + // � [LATIN SMALL LETTER Y WITH STROKE] + case '\u028E': + // ÊŽ [LATIN SMALL LETTER TURNED Y] + case '\u1E8F': + // � [LATIN SMALL LETTER Y WITH DOT ABOVE] + case '\u1E99': + // ẙ [LATIN SMALL LETTER Y WITH RING ABOVE] + case '\u1EF3': + // ỳ [LATIN SMALL LETTER Y WITH GRAVE] + case '\u1EF5': + // ỵ [LATIN SMALL LETTER Y WITH DOT BELOW] + 
case '\u1EF7': + // á»· [LATIN SMALL LETTER Y WITH HOOK ABOVE] + case '\u1EF9': + // ỹ [LATIN SMALL LETTER Y WITH TILDE] + case '\u1EFF': + // ỿ [LATIN SMALL LETTER Y WITH LOOP] + case '\u24E8': + // ⓨ [CIRCLED LATIN SMALL LETTER Y] + case '\uFF59': // ï½™ [FULLWIDTH LATIN SMALL LETTER Y] + output[outputPos++] = 'y'; + break; + + case '\u24B4': // â’´ [PARENTHESIZED LATIN SMALL LETTER Y] + output[outputPos++] = '('; + output[outputPos++] = 'y'; + output[outputPos++] = ')'; + break; + + case '\u0179': + // Ź [LATIN CAPITAL LETTER Z WITH ACUTE] + case '\u017B': + // Å» [LATIN CAPITAL LETTER Z WITH DOT ABOVE] + case '\u017D': + // Ž [LATIN CAPITAL LETTER Z WITH CARON] + case '\u01B5': + // Ƶ [LATIN CAPITAL LETTER Z WITH STROKE] + case '\u021C': + // Èœ http://en.wikipedia.org/wiki/Yogh [LATIN CAPITAL LETTER YOGH] + case '\u0224': + // Ȥ [LATIN CAPITAL LETTER Z WITH HOOK] + case '\u1D22': + // á´¢ [LATIN LETTER SMALL CAPITAL Z] + case '\u1E90': + // � [LATIN CAPITAL LETTER Z WITH CIRCUMFLEX] + case '\u1E92': + // Ẓ [LATIN CAPITAL LETTER Z WITH DOT BELOW] + case '\u1E94': + // �? 
[LATIN CAPITAL LETTER Z WITH LINE BELOW] + case '\u24CF': + // � [CIRCLED LATIN CAPITAL LETTER Z] + case '\u2C6B': + // Ⱬ [LATIN CAPITAL LETTER Z WITH DESCENDER] + case '\uA762': + // � [LATIN CAPITAL LETTER VISIGOTHIC Z] + case '\uFF3A': // Z [FULLWIDTH LATIN CAPITAL LETTER Z] + output[outputPos++] = 'Z'; + break; + + case '\u017A': + // ź [LATIN SMALL LETTER Z WITH ACUTE] + case '\u017C': + // ż [LATIN SMALL LETTER Z WITH DOT ABOVE] + case '\u017E': + // ž [LATIN SMALL LETTER Z WITH CARON] + case '\u01B6': + // ƶ [LATIN SMALL LETTER Z WITH STROKE] + case '\u021D': + // � http://en.wikipedia.org/wiki/Yogh [LATIN SMALL LETTER YOGH] + case '\u0225': + // È¥ [LATIN SMALL LETTER Z WITH HOOK] + case '\u0240': + // É€ [LATIN SMALL LETTER Z WITH SWASH TAIL] + case '\u0290': + // � [LATIN SMALL LETTER Z WITH RETROFLEX HOOK] + case '\u0291': + // Ê‘ [LATIN SMALL LETTER Z WITH CURL] + case '\u1D76': + // ᵶ [LATIN SMALL LETTER Z WITH MIDDLE TILDE] + case '\u1D8E': + // ᶎ [LATIN SMALL LETTER Z WITH PALATAL HOOK] + case '\u1E91': + // ẑ [LATIN SMALL LETTER Z WITH CIRCUMFLEX] + case '\u1E93': + // ẓ [LATIN SMALL LETTER Z WITH DOT BELOW] + case '\u1E95': + // ẕ [LATIN SMALL LETTER Z WITH LINE BELOW] + case '\u24E9': + // â“© [CIRCLED LATIN SMALL LETTER Z] + case '\u2C6C': + // ⱬ [LATIN SMALL LETTER Z WITH DESCENDER] + case '\uA763': + // � [LATIN SMALL LETTER VISIGOTHIC Z] + case '\uFF5A': // z [FULLWIDTH LATIN SMALL LETTER Z] + output[outputPos++] = 'z'; + break; + + case '\u24B5': // â’µ [PARENTHESIZED LATIN SMALL LETTER Z] + output[outputPos++] = '('; + output[outputPos++] = 'z'; + output[outputPos++] = ')'; + break; + + case '\u2070': + // � [SUPERSCRIPT ZERO] + case '\u2080': + // â‚€ [SUBSCRIPT ZERO] + case '\u24EA': + // ⓪ [CIRCLED DIGIT ZERO] + case '\u24FF': + // â“¿ [NEGATIVE CIRCLED DIGIT ZERO] + case '\uFF10': // � [FULLWIDTH DIGIT ZERO] + output[outputPos++] = '0'; + break; + + case '\u00B9': + // ¹ [SUPERSCRIPT ONE] + case '\u2081': + // � [SUBSCRIPT ONE] + case 
'\u2460': + // â‘ [CIRCLED DIGIT ONE] + case '\u24F5': + // ⓵ [DOUBLE CIRCLED DIGIT ONE] + case '\u2776': + // � [DINGBAT NEGATIVE CIRCLED DIGIT ONE] + case '\u2780': + // ➀ [DINGBAT CIRCLED SANS-SERIF DIGIT ONE] + case '\u278A': + // ➊[DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE] + case '\uFF11': // 1 [FULLWIDTH DIGIT ONE] + output[outputPos++] = '1'; + break; + + case '\u2488': // â’ˆ [DIGIT ONE FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '.'; + break; + + case '\u2474': // â‘´ [PARENTHESIZED DIGIT ONE] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = ')'; + break; + + case '\u00B2': + // ² [SUPERSCRIPT TWO] + case '\u2082': + // â‚‚ [SUBSCRIPT TWO] + case '\u2461': + // â‘¡ [CIRCLED DIGIT TWO] + case '\u24F6': + // ⓶ [DOUBLE CIRCLED DIGIT TWO] + case '\u2777': + // � [DINGBAT NEGATIVE CIRCLED DIGIT TWO] + case '\u2781': + // � [DINGBAT CIRCLED SANS-SERIF DIGIT TWO] + case '\u278B': + // âž‹ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO] + case '\uFF12': // ï¼’ [FULLWIDTH DIGIT TWO] + output[outputPos++] = '2'; + break; + + case '\u2489': // â’‰ [DIGIT TWO FULL STOP] + output[outputPos++] = '2'; + output[outputPos++] = '.'; + break; + + case '\u2475': // ⑵ [PARENTHESIZED DIGIT TWO] + output[outputPos++] = '('; + output[outputPos++] = '2'; + output[outputPos++] = ')'; + break; + + case '\u00B3': + // ³ [SUPERSCRIPT THREE] + case '\u2083': + // ₃ [SUBSCRIPT THREE] + case '\u2462': + // â‘¢ [CIRCLED DIGIT THREE] + case '\u24F7': + // â“· [DOUBLE CIRCLED DIGIT THREE] + case '\u2778': + // � [DINGBAT NEGATIVE CIRCLED DIGIT THREE] + case '\u2782': + // âž‚ [DINGBAT CIRCLED SANS-SERIF DIGIT THREE] + case '\u278C': + // ➌ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE] + case '\uFF13': // 3 [FULLWIDTH DIGIT THREE] + output[outputPos++] = '3'; + break; + + case '\u248A': // â’Š[DIGIT THREE FULL STOP] + output[outputPos++] = '3'; + output[outputPos++] = '.'; + break; + + case '\u2476': // ⑶ [PARENTHESIZED DIGIT 
THREE] + output[outputPos++] = '('; + output[outputPos++] = '3'; + output[outputPos++] = ')'; + break; + + case '\u2074': + // � [SUPERSCRIPT FOUR] + case '\u2084': + // â‚„ [SUBSCRIPT FOUR] + case '\u2463': + // â‘£ [CIRCLED DIGIT FOUR] + case '\u24F8': + // ⓸ [DOUBLE CIRCLED DIGIT FOUR] + case '\u2779': + // � [DINGBAT NEGATIVE CIRCLED DIGIT FOUR] + case '\u2783': + // ➃ [DINGBAT CIRCLED SANS-SERIF DIGIT FOUR] + case '\u278D': + // � [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR] + case '\uFF14': // �? [FULLWIDTH DIGIT FOUR] + output[outputPos++] = '4'; + break; + + case '\u248B': // â’‹ [DIGIT FOUR FULL STOP] + output[outputPos++] = '4'; + output[outputPos++] = '.'; + break; + + case '\u2477': // â‘· [PARENTHESIZED DIGIT FOUR] + output[outputPos++] = '('; + output[outputPos++] = '4'; + output[outputPos++] = ')'; + break; + + case '\u2075': + // � [SUPERSCRIPT FIVE] + case '\u2085': + // â‚… [SUBSCRIPT FIVE] + case '\u2464': + // ⑤ [CIRCLED DIGIT FIVE] + case '\u24F9': + // ⓹ [DOUBLE CIRCLED DIGIT FIVE] + case '\u277A': + // � [DINGBAT NEGATIVE CIRCLED DIGIT FIVE] + case '\u2784': + // âž„ [DINGBAT CIRCLED SANS-SERIF DIGIT FIVE] + case '\u278E': + // ➎ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE] + case '\uFF15': // 5 [FULLWIDTH DIGIT FIVE] + output[outputPos++] = '5'; + break; + + case '\u248C': // â’Œ [DIGIT FIVE FULL STOP] + output[outputPos++] = '5'; + output[outputPos++] = '.'; + break; + + case '\u2478': // ⑸ [PARENTHESIZED DIGIT FIVE] + output[outputPos++] = '('; + output[outputPos++] = '5'; + output[outputPos++] = ')'; + break; + + case '\u2076': + // � [SUPERSCRIPT SIX] + case '\u2086': + // ₆[SUBSCRIPT SIX] + case '\u2465': + // â‘¥ [CIRCLED DIGIT SIX] + case '\u24FA': + // ⓺ [DOUBLE CIRCLED DIGIT SIX] + case '\u277B': + // � [DINGBAT NEGATIVE CIRCLED DIGIT SIX] + case '\u2785': + // âž… [DINGBAT CIRCLED SANS-SERIF DIGIT SIX] + case '\u278F': + // � [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX] + case '\uFF16': // ï¼– [FULLWIDTH DIGIT SIX] + 
output[outputPos++] = '6'; + break; + + case '\u248D': // â’� [DIGIT SIX FULL STOP] + output[outputPos++] = '6'; + output[outputPos++] = '.'; + break; + + case '\u2479': // ⑹ [PARENTHESIZED DIGIT SIX] + output[outputPos++] = '('; + output[outputPos++] = '6'; + output[outputPos++] = ')'; + break; + + case '\u2077': + // � [SUPERSCRIPT SEVEN] + case '\u2087': + // ₇ [SUBSCRIPT SEVEN] + case '\u2466': + // ⑦ [CIRCLED DIGIT SEVEN] + case '\u24FB': + // â“» [DOUBLE CIRCLED DIGIT SEVEN] + case '\u277C': + // � [DINGBAT NEGATIVE CIRCLED DIGIT SEVEN] + case '\u2786': + // ➆[DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN] + case '\u2790': + // � [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN] + case '\uFF17': // ï¼— [FULLWIDTH DIGIT SEVEN] + output[outputPos++] = '7'; + break; + + case '\u248E': // â’Ž [DIGIT SEVEN FULL STOP] + output[outputPos++] = '7'; + output[outputPos++] = '.'; + break; + + case '\u247A': // ⑺ [PARENTHESIZED DIGIT SEVEN] + output[outputPos++] = '('; + output[outputPos++] = '7'; + output[outputPos++] = ')'; + break; + + case '\u2078': + // � [SUPERSCRIPT EIGHT] + case '\u2088': + // ₈ [SUBSCRIPT EIGHT] + case '\u2467': + // ⑧ [CIRCLED DIGIT EIGHT] + case '\u24FC': + // ⓼ [DOUBLE CIRCLED DIGIT EIGHT] + case '\u277D': + // � [DINGBAT NEGATIVE CIRCLED DIGIT EIGHT] + case '\u2787': + // ➇ [DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT] + case '\u2791': + // âž‘ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT] + case '\uFF18': // 8 [FULLWIDTH DIGIT EIGHT] + output[outputPos++] = '8'; + break; + + case '\u248F': // â’� [DIGIT EIGHT FULL STOP] + output[outputPos++] = '8'; + output[outputPos++] = '.'; + break; + + case '\u247B': // â‘» [PARENTHESIZED DIGIT EIGHT] + output[outputPos++] = '('; + output[outputPos++] = '8'; + output[outputPos++] = ')'; + break; + + case '\u2079': + // � [SUPERSCRIPT NINE] + case '\u2089': + // ₉ [SUBSCRIPT NINE] + case '\u2468': + // ⑨ [CIRCLED DIGIT NINE] + case '\u24FD': + // ⓽ [DOUBLE CIRCLED DIGIT NINE] + case '\u277E': + // � [DINGBAT 
NEGATIVE CIRCLED DIGIT NINE] + case '\u2788': + // ➈ [DINGBAT CIRCLED SANS-SERIF DIGIT NINE] + case '\u2792': + // âž’ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE] + case '\uFF19': // ï¼™ [FULLWIDTH DIGIT NINE] + output[outputPos++] = '9'; + break; + + case '\u2490': // â’� [DIGIT NINE FULL STOP] + output[outputPos++] = '9'; + output[outputPos++] = '.'; + break; + + case '\u247C': // ⑼ [PARENTHESIZED DIGIT NINE] + output[outputPos++] = '('; + output[outputPos++] = '9'; + output[outputPos++] = ')'; + break; + + case '\u2469': + // â‘© [CIRCLED NUMBER TEN] + case '\u24FE': + // ⓾ [DOUBLE CIRCLED NUMBER TEN] + case '\u277F': + // � [DINGBAT NEGATIVE CIRCLED NUMBER TEN] + case '\u2789': + // ➉ [DINGBAT CIRCLED SANS-SERIF NUMBER TEN] + case '\u2793': // âž“ [DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN] + output[outputPos++] = '1'; + output[outputPos++] = '0'; + break; + + case '\u2491': // â’‘ [NUMBER TEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '0'; + output[outputPos++] = '.'; + break; + + case '\u247D': // ⑽ [PARENTHESIZED NUMBER TEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '0'; + output[outputPos++] = ')'; + break; + + case '\u246A': + // ⑪ [CIRCLED NUMBER ELEVEN] + case '\u24EB': // â“« [NEGATIVE CIRCLED NUMBER ELEVEN] + output[outputPos++] = '1'; + output[outputPos++] = '1'; + break; + + case '\u2492': // â’’ [NUMBER ELEVEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '1'; + output[outputPos++] = '.'; + break; + + case '\u247E': // ⑾ [PARENTHESIZED NUMBER ELEVEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '1'; + output[outputPos++] = ')'; + break; + + case '\u246B': + // â‘« [CIRCLED NUMBER TWELVE] + case '\u24EC': // ⓬ [NEGATIVE CIRCLED NUMBER TWELVE] + output[outputPos++] = '1'; + output[outputPos++] = '2'; + break; + + case '\u2493': // â’“ [NUMBER TWELVE FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '2'; + 
output[outputPos++] = '.'; + break; + + case '\u247F': // â‘¿ [PARENTHESIZED NUMBER TWELVE] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '2'; + output[outputPos++] = ')'; + break; + + case '\u246C': + // ⑬ [CIRCLED NUMBER THIRTEEN] + case '\u24ED': // â“ [NEGATIVE CIRCLED NUMBER THIRTEEN] + output[outputPos++] = '1'; + output[outputPos++] = '3'; + break; + + case '\u2494': // â’�? [NUMBER THIRTEEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '3'; + output[outputPos++] = '.'; + break; + + case '\u2480': // â’€ [PARENTHESIZED NUMBER THIRTEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '3'; + output[outputPos++] = ')'; + break; + + case '\u246D': + // â‘ [CIRCLED NUMBER FOURTEEN] + case '\u24EE': // â“® [NEGATIVE CIRCLED NUMBER FOURTEEN] + output[outputPos++] = '1'; + output[outputPos++] = '4'; + break; + + case '\u2495': // â’• [NUMBER FOURTEEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '4'; + output[outputPos++] = '.'; + break; + + case '\u2481': // â’� [PARENTHESIZED NUMBER FOURTEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '4'; + output[outputPos++] = ')'; + break; + + case '\u246E': + // â‘® [CIRCLED NUMBER FIFTEEN] + case '\u24EF': // ⓯ [NEGATIVE CIRCLED NUMBER FIFTEEN] + output[outputPos++] = '1'; + output[outputPos++] = '5'; + break; + + case '\u2496': // â’– [NUMBER FIFTEEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '5'; + output[outputPos++] = '.'; + break; + + case '\u2482': // â’‚ [PARENTHESIZED NUMBER FIFTEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '5'; + output[outputPos++] = ')'; + break; + + case '\u246F': + // ⑯ [CIRCLED NUMBER SIXTEEN] + case '\u24F0': // â“° [NEGATIVE CIRCLED NUMBER SIXTEEN] + output[outputPos++] = '1'; + output[outputPos++] = '6'; + break; + + case '\u2497': // â’— [NUMBER SIXTEEN FULL STOP] + output[outputPos++] = 
'1'; + output[outputPos++] = '6'; + output[outputPos++] = '.'; + break; + + case '\u2483': // â’ƒ [PARENTHESIZED NUMBER SIXTEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '6'; + output[outputPos++] = ')'; + break; + + case '\u2470': + // â‘° [CIRCLED NUMBER SEVENTEEN] + case '\u24F1': // ⓱ [NEGATIVE CIRCLED NUMBER SEVENTEEN] + output[outputPos++] = '1'; + output[outputPos++] = '7'; + break; + + case '\u2498': // â’˜ [NUMBER SEVENTEEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '7'; + output[outputPos++] = '.'; + break; + + case '\u2484': // â’„ [PARENTHESIZED NUMBER SEVENTEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '7'; + output[outputPos++] = ')'; + break; + + case '\u2471': + // ⑱ [CIRCLED NUMBER EIGHTEEN] + case '\u24F2': // ⓲ [NEGATIVE CIRCLED NUMBER EIGHTEEN] + output[outputPos++] = '1'; + output[outputPos++] = '8'; + break; + + case '\u2499': // â’™ [NUMBER EIGHTEEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '8'; + output[outputPos++] = '.'; + break; + + case '\u2485': // â’… [PARENTHESIZED NUMBER EIGHTEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '8'; + output[outputPos++] = ')'; + break; + + case '\u2472': + // ⑲ [CIRCLED NUMBER NINETEEN] + case '\u24F3': // ⓳ [NEGATIVE CIRCLED NUMBER NINETEEN] + output[outputPos++] = '1'; + output[outputPos++] = '9'; + break; + + case '\u249A': // â’š [NUMBER NINETEEN FULL STOP] + output[outputPos++] = '1'; + output[outputPos++] = '9'; + output[outputPos++] = '.'; + break; + + case '\u2486': // â’†[PARENTHESIZED NUMBER NINETEEN] + output[outputPos++] = '('; + output[outputPos++] = '1'; + output[outputPos++] = '9'; + output[outputPos++] = ')'; + break; + + case '\u2473': + // ⑳ [CIRCLED NUMBER TWENTY] + case '\u24F4': // â“´ [NEGATIVE CIRCLED NUMBER TWENTY] + output[outputPos++] = '2'; + output[outputPos++] = '0'; + break; + + case '\u249B': // â’› [NUMBER 
TWENTY FULL STOP] + output[outputPos++] = '2'; + output[outputPos++] = '0'; + output[outputPos++] = '.'; + break; + + case '\u2487': // â’‡ [PARENTHESIZED NUMBER TWENTY] + output[outputPos++] = '('; + output[outputPos++] = '2'; + output[outputPos++] = '0'; + output[outputPos++] = ')'; + break; + + case '\u00AB': + // « [LEFT-POINTING DOUBLE ANGLE QUOTATION MARK] + case '\u00BB': + // » [RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK] + case '\u201C': + // “ [LEFT DOUBLE QUOTATION MARK] + case '\u201D': + // � [RIGHT DOUBLE QUOTATION MARK] + case '\u201E': + // „ [DOUBLE LOW-9 QUOTATION MARK] + case '\u2033': + // ″ [DOUBLE PRIME] + case '\u2036': + // ‶ [REVERSED DOUBLE PRIME] + case '\u275D': + // � [HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT] + case '\u275E': + // � [HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT] + case '\u276E': + // � [HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT] + case '\u276F': + // � [HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT] + case '\uFF02': // " [FULLWIDTH QUOTATION MARK] + output[outputPos++] = '"'; + break; + + case '\u2018': + // ‘ [LEFT SINGLE QUOTATION MARK] + case '\u2019': + // ’ [RIGHT SINGLE QUOTATION MARK] + case '\u201A': + // ‚ [SINGLE LOW-9 QUOTATION MARK] + case '\u201B': + // ‛ [SINGLE HIGH-REVERSED-9 QUOTATION MARK] + case '\u2032': + // ′ [PRIME] + case '\u2035': + // ‵ [REVERSED PRIME] + case '\u2039': + // ‹ [SINGLE LEFT-POINTING ANGLE QUOTATION MARK] + case '\u203A': + // › [SINGLE RIGHT-POINTING ANGLE QUOTATION MARK] + case '\u275B': + // � [HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT] + case '\u275C': + // � [HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT] + case '\uFF07': // ' [FULLWIDTH APOSTROPHE] + output[outputPos++] = '\''; + break; + + case '\u2010': + // � [HYPHEN] + case '\u2011': + // ‑ [NON-BREAKING HYPHEN] + case '\u2012': + // ‒ [FIGURE DASH] + case '\u2013': + // – [EN DASH] + case '\u2014': + // �? 
[EM DASH] + case '\u207B': + // � [SUPERSCRIPT MINUS] + case '\u208B': + // â‚‹ [SUBSCRIPT MINUS] + case '\uFF0D': // � [FULLWIDTH HYPHEN-MINUS] + output[outputPos++] = '-'; + break; + + case '\u2045': + // � [LEFT SQUARE BRACKET WITH QUILL] + case '\u2772': + // � [LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT] + case '\uFF3B': // ï¼» [FULLWIDTH LEFT SQUARE BRACKET] + output[outputPos++] = '['; + break; + + case '\u2046': + // �[RIGHT SQUARE BRACKET WITH QUILL] + case '\u2773': + // � [LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT] + case '\uFF3D': // ï¼½ [FULLWIDTH RIGHT SQUARE BRACKET] + output[outputPos++] = ']'; + break; + + case '\u207D': + // � [SUPERSCRIPT LEFT PARENTHESIS] + case '\u208D': + // � [SUBSCRIPT LEFT PARENTHESIS] + case '\u2768': + // � [MEDIUM LEFT PARENTHESIS ORNAMENT] + case '\u276A': + // � [MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT] + case '\uFF08': // ( [FULLWIDTH LEFT PARENTHESIS] + output[outputPos++] = '('; + break; + + case '\u2E28': // ⸨ [LEFT DOUBLE PARENTHESIS] + output[outputPos++] = '('; + output[outputPos++] = '('; + break; + + case '\u207E': + // � [SUPERSCRIPT RIGHT PARENTHESIS] + case '\u208E': + // â‚Ž [SUBSCRIPT RIGHT PARENTHESIS] + case '\u2769': + // � [MEDIUM RIGHT PARENTHESIS ORNAMENT] + case '\u276B': + // � [MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT] + case '\uFF09': // ) [FULLWIDTH RIGHT PARENTHESIS] + output[outputPos++] = ')'; + break; + + case '\u2E29': // ⸩ [RIGHT DOUBLE PARENTHESIS] + output[outputPos++] = ')'; + output[outputPos++] = ')'; + break; + + case '\u276C': + // � [MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT] + case '\u2770': + // � [HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT] + case '\uFF1C': // < [FULLWIDTH LESS-THAN SIGN] + output[outputPos++] = '<'; + break; + + case '\u276D': + // �[MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT] + case '\u2771': + // � [HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT] + case '\uFF1E': // > [FULLWIDTH GREATER-THAN SIGN] + output[outputPos++] = '>'; + break; + + case 
'\u2774': + // � [MEDIUM LEFT CURLY BRACKET ORNAMENT] + case '\uFF5B': // ï½› [FULLWIDTH LEFT CURLY BRACKET] + output[outputPos++] = '{'; + break; + + case '\u2775': + // � [MEDIUM RIGHT CURLY BRACKET ORNAMENT] + case '\uFF5D': // � [FULLWIDTH RIGHT CURLY BRACKET] + output[outputPos++] = '}'; + break; + + case '\u207A': + // � [SUPERSCRIPT PLUS SIGN] + case '\u208A': + // â‚Š[SUBSCRIPT PLUS SIGN] + case '\uFF0B': // + [FULLWIDTH PLUS SIGN] + output[outputPos++] = '+'; + break; + + case '\u207C': + // � [SUPERSCRIPT EQUALS SIGN] + case '\u208C': + // â‚Œ [SUBSCRIPT EQUALS SIGN] + case '\uFF1D': // � [FULLWIDTH EQUALS SIGN] + output[outputPos++] = '='; + break; + + case '\uFF01': // � [FULLWIDTH EXCLAMATION MARK] + output[outputPos++] = '!'; + break; + + case '\u203C': // ‼ [DOUBLE EXCLAMATION MARK] + output[outputPos++] = '!'; + output[outputPos++] = '!'; + break; + + case '\u2049': // � [EXCLAMATION QUESTION MARK] + output[outputPos++] = '!'; + output[outputPos++] = '?'; + break; + + case '\uFF03': // # [FULLWIDTH NUMBER SIGN] + output[outputPos++] = '#'; + break; + + case '\uFF04': // $ [FULLWIDTH DOLLAR SIGN] + output[outputPos++] = '$'; + break; + + case '\u2052': + // � [COMMERCIAL MINUS SIGN] + case '\uFF05': // ï¼… [FULLWIDTH PERCENT SIGN] + output[outputPos++] = '%'; + break; + + case '\uFF06': // &[FULLWIDTH AMPERSAND] + output[outputPos++] = '&'; + break; + + case '\u204E': + // � [LOW ASTERISK] + case '\uFF0A': // *[FULLWIDTH ASTERISK] + output[outputPos++] = '*'; + break; + + case '\uFF0C': // , [FULLWIDTH COMMA] + output[outputPos++] = ','; + break; + + case '\uFF0E': // . 
[FULLWIDTH FULL STOP] + output[outputPos++] = '.'; + break; + + case '\u2044': + // � [FRACTION SLASH] + case '\uFF0F': // � [FULLWIDTH SOLIDUS] + output[outputPos++] = '/'; + break; + + case '\uFF1A': // : [FULLWIDTH COLON] + output[outputPos++] = ':'; + break; + + case '\u204F': + // � [REVERSED SEMICOLON] + case '\uFF1B': // ï¼› [FULLWIDTH SEMICOLON] + output[outputPos++] = ';'; + break; + + case '\uFF1F': // ? [FULLWIDTH QUESTION MARK] + output[outputPos++] = '?'; + break; + + case '\u2047': // � [DOUBLE QUESTION MARK] + output[outputPos++] = '?'; + output[outputPos++] = '?'; + break; + + case '\u2048': // � [QUESTION EXCLAMATION MARK] + output[outputPos++] = '?'; + output[outputPos++] = '!'; + break; + + case '\uFF20': // ï¼ [FULLWIDTH COMMERCIAL AT] + output[outputPos++] = '@'; + break; + + case '\uFF3C': // ï¼¼ [FULLWIDTH REVERSE SOLIDUS] + output[outputPos++] = '\\'; + break; + + case '\u2038': + // ‸ [CARET] + case '\uFF3E': // ï¼¾ [FULLWIDTH CIRCUMFLEX ACCENT] + output[outputPos++] = '^'; + break; + + case '\uFF3F': // _ [FULLWIDTH LOW LINE] + output[outputPos++] = '_'; + break; + + case '\u2053': + // � [SWUNG DASH] + case '\uFF5E': // ~ [FULLWIDTH TILDE] + output[outputPos++] = '~'; + break; + + default: + output[outputPos++] = c; + break; + + } + } + } + } + } +}
/// <summary>An Analyzer builds TokenStreams, which analyze text. It thus represents a
/// policy for extracting index terms from text.
/// <p/>
/// Typical implementations first build a Tokenizer, which breaks the stream of
/// characters from the Reader into raw Tokens. One or more TokenFilters may
/// then be applied to the output of the Tokenizer.
/// </summary>
public abstract class Analyzer : IDisposable
{
    /// <summary>Creates a TokenStream which tokenizes all the text in the provided
    /// Reader. Must be able to handle null field name for
    /// backward compatibility.
    /// </summary>
    public abstract TokenStream TokenStream(String fieldName, System.IO.TextReader reader);

    /// <summary>Creates a TokenStream that is allowed to be re-used
    /// from the previous time that the same thread called
    /// this method. Callers that do not need to use more
    /// than one TokenStream at the same time from this
    /// analyzer should use this method for better
    /// performance. The default implementation simply delegates to
    /// <see cref="TokenStream(String,System.IO.TextReader)"/>.
    /// </summary>
    public virtual TokenStream ReusableTokenStream(String fieldName, System.IO.TextReader reader)
    {
        return TokenStream(fieldName, reader);
    }

    // Per-thread storage for a previously built TokenStream; set to null once
    // the Analyzer is closed, which is how PreviousTokenStream detects misuse.
    private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
    private bool isDisposed;

    /// <summary>Used by Analyzers that implement reusableTokenStream
    /// to retrieve previously saved TokenStreams for re-use
    /// by the same thread.
    /// </summary>
    /// <exception cref="AlreadyClosedException">if this Analyzer has been closed</exception>
    protected internal virtual object PreviousTokenStream
    {
        get
        {
            if (tokenStreams == null)
            {
                throw new AlreadyClosedException("this Analyzer is closed");
            }
            return tokenStreams.Get();
        }
        set
        {
            if (tokenStreams == null)
            {
                throw new AlreadyClosedException("this Analyzer is closed");
            }
            tokenStreams.Set(value);
        }
    }

    [Obsolete()]
    protected internal bool overridesTokenStreamMethod = false;

    /// <deprecated> This is only present to preserve
    /// back-compat of classes that subclass a core analyzer
    /// and override tokenStream but not reusableTokenStream
    /// </deprecated>
    /// <summary>
    /// Java uses Class&lt;? extends Analyzer&gt; to constrain <typeparamref name="TClass"/> to
    /// only Types that inherit from Analyzer. C# does not have a generic type class,
    /// ie Type&lt;t&gt;. The method signature stays the same, and an exception may
    /// still be thrown, if the method doesn't exist.
    /// </summary>
    [Obsolete("This is only present to preserve back-compat of classes that subclass a core analyzer and override tokenStream but not reusableTokenStream ")]
    protected internal virtual void SetOverridesTokenStreamMethod<TClass>()
        where TClass : Analyzer
    {
        try
        {
            System.Reflection.MethodInfo m = this.GetType().GetMethod("TokenStream", new[] { typeof(string), typeof(System.IO.TextReader) });
            // BUGFIX: guard against a null MethodInfo so a failed lookup is
            // treated as "not overridden" instead of throwing
            // NullReferenceException on m.DeclaringType.
            overridesTokenStreamMethod = m != null && m.DeclaringType != typeof(TClass);
        }
        catch (MethodAccessException)
        {
            // can't happen, as TClass is constrained to subclasses of Analyzer
            overridesTokenStreamMethod = false;
        }
    }

    /// <summary> Invoked before indexing a Fieldable instance if
    /// terms have already been added to that field. This allows custom
    /// analyzers to place an automatic position increment gap between
    /// Fieldable instances using the same field name. The default value
    /// position increment gap is 0. With a 0 position increment gap and
    /// the typical default token position increment of 1, all terms in a field,
    /// including across Fieldable instances, are in successive positions, allowing
    /// exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
    /// </summary>
    /// <param name="fieldName">Fieldable name being indexed.</param>
    /// <returns> position increment gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" /></returns>
    public virtual int GetPositionIncrementGap(String fieldName)
    {
        return 0;
    }

    /// <summary> Just like <see cref="GetPositionIncrementGap" />, except for
    /// Token offsets instead. By default this returns 1 for
    /// tokenized fields, as if the fields were joined
    /// with an extra space character, and 0 for un-tokenized
    /// fields. This method is only called if the field
    /// produced at least one token for indexing.
    /// </summary>
    /// <param name="field">the field just indexed</param>
    /// <returns> offset gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" /></returns>
    public virtual int GetOffsetGap(IFieldable field)
    {
        return field.IsTokenized ? 1 : 0;
    }

    /// <summary>Frees persistent resources used by this Analyzer </summary>
    public void Close()
    {
        Dispose();
    }

    public virtual void Dispose()
    {
        Dispose(true);
        // Standard dispose pattern (CA1816): suppress finalization so any
        // subclass that adds a finalizer is not finalized after disposal.
        GC.SuppressFinalize(this);
    }

    protected virtual void Dispose(bool disposing)
    {
        if (isDisposed) return;

        if (disposing)
        {
            if (tokenStreams != null)
            {
                tokenStreams.Close();
                tokenStreams = null; // also flips PreviousTokenStream into its "closed" state
            }
        }
        isDisposed = true;
    }
}
/// <summary>
/// Base utility class for implementing a <see cref="CharFilter" />.
/// You subclass this, and then record mappings by calling
/// <see cref="AddOffCorrectMap" />, and then invoke the correct
/// method to correct an offset.
/// </summary>
public abstract class BaseCharFilter : CharFilter
{
    // Parallel arrays: offsets[i] is the offset from which the cumulative
    // difference diffs[i] applies. Both are null until the first mapping is
    // recorded, then grown on demand; 'size' is the number of valid entries.
    private int[] offsets;
    private int[] diffs;
    private int size = 0;

    protected BaseCharFilter(CharStream @in) : base(@in)
    {
    }

    /* Retrieve the corrected offset: maps an offset in the filtered output
     * back to the corresponding offset in the original input by adding the
     * cumulative diff of the last mapping at or before currentOff. */
    //@Override
    protected internal override int Correct(int currentOff)
    {
        // No mappings recorded, or offset precedes the first mapping: unchanged.
        if (offsets == null || currentOff < offsets[0])
        {
            return currentOff;
        }

        // At or past the last mapping: the final cumulative diff applies.
        int hi = size - 1;
        if (currentOff >= offsets[hi])
            return currentOff + diffs[hi];

        // Binary search for an exact offset match.
        int lo = 0;
        int mid = -1;

        while (hi >= lo)
        {
            // URShift gives an overflow-safe unsigned midpoint.
            mid = Number.URShift(lo + hi, 1);
            if (currentOff < offsets[mid])
                hi = mid - 1;
            else if (currentOff > offsets[mid])
                lo = mid + 1;
            else
                return currentOff + diffs[mid];
        }

        // No exact match: 'mid' is the last probe. If currentOff falls before
        // offsets[mid], the previous entry's diff applies (or none at all when
        // mid == 0); otherwise entry 'mid' is the last one at or before it.
        if (currentOff < offsets[mid])
            return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
        return currentOff + diffs[mid];
    }

    /// <summary>The most recently recorded cumulative diff, or 0 when no
    /// mapping has been added yet.</summary>
    protected int LastCumulativeDiff
    {
        get
        {
            return offsets == null ? 0 : diffs[size - 1];
        }
    }

    [Obsolete("Use LastCumulativeDiff property instead")]
    protected int GetLastCumulativeDiff()
    {
        return LastCumulativeDiff;
    }

    /// <summary>Records that from offset <paramref name="off"/> on, corrected
    /// offsets differ from output offsets by <paramref name="cumulativeDiff"/>.
    /// NOTE(review): entries appear to be assumed in increasing offset order —
    /// the binary search in <see cref="Correct"/> relies on it; confirm callers
    /// never add mappings out of order.</summary>
    protected void AddOffCorrectMap(int off, int cumulativeDiff)
    {
        if (offsets == null)
        {
            offsets = new int[64];
            diffs = new int[64];
        }
        else if (size == offsets.Length)
        {
            offsets = ArrayUtil.Grow(offsets);
            diffs = ArrayUtil.Grow(diffs);
        }

        offsets[size] = off;
        diffs[size++] = cumulativeDiff;
    }
}
/// <summary> This class can be used if the token attributes of a TokenStream
/// are intended to be consumed more than once. It caches
/// all token attribute states locally in a list.
///
/// <p/>CachingTokenFilter implements the optional method
/// <see cref="TokenStream.Reset()" />, which repositions the
/// stream to the first Token.
/// </summary>
public sealed class CachingTokenFilter : TokenFilter
{
    // Lazily-built snapshots of every token's attribute state; null until the
    // wrapped stream is first consumed.
    private System.Collections.Generic.List<State> cachedStates = null;
    // Index of the next cached state to replay.
    private int replayPos = 0;
    // State captured after the wrapped stream's End() call.
    private State finalState;

    public CachingTokenFilter(TokenStream input) : base(input)
    {
    }

    public override bool IncrementToken()
    {
        if (cachedStates == null)
        {
            // First consumption: drain the wrapped stream into the cache.
            cachedStates = new System.Collections.Generic.List<State>();
            while (input.IncrementToken())
            {
                cachedStates.Add(CaptureState());
            }
            // capture final state
            input.End();
            finalState = CaptureState();
            replayPos = 0;
        }

        if (replayPos >= cachedStates.Count)
        {
            // the cache is exhausted, return false
            return false;
        }

        // Replay the next immutable snapshot, so the filter can be Reset()
        // and consumed again without re-reading the wrapped stream.
        RestoreState(cachedStates[replayPos]);
        replayPos++;
        return true;
    }

    public override void End()
    {
        if (finalState != null)
        {
            RestoreState(finalState);
        }
    }

    public override void Reset()
    {
        if (cachedStates != null)
        {
            replayPos = 0;
        }
    }
}
/// <summary> A simple class that stores Strings as char[]'s in a
/// hash table. Note that this is not a general purpose
/// class. For example, it cannot remove items from the
/// set, nor does it resize its hash table to be smaller,
/// etc. It is designed to be quick to test if a char[]
/// is in the set without the necessity of converting it
/// to a String first.
/// <p/>
/// <em>Please note:</em> This class implements <see cref="System.Collections.Generic.ISet{T}"/> but
/// does not behave like it should in all cases. The generic type is
/// <see cref="System.Collections.Generic.ICollection{T}"/>, because you can add any object to it,
/// that has a string representation. The add methods will use
/// <see cref="object.ToString()"/> and store the result using a <see cref="char"/>
/// buffer. The same behaviour have the <see cref="Contains(object)"/> methods.
/// The <see cref="GetEnumerator"/> method returns an <see cref="string"/> IEnumerable.
/// For type safety also <see cref="StringEnumerator"/> is provided.
/// </summary>
// TODO: java uses wildcards, .net doesn't have this, easiest way is to
// make the entire class generic. Ultimately, though, since this
// works with strings, I can't think of a reason not to just declare
// this as an ISet<string>.
public class CharArraySet : ISet<string>
{
    bool _ReadOnly = false;
    const int INIT_SIZE = 8;
    // Open-addressed hash table; a null slot means "empty".
    char[][] _Entries;
    int _Count;
    bool _IgnoreCase;
    // BUGFIX: readonly so the shared empty-set singleton cannot be reassigned
    // by external code.
    public static readonly CharArraySet EMPTY_SET = UnmodifiableSet(new CharArraySet(0, false));

    // Sizes the table so the load factor stays below 0.8 for startSize entries.
    private void Init(int startSize, bool ignoreCase)
    {
        this._IgnoreCase = ignoreCase;
        int size = INIT_SIZE;
        while (startSize + (startSize >> 2) > size)
            size <<= 1;
        _Entries = new char[size][];
    }

    /// <summary>Create set with enough capacity to hold startSize terms</summary>
    public CharArraySet(int startSize, bool ignoreCase)
    {
        Init(startSize, ignoreCase);
    }

    public CharArraySet(IEnumerable<string> c, bool ignoreCase)
    {
        Init(c.Count(), ignoreCase);
        AddItems(c);
    }

    /// <summary>Create set from a Collection of char[] or String </summary>
    public CharArraySet(IEnumerable<object> c, bool ignoreCase)
    {
        Init(c.Count(), ignoreCase);
        AddItems(c);
    }

    // Adds each item via its string representation (see class remarks).
    private void AddItems<T>(IEnumerable<T> items)
    {
        foreach (var item in items)
        {
            Add(item.ToString());
        }
    }

    /// <summary>Create set from entries </summary>
    private CharArraySet(char[][] entries, bool ignoreCase, int count)
    {
        this._Entries = entries;
        this._IgnoreCase = ignoreCase;
        this._Count = count;
    }

    /// <summary>true if the <c>len</c> chars of <c>text</c> starting at <c>off</c>
    /// are in the set
    /// </summary>
    public virtual bool Contains(char[] text, int off, int len)
    {
        return _Entries[GetSlot(text, off, len)] != null;
    }

    public virtual bool Contains(string text)
    {
        return _Entries[GetSlot(text)] != null;
    }

    // Finds the slot holding 'text', or the empty slot where it would go,
    // using double hashing for collision resolution.
    private int GetSlot(char[] text, int off, int len)
    {
        int code = GetHashCode(text, off, len);
        int pos = code & (_Entries.Length - 1);
        char[] text2 = _Entries[pos];
        if (text2 != null && !Equals(text, off, len, text2))
        {
            // Secondary probe increment; forced odd so it is coprime with the
            // power-of-two table size and visits every slot.
            int inc = ((code >> 8) + code) | 1;
            do
            {
                code += inc;
                pos = code & (_Entries.Length - 1);
                text2 = _Entries[pos];
            }
            while (text2 != null && !Equals(text, off, len, text2));
        }
        return pos;
    }

    /// <summary>Returns the slot for the String (same probing as the char[] overload)</summary>
    private int GetSlot(string text)
    {
        int code = GetHashCode(text);
        int pos = code & (_Entries.Length - 1);
        char[] text2 = _Entries[pos];
        if (text2 != null && !Equals(text, text2))
        {
            int inc = ((code >> 8) + code) | 1;
            do
            {
                code += inc;
                pos = code & (_Entries.Length - 1);
                text2 = _Entries[pos];
            }
            while (text2 != null && !Equals(text, text2));
        }
        return pos;
    }

    /// <summary>Adds the string; returns false if it was already present.</summary>
    public bool Add(string text)
    {
        if (_ReadOnly) throw new NotSupportedException();
        return Add(text.ToCharArray());
    }

    /// <summary>Add this char[] directly to the set.
    /// If ignoreCase is true for this Set, the text array will be directly modified.
    /// The user should never modify this text array after calling this method.
    /// </summary>
    public bool Add(char[] text)
    {
        if (_ReadOnly) throw new NotSupportedException();

        if (_IgnoreCase)
            for (int i = 0; i < text.Length; i++)
                text[i] = Char.ToLower(text[i]);
        int slot = GetSlot(text, 0, text.Length);
        if (_Entries[slot] != null)
            return false;
        _Entries[slot] = text;
        _Count++;

        // Keep load factor below 0.8.
        if (_Count + (_Count >> 2) > _Entries.Length)
        {
            Rehash();
        }

        return true;
    }

    // Compares a char[] range against a stored entry, honoring _IgnoreCase.
    private bool Equals(char[] text1, int off, int len, char[] text2)
    {
        if (len != text2.Length)
            return false;
        if (_IgnoreCase)
        {
            for (int i = 0; i < len; i++)
            {
                if (char.ToLower(text1[off + i]) != text2[i])
                    return false;
            }
        }
        else
        {
            for (int i = 0; i < len; i++)
            {
                if (text1[off + i] != text2[i])
                    return false;
            }
        }
        return true;
    }

    // Compares a string against a stored entry, honoring _IgnoreCase.
    private bool Equals(string text1, char[] text2)
    {
        int len = text1.Length;
        if (len != text2.Length)
            return false;
        if (_IgnoreCase)
        {
            for (int i = 0; i < len; i++)
            {
                if (char.ToLower(text1[i]) != text2[i])
                    return false;
            }
        }
        else
        {
            for (int i = 0; i < len; i++)
            {
                if (text1[i] != text2[i])
                    return false;
            }
        }
        return true;
    }

    // Doubles the table and reinserts every entry.
    private void Rehash()
    {
        int newSize = 2 * _Entries.Length;
        char[][] oldEntries = _Entries;
        _Entries = new char[newSize][];

        for (int i = 0; i < oldEntries.Length; i++)
        {
            char[] text = oldEntries[i];
            if (text != null)
            {
                // todo: could be faster... no need to compare strings on collision
                _Entries[GetSlot(text, 0, text.Length)] = text;
            }
        }
    }

    // Java-style 31-multiplier hash over the char[] range, honoring _IgnoreCase.
    private int GetHashCode(char[] text, int offset, int len)
    {
        int code = 0;
        int stop = offset + len;
        if (_IgnoreCase)
        {
            for (int i = offset; i < stop; i++)
            {
                code = code * 31 + char.ToLower(text[i]);
            }
        }
        else
        {
            for (int i = offset; i < stop; i++)
            {
                code = code * 31 + text[i];
            }
        }
        return code;
    }

    // Same hash as the char[] overload, computed over a string.
    private int GetHashCode(string text)
    {
        int code = 0;
        int len = text.Length;
        if (_IgnoreCase)
        {
            for (int i = 0; i < len; i++)
            {
                code = code * 31 + char.ToLower(text[i]);
            }
        }
        else
        {
            for (int i = 0; i < len; i++)
            {
                code = code * 31 + text[i];
            }
        }
        return code;
    }

    /// <summary>Number of terms in the set.</summary>
    public int Count
    {
        get { return _Count; }
    }

    public bool IsEmpty
    {
        get { return _Count == 0; }
    }

    /// <summary>Membership test for an arbitrary object: char[] is checked
    /// directly, anything else via its ToString() representation.</summary>
    public bool Contains(object item)
    {
        var text = item as char[];
        return text != null ? Contains(text, 0, text.Length) : Contains(item.ToString());
    }

    public bool Add(object item)
    {
        return Add(item.ToString());
    }

    void ICollection<string>.Add(string item)
    {
        this.Add(item);
    }

    /// <summary>
    /// Returns an unmodifiable <see cref="CharArraySet"/>. This allows to provide
    /// unmodifiable views of internal sets for "read-only" use
    /// </summary>
    /// <param name="set">A Set for which the unmodifiable set it returns.</param>
    /// <returns>A new unmodifiable <see cref="CharArraySet"/></returns>
    /// <throws>ArgumentNullException if the given set is <c>null</c></throws>
    public static CharArraySet UnmodifiableSet(CharArraySet set)
    {
        // BUGFIX: the single-string ArgumentNullException constructor takes the
        // parameter name, not a message; the old code passed the message there.
        if (set == null)
            throw new ArgumentNullException("set", "Given set is null!");
        if (set == EMPTY_SET)
            return EMPTY_SET;
        if (set._ReadOnly)
            return set;

        // Shares the backing table with the source set; only the flag differs.
        var newSet = new CharArraySet(set._Entries, set._IgnoreCase, set.Count) { IsReadOnly = true };
        return newSet;
    }

    /// <summary>
    /// returns a copy of the given set as a <see cref="CharArraySet"/>. If the given set
    /// is a <see cref="CharArraySet"/> the ignoreCase property will be preserved.
    /// </summary>
    /// <param name="set">A set to copy</param>
    /// <returns>a copy of the given set as a <see cref="CharArraySet"/>. If the given set
    /// is a <see cref="CharArraySet"/> the ignoreCase property will be preserved.</returns>
    public static CharArraySet Copy<T>(ISet<T> set)
    {
        if (set == null)
            throw new ArgumentNullException("set", "Given set is null!");
        if (set == EMPTY_SET)
            return EMPTY_SET;
        bool ignoreCase = set is CharArraySet && ((CharArraySet)set)._IgnoreCase;
        var arrSet = new CharArraySet(set.Count, ignoreCase);
        arrSet.AddItems(set);
        return arrSet;
    }

    /// <summary>Not supported: entries cannot be removed from this set.</summary>
    public void Clear()
    {
        // BUGFIX: message used to say "Remove not supported!" for Clear.
        throw new NotSupportedException("Clear is not supported!");
    }

    public bool IsReadOnly
    {
        get { return _ReadOnly; }
        private set { _ReadOnly = value; }
    }

    /// <summary>Adds all of the elements in the specified collection to this collection </summary>
    public void UnionWith(IEnumerable<string> other)
    {
        if (_ReadOnly) throw new NotSupportedException();

        foreach (string s in other)
        {
            Add(s.ToCharArray());
        }
    }

    /// <summary>Wrapper that calls UnionWith</summary>
    public void AddAll(IEnumerable<string> coll)
    {
        UnionWith(coll);
    }

    #region Unneeded methods
    public void RemoveAll(ICollection<string> c)
    {
        throw new NotSupportedException();
    }

    public void RetainAll(ICollection<string> c)
    {
        throw new NotSupportedException();
    }

    void ICollection<string>.CopyTo(string[] array, int arrayIndex)
    {
        throw new NotSupportedException();
    }

    void ISet<string>.IntersectWith(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    void ISet<string>.ExceptWith(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    void ISet<string>.SymmetricExceptWith(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ISet<string>.IsSubsetOf(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ISet<string>.IsSupersetOf(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ISet<string>.IsProperSupersetOf(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ISet<string>.IsProperSubsetOf(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ISet<string>.Overlaps(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ISet<string>.SetEquals(IEnumerable<string> other)
    {
        throw new NotSupportedException();
    }

    bool ICollection<string>.Remove(string item)
    {
        throw new NotSupportedException();
    }
    #endregion

    /// <summary>
    /// The IEnumerator&lt;String&gt; for this set. Strings are constructed on the fly,
    /// so use <c>NextCharArray</c> for more efficient access
    /// </summary>
    public class CharArraySetEnumerator : IEnumerator<string>
    {
        readonly CharArraySet _Creator;
        int pos = -1;
        char[] cur;

        protected internal CharArraySetEnumerator(CharArraySet creator)
        {
            _Creator = creator;
        }

        public bool MoveNext()
        {
            cur = null;
            pos++;
            // Skip empty slots in the open-addressed table.
            while (pos < _Creator._Entries.Length && (cur = _Creator._Entries[pos]) == null)
                pos++;
            return cur != null;
        }

        /// <summary>do not modify the returned char[] </summary>
        public char[] NextCharArray()
        {
            return cur;
        }

        public string Current
        {
            get { return new string(NextCharArray()); }
        }

        public void Dispose()
        {
        }

        object IEnumerator.Current
        {
            get { return new string(NextCharArray()); }
        }

        public void Reset()
        {
            throw new NotImplementedException();
        }
    }

    public IEnumerator<string> StringEnumerator()
    {
        return new CharArraySetEnumerator(this);
    }

    public IEnumerator<string> GetEnumerator()
    {
        return new CharArraySetEnumerator(this);
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
/// <summary> Subclasses of CharFilter can be chained to filter CharStream.
/// They can be used as <see cref="System.IO.TextReader" /> with additional offset
/// correction. <see cref="Tokenizer" />s will automatically use <see cref="CorrectOffset" />
/// if a CharFilter/CharStream subclass is used.
/// </summary>
/// <version> $Id$
/// </version>
public abstract class CharFilter : CharStream
{
    // Stream position recorded by Mark(); -1 means Mark was never called.
    private long currentPosition = -1;
    private bool isDisposed;
    protected internal CharStream input;

    protected internal CharFilter(CharStream in_Renamed) : base(in_Renamed)
    {
        input = in_Renamed;
    }

    /// <summary>Subclass may want to override to correct the current offset.</summary>
    /// <param name="currentOff">current offset</param>
    /// <returns>corrected offset</returns>
    protected internal virtual int Correct(int currentOff)
    {
        return currentOff;
    }

    /// <summary> Chains the corrected offset through the input
    /// CharFilter.
    /// </summary>
    public override int CorrectOffset(int currentOff)
    {
        return input.CorrectOffset(Correct(currentOff));
    }

    protected override void Dispose(bool disposing)
    {
        if (isDisposed) return;

        if (disposing)
        {
            if (input != null)
            {
                input.Close();
            }
        }

        input = null;
        isDisposed = true;
        base.Dispose(disposing);
    }

    public override int Read(System.Char[] cbuf, int off, int len)
    {
        return input.Read(cbuf, off, len);
    }

    /// <summary>True if Mark/Reset can be used, i.e. the underlying stream is seekable.</summary>
    public bool MarkSupported()
    {
        return input.BaseStream.CanSeek;
    }

    /// <summary>Remembers the current stream position so a later <see cref="Reset"/>
    /// returns to it. Mirrors java.io.Reader.mark: <paramref name="readAheadLimit"/>
    /// only bounds how far may be read before the mark expires, which a seekable
    /// stream does not need.
    /// BUGFIX: the previous implementation also seeked the stream to
    /// <paramref name="readAheadLimit"/>, corrupting the read position;
    /// mark must never move the stream.</summary>
    public void Mark(int readAheadLimit)
    {
        currentPosition = input.BaseStream.Position;
    }

    /// <summary>Repositions the stream to the position recorded by the last
    /// <see cref="Mark"/>. As before, calling Reset without a prior Mark seeks
    /// to -1 and fails.</summary>
    public void Reset()
    {
        input.BaseStream.Position = currentPosition;
    }
}
/// <summary> CharReader is a Reader wrapper. It reads chars from
/// Reader and outputs <see cref="CharStream" />, defining an
/// identify function <see cref="CorrectOffset" /> method that
/// simply returns the provided offset.
/// </summary>
public sealed class CharReader : CharStream
{
    // Stream position recorded by Mark(); -1 means Mark was never called.
    private long currentPosition = -1;

    private bool isDisposed;

    internal System.IO.StreamReader input;

    /// <summary>Wraps the given reader as a <see cref="CharStream"/>; if it already
    /// is one it is returned unchanged.</summary>
    public static CharStream Get(System.IO.TextReader input)
    {
        var charStream = input as CharStream;
        if (charStream != null)
            return charStream;

        // {{Aroush-2.9}} isn't there a better (faster) way to do this?
        // NOTE: this buffers the entire remaining input in memory via a UTF-8
        // round trip before wrapping it.
        var theString = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(input.ReadToEnd()));
        return new CharReader(new System.IO.StreamReader(theString));
        //return input is CharStream?(CharStream) input:new CharReader(input);
    }

    private CharReader(System.IO.StreamReader in_Renamed) : base(in_Renamed)
    {
        input = in_Renamed;
    }

    /// <summary>Identity: CharReader performs no offset correction.</summary>
    public override int CorrectOffset(int currentOff)
    {
        return currentOff;
    }

    protected override void Dispose(bool disposing)
    {
        if (isDisposed) return;

        if (disposing)
        {
            if (input != null)
            {
                input.Close();
            }
        }

        input = null;
        isDisposed = true;
        base.Dispose(disposing);
    }

    public override int Read(System.Char[] cbuf, int off, int len)
    {
        return input.Read(cbuf, off, len);
    }

    /// <summary>True if Mark/Reset can be used, i.e. the underlying stream is seekable.</summary>
    public bool MarkSupported()
    {
        return input.BaseStream.CanSeek;
    }

    /// <summary>Remembers the current stream position so a later <see cref="Reset"/>
    /// returns to it. Mirrors java.io.Reader.mark: <paramref name="readAheadLimit"/>
    /// is not needed on a seekable stream.
    /// BUGFIX: the previous implementation also seeked the stream to
    /// <paramref name="readAheadLimit"/>, corrupting the read position;
    /// mark must never move the stream.</summary>
    public void Mark(int readAheadLimit)
    {
        currentPosition = input.BaseStream.Position;
    }

    /// <summary>Repositions the stream to the position recorded by the last
    /// <see cref="Mark"/>. As before, calling Reset without a prior Mark seeks
    /// to -1 and fails.</summary>
    public void Reset()
    {
        input.BaseStream.Position = currentPosition;
    }
}
/// <summary> CharStream adds <see cref="CorrectOffset" />
/// functionality over <see cref="System.IO.TextReader" />. All Tokenizers accept a
/// CharStream instead of <see cref="System.IO.TextReader" /> as input, which enables
/// arbitrary character based filtering before tokenization.
/// The <see cref="CorrectOffset" /> method fixes offsets to account for
/// removal or insertion of characters, so that the offsets
/// reported in the tokens match the character offsets of the
/// original Reader.
/// </summary>
public abstract class CharStream : System.IO.StreamReader
{
    // NOTE(review): constructing from reader.BaseStream bypasses the wrapped
    // StreamReader's own encoding detection and discards any characters it has
    // already buffered — confirm callers always pass a freshly opened reader.
    protected CharStream(System.IO.StreamReader reader) : base(reader.BaseStream)
    {
    }

    /// <summary> Called by CharFilter(s) and Tokenizer to correct token offset.
    /// </summary>
    /// <param name="currentOff">offset as seen in the output
    /// </param>
    /// <returns> corrected offset based on the input
    /// </returns>
    public abstract int CorrectOffset(int currentOff);
}
/// <summary>An abstract base class for simple, character-oriented tokenizers.
/// Subclasses define token boundaries via <see cref="IsTokenChar"/> and may
/// transform characters via <see cref="Normalize"/>.</summary>
public abstract class CharTokenizer : Tokenizer
{
    protected CharTokenizer(System.IO.TextReader input) : base(input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt = AddAttribute<ITermAttribute>();
    }

    protected CharTokenizer(AttributeSource source, System.IO.TextReader input) : base(source, input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt = AddAttribute<ITermAttribute>();
    }

    protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input) : base(factory, input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt = AddAttribute<ITermAttribute>();
    }

    // offset: chars consumed from the reader before the current ioBuffer;
    // bufferIndex: read cursor within ioBuffer; dataLen: valid chars in ioBuffer.
    private int offset = 0, bufferIndex = 0, dataLen = 0;
    // Tokens are broken at this length; the remainder starts a new token.
    private const int MAX_WORD_LEN = 255;
    private const int IO_BUFFER_SIZE = 4096;
    private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;

    /// <summary>Returns true iff a character should be included in a token. This
    /// tokenizer generates as tokens adjacent sequences of characters which
    /// satisfy this predicate. Characters for which this is false are used to
    /// define token boundaries and are not included in tokens.
    /// </summary>
    protected internal abstract bool IsTokenChar(char c);

    /// <summary>Called on each token character to normalize it before it is added to the
    /// token. The default implementation does nothing. Subclasses may use this
    /// to, e.g., lowercase tokens.
    /// </summary>
    protected internal virtual char Normalize(char c)
    {
        return c;
    }

    public override bool IncrementToken()
    {
        ClearAttributes();
        int length = 0;
        int start = bufferIndex;
        char[] buffer = termAtt.TermBuffer();
        while (true)
        {
            // Refill ioBuffer when exhausted; on end-of-input, emit any
            // partially collected token, otherwise signal end of stream.
            if (bufferIndex >= dataLen)
            {
                offset += dataLen;
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                if (dataLen <= 0)
                {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if (length > 0)
                        break;
                    return false;
                }
                bufferIndex = 0;
            }

            char c = ioBuffer[bufferIndex++];

            if (IsTokenChar(c))
            {
                // if it's a token char

                if (length == 0)
                    // start of token: absolute offset of this character
                    start = offset + bufferIndex - 1;
                else if (length == buffer.Length)
                    buffer = termAtt.ResizeTermBuffer(1 + length);

                buffer[length++] = Normalize(c); // buffer it, normalized

                if (length == MAX_WORD_LEN)
                    // buffer overflow!
                    break;
            }
            else if (length > 0)
                // at non-Letter w/ chars
                break; // return 'em
        }

        termAtt.SetTermLength(length);
        // Offsets are corrected so they map back to the original input even
        // when a CharFilter inserted or removed characters.
        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
        return true;
    }

    public override void End()
    {
        // set final offset
        int finalOffset = CorrectOffset(offset);
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    public override void Reset(System.IO.TextReader input)
    {
        // Reset buffer bookkeeping so the tokenizer can be reused on new input.
        base.Reset(input);
        bufferIndex = 0;
        offset = 0;
        dataLen = 0;
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis
{

    /// <summary> A filter that replaces accented characters in the ISO Latin 1 character set
    /// (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
    /// <p/>
    /// For instance, 'À' will be replaced by 'a'.
    /// <p/>
    /// </summary>
    /// <deprecated> If you build a new index, use <see cref="ASCIIFoldingFilter"/>
    /// which covers a superset of Latin 1.
    /// This class is included for use with existing indexes and will be removed
    /// in a future release (possible Lucene 4.0)
    /// </deprecated>
    [Obsolete("If you build a new index, use ASCIIFoldingFilter which covers a superset of Latin 1. This class is included for use with existing indexes and will be removed in a future release (possible Lucene 4.0).")]
    public class ISOLatin1AccentFilter : TokenFilter
    {
        public ISOLatin1AccentFilter(TokenStream input) : base(input)
        {
            termAtt = AddAttribute<ITermAttribute>();
        }

        // Scratch output buffer reused across tokens; grown (doubling) on demand.
        private char[] output = new char[256];
        // Number of valid chars in 'output' after the last RemoveAccents call.
        private int outputPos;
        private readonly ITermAttribute termAtt;

        /// <summary>
        /// Advances the wrapped stream; if the term contains any char in the folding
        /// range (U+00C0..U+FB06), rewrites the term with accents removed.
        /// </summary>
        public override bool IncrementToken()
        {
            if (input.IncrementToken())
            {
                char[] buffer = termAtt.TermBuffer();
                int length = termAtt.TermLength();
                // If no characters actually require rewriting then we
                // just return token as-is:
                for (int i = 0; i < length; i++)
                {
                    char c = buffer[i];
                    if (c >= '\u00c0' && c <= '\uFB06')
                    {
                        RemoveAccents(buffer, length);
                        termAtt.SetTermBuffer(output, 0, outputPos);
                        break;
                    }
                }
                return true;
            }
            return false;
        }

        /// <summary> To replace accented characters in a String by unaccented equivalents.
        /// Result is written into the shared 'output' buffer; 'outputPos' holds its length.
        /// </summary>
        public void RemoveAccents(char[] input, int length)
        {

            // Worst-case length required: every char may expand to two (e.g. Æ -> AE).
            int maxSizeNeeded = 2 * length;

            int size = output.Length;
            while (size < maxSizeNeeded)
                size *= 2;

            if (size != output.Length)
                output = new char[size];

            outputPos = 0;

            int pos = 0;

            for (int i = 0; i < length; i++, pos++)
            {
                char c = input[pos];

                // Quick test: if it's not in range then just keep
                // current character
                if (c < '\u00c0' || c > '\uFB06')
                    output[outputPos++] = c;
                else
                {
                    switch (c)
                    {

                        case '\u00C0':
                        // A-grave
                        case '\u00C1':
                        // A-acute
                        case '\u00C2':
                        // A-circumflex
                        case '\u00C3':
                        // A-tilde
                        case '\u00C4':
                        // A-diaeresis
                        case '\u00C5': // A-ring
                            output[outputPos++] = 'A';
                            break;

                        case '\u00C6': // AE ligature
                            output[outputPos++] = 'A';
                            output[outputPos++] = 'E';
                            break;

                        case '\u00C7': // C-cedilla
                            output[outputPos++] = 'C';
                            break;

                        case '\u00C8':
                        // E-grave
                        case '\u00C9':
                        // E-acute
                        case '\u00CA':
                        // E-circumflex
                        case '\u00CB': // E-diaeresis
                            output[outputPos++] = 'E';
                            break;

                        case '\u00CC':
                        // I-grave
                        case '\u00CD':
                        // I-acute
                        case '\u00CE':
                        // I-circumflex
                        case '\u00CF': // I-diaeresis
                            output[outputPos++] = 'I';
                            break;

                        case '\u0132': // IJ ligature
                            output[outputPos++] = 'I';
                            output[outputPos++] = 'J';
                            break;

                        case '\u00D0': // Eth
                            output[outputPos++] = 'D';
                            break;

                        case '\u00D1': // N-tilde
                            output[outputPos++] = 'N';
                            break;

                        case '\u00D2':
                        // O-grave
                        case '\u00D3':
                        // O-acute
                        case '\u00D4':
                        // O-circumflex
                        case '\u00D5':
                        // O-tilde
                        case '\u00D6':
                        // O-diaeresis
                        case '\u00D8': // O-slash
                            output[outputPos++] = 'O';
                            break;

                        case '\u0152': // OE ligature
                            output[outputPos++] = 'O';
                            output[outputPos++] = 'E';
                            break;

                        case '\u00DE': // Thorn
                            output[outputPos++] = 'T';
                            output[outputPos++] = 'H';
                            break;

                        case '\u00D9':
                        // U-grave
                        case '\u00DA':
                        // U-acute
                        case '\u00DB':
                        // U-circumflex
                        case '\u00DC': // U-diaeresis
                            output[outputPos++] = 'U';
                            break;

                        case '\u00DD':
                        // Y-acute
                        case '\u0178': // Y-diaeresis
                            output[outputPos++] = 'Y';
                            break;

                        case '\u00E0':
                        // a-grave
                        case '\u00E1':
                        // a-acute
                        case '\u00E2':
                        // a-circumflex
                        case '\u00E3':
                        // a-tilde
                        case '\u00E4':
                        // a-diaeresis
                        case '\u00E5': // a-ring
                            output[outputPos++] = 'a';
                            break;

                        case '\u00E6': // ae ligature
                            output[outputPos++] = 'a';
                            output[outputPos++] = 'e';
                            break;

                        case '\u00E7': // c-cedilla
                            output[outputPos++] = 'c';
                            break;

                        case '\u00E8':
                        // e-grave
                        case '\u00E9':
                        // e-acute
                        case '\u00EA':
                        // e-circumflex
                        case '\u00EB': // e-diaeresis
                            output[outputPos++] = 'e';
                            break;

                        case '\u00EC':
                        // i-grave
                        case '\u00ED':
                        // i-acute
                        case '\u00EE':
                        // i-circumflex
                        case '\u00EF': // i-diaeresis
                            output[outputPos++] = 'i';
                            break;

                        case '\u0133': // ij ligature
                            output[outputPos++] = 'i';
                            output[outputPos++] = 'j';
                            break;

                        case '\u00F0': // eth
                            output[outputPos++] = 'd';
                            break;

                        case '\u00F1': // n-tilde
                            output[outputPos++] = 'n';
                            break;

                        case '\u00F2':
                        // o-grave
                        case '\u00F3':
                        // o-acute
                        case '\u00F4':
                        // o-circumflex
                        case '\u00F5':
                        // o-tilde
                        case '\u00F6':
                        // o-diaeresis
                        case '\u00F8': // o-slash
                            output[outputPos++] = 'o';
                            break;

                        case '\u0153': // oe ligature
                            output[outputPos++] = 'o';
                            output[outputPos++] = 'e';
                            break;

                        case '\u00DF': // sharp s
                            output[outputPos++] = 's';
                            output[outputPos++] = 's';
                            break;

                        case '\u00FE': // thorn
                            output[outputPos++] = 't';
                            output[outputPos++] = 'h';
                            break;

                        case '\u00F9':
                        // u-grave
                        case '\u00FA':
                        // u-acute
                        case '\u00FB':
                        // u-circumflex
                        case '\u00FC': // u-diaeresis
                            output[outputPos++] = 'u';
                            break;

                        case '\u00FD':
                        // y-acute
                        case '\u00FF': // y-diaeresis
                            output[outputPos++] = 'y';
                            break;

                        case '\uFB00': // ff ligature
                            output[outputPos++] = 'f';
                            output[outputPos++] = 'f';
                            break;

                        case '\uFB01': // fi ligature
                            output[outputPos++] = 'f';
                            output[outputPos++] = 'i';
                            break;

                        case '\uFB02': // fl ligature
                            output[outputPos++] = 'f';
                            output[outputPos++] = 'l';
                            break;
                        // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
                        // case '\uFB03': // ffi ligature
                        //    output[outputPos++] = 'f';
                        //    output[outputPos++] = 'f';
                        //    output[outputPos++] = 'i';
                        //    break;
                        // case '\uFB04': // ffl ligature
                        //    output[outputPos++] = 'f';
                        //    output[outputPos++] = 'f';
                        //    output[outputPos++] = 'l';
                        //    break;

                        case '\uFB05': // long-s t ligature
                            output[outputPos++] = 'f';
                            output[outputPos++] = 't';
                            break;

                        case '\uFB06': // st ligature
                            output[outputPos++] = 's';
                            output[outputPos++] = 't';
                            break;

                        default:
                            output[outputPos++] = c;
                            break;

                    }
                }
            }
        }
    }
}
\ No newline at end of file diff --git a/src/core/Analysis/KeywordAnalyzer.cs b/src/core/Analysis/KeywordAnalyzer.cs new file mode 100644 index 0000000..116babb --- /dev/null +++ b/src/core/Analysis/KeywordAnalyzer.cs @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace Lucene.Net.Analysis +{ + + /// <summary> "Tokenizes" the entire stream as a single token. This is useful + /// for data like zip codes, ids, and some product names. 
+ /// </summary> + public class KeywordAnalyzer:Analyzer + { + public KeywordAnalyzer() + { + SetOverridesTokenStreamMethod<KeywordAnalyzer>(); + } + public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) + { + return new KeywordTokenizer(reader); + } + public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader) + { + if (overridesTokenStreamMethod) + { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return TokenStream(fieldName, reader); + } + var tokenizer = (Tokenizer) PreviousTokenStream; + if (tokenizer == null) + { + tokenizer = new KeywordTokenizer(reader); + PreviousTokenStream = tokenizer; + } + else + tokenizer.Reset(reader); + return tokenizer; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/KeywordTokenizer.cs b/src/core/Analysis/KeywordTokenizer.cs new file mode 100644 index 0000000..f97ff95 --- /dev/null +++ b/src/core/Analysis/KeywordTokenizer.cs @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +using Lucene.Net.Analysis.Tokenattributes; +using AttributeSource = Lucene.Net.Util.AttributeSource; + +namespace Lucene.Net.Analysis +{ + + /// <summary> Emits the entire input as a single token.</summary> + public sealed class KeywordTokenizer:Tokenizer + { + + private const int DEFAULT_BUFFER_SIZE = 256; + + private bool done; + private int finalOffset; + private ITermAttribute termAtt; + private IOffsetAttribute offsetAtt; + + public KeywordTokenizer(System.IO.TextReader input):this(input, DEFAULT_BUFFER_SIZE) + { + } + + public KeywordTokenizer(System.IO.TextReader input, int bufferSize):base(input) + { + Init(bufferSize); + } + + public KeywordTokenizer(AttributeSource source, System.IO.TextReader input, int bufferSize):base(source, input) + { + Init(bufferSize); + } + + public KeywordTokenizer(AttributeFactory factory, System.IO.TextReader input, int bufferSize):base(factory, input) + { + Init(bufferSize); + } + + private void Init(int bufferSize) + { + this.done = false; + termAtt = AddAttribute<ITermAttribute>(); + offsetAtt = AddAttribute<IOffsetAttribute>(); + termAtt.ResizeTermBuffer(bufferSize); + } + + public override bool IncrementToken() + { + if (!done) + { + ClearAttributes(); + done = true; + int upto = 0; + char[] buffer = termAtt.TermBuffer(); + while (true) + { + int length = input.Read(buffer, upto, buffer.Length - upto); + if (length == 0) + break; + upto += length; + if (upto == buffer.Length) + buffer = termAtt.ResizeTermBuffer(1 + buffer.Length); + } + termAtt.SetTermLength(upto); + finalOffset = CorrectOffset(upto); + offsetAtt.SetOffset(CorrectOffset(0), finalOffset); + return true; + } + return false; + } + + public override void End() + { + // set final offset + offsetAtt.SetOffset(finalOffset, finalOffset); + } + + public override void Reset(System.IO.TextReader input) + { + base.Reset(input); + this.done = false; + } + } +}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis
{

    /// <summary>Removes words that are too long or too short from the stream.</summary>
    public sealed class LengthFilter : TokenFilter
    {

        internal int min;
        internal int max;

        private readonly ITermAttribute termAtt;

        /// <summary> Build a filter that removes words that are too long or too
        /// short from the text: only terms whose length is within
        /// [<paramref name="min"/>, <paramref name="max"/>] (inclusive) are kept.
        /// </summary>
        public LengthFilter(TokenStream in_Renamed, int min, int max)
            : base(in_Renamed)
        {
            this.min = min;
            this.max = max;
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>Advances to the next token whose term length is within bounds.</summary>
        public override bool IncrementToken()
        {
            while (input.IncrementToken())
            {
                int len = termAtt.TermLength();
                if (len < min || len > max)
                {
                    // Out of range: silently drop this token and keep scanning.
                    continue;
                }
                return true;
            }
            // Reached end of the wrapped stream.
            return false;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using AttributeSource = Lucene.Net.Util.AttributeSource;

namespace Lucene.Net.Analysis
{

    /// <summary>A LetterTokenizer is a tokenizer that divides text at non-letters; that is,
    /// it defines tokens as maximal strings of adjacent letters (per
    /// <see cref="char.IsLetter(char)"/>).
    /// Note: this does a decent job for most European languages, but a poor one for
    /// some Asian languages, where words are not separated by spaces.
    /// </summary>
    public class LetterTokenizer : CharTokenizer
    {
        /// <summary>Construct a new LetterTokenizer.</summary>
        public LetterTokenizer(System.IO.TextReader @in)
            : base(@in)
        {
        }

        /// <summary>Construct a new LetterTokenizer using a given <see cref="AttributeSource" />.</summary>
        public LetterTokenizer(AttributeSource source, System.IO.TextReader @in)
            : base(source, @in)
        {
        }

        /// <summary>Construct a new LetterTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />.</summary>
        public LetterTokenizer(AttributeFactory factory, System.IO.TextReader @in)
            : base(factory, @in)
        {
        }

        /// <summary>Accepts only characters for which <see cref="char.IsLetter(char)"/> is true.</summary>
        protected internal override bool IsTokenChar(char c)
        {
            return char.IsLetter(c);
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis
{

    /// <summary>Normalizes token text to lower case.</summary>
    public sealed class LowerCaseFilter : TokenFilter
    {
        private readonly ITermAttribute termAtt;

        public LowerCaseFilter(TokenStream @in)
            : base(@in)
        {
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>Lowercases the current term in place, then passes the token through.</summary>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int len = termAtt.TermLength();
            // In-place lowercase. Char.ToLower is culture-sensitive, matching the
            // original behavior of this filter.
            for (int idx = 0; idx < len; ++idx)
            {
                chars[idx] = System.Char.ToLower(chars[idx]);
            }
            return true;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using AttributeSource = Lucene.Net.Util.AttributeSource;

namespace Lucene.Net.Analysis
{

    /// <summary> LowerCaseTokenizer performs the function of LetterTokenizer
    /// and LowerCaseFilter together: it divides text at non-letters and converts
    /// the letters to lower case. Functionally equivalent to LetterTokenizer +
    /// LowerCaseFilter, but doing both in one pass is faster, hence this
    /// (redundant) implementation.
    /// <p/>
    /// Note: this does a decent job for most European languages, but a poor one
    /// for some Asian languages, where words are not separated by spaces.
    /// </summary>
    public sealed class LowerCaseTokenizer : LetterTokenizer
    {
        /// <summary>Construct a new LowerCaseTokenizer.</summary>
        public LowerCaseTokenizer(System.IO.TextReader @in)
            : base(@in)
        {
        }

        /// <summary>Construct a new LowerCaseTokenizer using a given <see cref="AttributeSource" />.</summary>
        public LowerCaseTokenizer(AttributeSource source, System.IO.TextReader @in)
            : base(source, @in)
        {
        }

        /// <summary>Construct a new LowerCaseTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />.</summary>
        public LowerCaseTokenizer(AttributeFactory factory, System.IO.TextReader @in)
            : base(factory, @in)
        {
        }

        /// <summary>Lowercases each accepted character via <see cref="char.ToLower(char)"/>.</summary>
        protected internal override char Normalize(char c)
        {
            return char.ToLower(c);
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System.Collections.Generic;

namespace Lucene.Net.Analysis
{

    /// <summary> Simplistic <see cref="CharFilter" /> that applies the mappings
    /// contained in a <see cref="NormalizeCharMap" /> to the character
    /// stream, and correcting the resulting changes to the
    /// offsets.
    /// </summary>
    public class MappingCharFilter : BaseCharFilter
    {
        // Trie of input-string -> replacement-string mappings.
        private readonly NormalizeCharMap normMap;
        // Push-back buffer for chars read ahead during a failed trie match.
        private LinkedList<char> buffer;
        // Replacement currently being emitted, or null.
        private System.String replacement;
        // Next char of 'replacement' to emit.
        private int charPointer;
        // Count of chars pulled via NextChar() (input position, 1-based after a read).
        private int nextCharCounter;

        /// Default constructor that takes a <see cref="CharStream" />.
        public MappingCharFilter(NormalizeCharMap normMap, CharStream @in)
            : base(@in)
        {
            this.normMap = normMap;
        }

        /// Easy-use constructor that takes a <see cref="System.IO.TextReader" />.
        public MappingCharFilter(NormalizeCharMap normMap, System.IO.TextReader @in)
            : base(CharReader.Get(@in))
        {
            this.normMap = normMap;
        }

        /// <summary>
        /// Reads one mapped character: drains any pending replacement first, then
        /// attempts the longest trie match starting at the next input char and
        /// records the offset delta when a mapping changes the text length.
        /// Returns -1 at end of input.
        /// </summary>
        public override int Read()
        {
            while (true)
            {
                if (replacement != null && charPointer < replacement.Length)
                {
                    return replacement[charPointer++];
                }

                int firstChar = NextChar();
                if (firstChar == - 1)
                    return - 1;
                // NOTE(review): this indexer appears to rely on a null-returning
                // lookup for missing keys (project HashMap semantics) — confirm.
                NormalizeCharMap nm = normMap.submap != null
                    ? normMap.submap[(char) firstChar]
                    : null;
                if (nm == null)
                    return firstChar;
                NormalizeCharMap result = Match(nm);
                if (result == null)
                    return firstChar;
                replacement = result.normStr;
                charPointer = 0;
                if (result.diff != 0)
                {
                    // diff = matched-input-length - replacement-length; register
                    // offset corrections so CorrectOffset maps output positions back
                    // to input positions.
                    int prevCumulativeDiff = LastCumulativeDiff;
                    if (result.diff < 0)
                    {
                        // Replacement is longer than the matched input: one
                        // correction entry per extra output char.
                        for (int i = 0; i < - result.diff; i++)
                            AddOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
                    }
                    else
                    {
                        // Replacement is shorter: a single correction entry.
                        AddOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
                    }
                }
            }
        }

        // Returns the next input char, preferring pushed-back chars; -1 at EOF.
        private int NextChar()
        {
            nextCharCounter++;
            if (buffer != null && buffer.Count != 0)
            {
                char tempObject = buffer.First.Value;
                buffer.RemoveFirst();
                return (tempObject);
            }
            return input.Read();
        }

        // Pushes a char back so the next NextChar() returns it again.
        private void PushChar(int c)
        {
            nextCharCounter--;
            if (buffer == null)
            {
                buffer = new LinkedList<char>();
            }
            buffer.AddFirst((char)c);
        }

        // Appends a char to the end of the push-back buffer (used by the block
        // Read overload to pre-feed chars).
        private void PushLastChar(int c)
        {
            if (buffer == null)
            {
                buffer = new LinkedList<char>();
            }
            buffer.AddLast((char)c);
        }

        // Recursively follows the trie for the longest match; chars consumed past
        // the match point are pushed back. Returns the deepest node with a
        // replacement, or null if no mapping matches.
        private NormalizeCharMap Match(NormalizeCharMap map)
        {
            NormalizeCharMap result = null;
            if (map.submap != null)
            {
                int chr = NextChar();
                if (chr != - 1)
                {
                    NormalizeCharMap subMap = map.submap[(char)chr];
                    if (subMap != null)
                    {
                        result = Match(subMap);
                    }
                    if (result == null)
                    {
                        PushChar(chr);
                    }
                }
            }
            if (result == null && map.normStr != null)
            {
                result = map;
            }
            return result;
        }

        /// <summary>
        /// Block read: pulls up to <c>len</c> raw chars into the push-back buffer,
        /// then emits mapped chars via <see cref="Read()"/>. Returns -1 when no
        /// chars could be produced.
        /// </summary>
        public override int Read(System.Char[] cbuf, int off, int len)
        {
            var tmp = new char[len];
            int l = input.Read(tmp, 0, len);
            if (l != 0)
            {
                for (int i = 0; i < l; i++)
                    PushLastChar(tmp[i]);
            }
            l = 0;
            for (int i = off; i < off + len; i++)
            {
                int c = Read();
                if (c == - 1)
                    break;
                cbuf[i] = (char) c;
                l++;
            }
            return l == 0?- 1:l;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using Lucene.Net.Support;

namespace Lucene.Net.Analysis
{

    /// <summary> Holds a map of String input to String output, to be used
    /// with <see cref="MappingCharFilter" />. Internally a trie: each node maps a
    /// single character to the next node, and a node carries a replacement string
    /// when a complete input string ends there.
    /// </summary>
    public class NormalizeCharMap
    {
        internal System.Collections.Generic.IDictionary<char, NormalizeCharMap> submap;
        internal System.String normStr;
        internal int diff;

        /// <summary>Records a replacement to be applied to the input
        /// stream. Whenever <c>singleMatch</c> occurs in
        /// the input, it will be replaced with <c>replacement</c>.
        /// </summary>
        /// <param name="singleMatch">input String to be replaced</param>
        /// <param name="replacement">output String</param>
        public virtual void Add(System.String singleMatch, System.String replacement)
        {
            // Walk (and build) the trie, one character of the match string at a time.
            var node = this;
            foreach (char c in singleMatch)
            {
                if (node.submap == null)
                {
                    // Lazily allocated: most nodes have very few children.
                    node.submap = new HashMap<char, NormalizeCharMap>(1);
                }
                var child = node.submap[c]; // HashMap yields null for a missing key
                if (child == null)
                {
                    child = new NormalizeCharMap();
                    node.submap[c] = child;
                }
                node = child;
            }
            if (node.normStr != null)
            {
                throw new System.SystemException("MappingCharFilter: there is already a mapping for " + singleMatch);
            }
            node.normStr = replacement;
            // Length delta of this mapping, used by MappingCharFilter for offset correction.
            node.diff = singleMatch.Length - replacement.Length;
        }
    }
}
\ No newline at end of file diff --git a/src/core/Analysis/NumericTokenStream.cs b/src/core/Analysis/NumericTokenStream.cs new file mode 100644 index 0000000..90b6e72 --- /dev/null +++ b/src/core/Analysis/NumericTokenStream.cs @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using Lucene.Net.Analysis.Tokenattributes; +using Lucene.Net.Search; +using AttributeSource = Lucene.Net.Util.AttributeSource; +using NumericUtils = Lucene.Net.Util.NumericUtils; +using NumericField = Lucene.Net.Documents.NumericField; +// javadocs + +namespace Lucene.Net.Analysis +{ + + /// <summary> <b>Expert:</b> This class provides a <see cref="TokenStream" /> + /// for indexing numeric values that can be used by <see cref="NumericRangeQuery{T}" /> + /// or <see cref="NumericRangeFilter{T}" />. + /// + /// <p/>Note that for simple usage, <see cref="NumericField" /> is + /// recommended. <see cref="NumericField" /> disables norms and + /// term freqs, as they are not usually needed during + /// searching. If you need to change these settings, you + /// should use this class. 
+ /// + /// <p/>See <see cref="NumericField" /> for capabilities of fields + /// indexed numerically.<p/> + /// + /// <p/>Here's an example usage, for an <c>int</c> field: + /// + /// <code> + /// Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value)); + /// field.setOmitNorms(true); + /// field.setOmitTermFreqAndPositions(true); + /// document.add(field); + /// </code> + /// + /// <p/>For optimal performance, re-use the TokenStream and Field instance + /// for more than one document: + /// + /// <code> + /// NumericTokenStream stream = new NumericTokenStream(precisionStep); + /// Field field = new Field(name, stream); + /// field.setOmitNorms(true); + /// field.setOmitTermFreqAndPositions(true); + /// Document document = new Document(); + /// document.add(field); + /// + /// for(all documents) { + /// stream.setIntValue(value) + /// writer.addDocument(document); + /// } + /// </code> + /// + /// <p/>This stream is not intended to be used in analyzers; + /// it's more for iterating the different precisions during + /// indexing a specific numeric value.<p/> + /// + /// <p/><b>NOTE</b>: as token streams are only consumed once + /// the document is added to the index, if you index more + /// than one numeric field, use a separate <c>NumericTokenStream</c> + /// instance for each.<p/> + /// + /// <p/>See <see cref="NumericRangeQuery{T}" /> for more details on the + /// <a href="../search/NumericRangeQuery.html#precisionStepDesc"><c>precisionStep</c></a> + /// parameter as well as how numeric fields work under the hood.<p/> + /// + /// <p/><font color="red"><b>NOTE:</b> This API is experimental and + /// might change in incompatible ways in the next release.</font> + /// Since 2.9 + /// </summary> + public sealed class NumericTokenStream : TokenStream + { + private void InitBlock() + { + termAtt = AddAttribute<ITermAttribute>(); + typeAtt = AddAttribute<ITypeAttribute>(); + posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); + } + 
+ /// <summary>The full precision token gets this token type assigned. </summary> + public const System.String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric"; + + /// <summary>The lower precision tokens gets this token type assigned. </summary> + public const System.String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; + + /// <summary> Creates a token stream for numeric values using the default <c>precisionStep</c> + /// <see cref="NumericUtils.PRECISION_STEP_DEFAULT" /> (4). The stream is not yet initialized, + /// before using set a value using the various set<em>???</em>Value() methods. + /// </summary> + public NumericTokenStream():this(NumericUtils.PRECISION_STEP_DEFAULT) + { + } + + /// <summary> Creates a token stream for numeric values with the specified + /// <c>precisionStep</c>. The stream is not yet initialized, + /// before using set a value using the various set<em>???</em>Value() methods. + /// </summary> + public NumericTokenStream(int precisionStep):base() + { + InitBlock(); + this.precisionStep = precisionStep; + if (precisionStep < 1) + throw new System.ArgumentException("precisionStep must be >=1"); + } + + /// <summary> Expert: Creates a token stream for numeric values with the specified + /// <c>precisionStep</c> using the given <see cref="AttributeSource" />. + /// The stream is not yet initialized, + /// before using set a value using the various set<em>???</em>Value() methods. + /// </summary> + public NumericTokenStream(AttributeSource source, int precisionStep):base(source) + { + InitBlock(); + this.precisionStep = precisionStep; + if (precisionStep < 1) + throw new System.ArgumentException("precisionStep must be >=1"); + } + + /// <summary> Expert: Creates a token stream for numeric values with the specified + /// <c>precisionStep</c> using the given + /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. + /// The stream is not yet initialized, + /// before using set a value using the various set<em>???</em>Value() methods. 
+ /// </summary> + public NumericTokenStream(AttributeFactory factory, int precisionStep):base(factory) + { + InitBlock(); + this.precisionStep = precisionStep; + if (precisionStep < 1) + throw new System.ArgumentException("precisionStep must be >=1"); + } + + /// <summary> Initializes the token stream with the supplied <c>long</c> value.</summary> + /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens. + /// </param> + /// <returns> this instance, because of this you can use it the following way: + /// <c>new Field(name, new NumericTokenStream(precisionStep).SetLongValue(value))</c> + /// </returns> + public NumericTokenStream SetLongValue(long value_Renamed) + { + this.value_Renamed = value_Renamed; + valSize = 64; + shift = 0; + return this; + } + + /// <summary> Initializes the token stream with the supplied <c>int</c> value.</summary> + /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens. + /// </param> + /// <returns> this instance, because of this you can use it the following way: + /// <c>new Field(name, new NumericTokenStream(precisionStep).SetIntValue(value))</c> + /// </returns> + public NumericTokenStream SetIntValue(int value_Renamed) + { + this.value_Renamed = (long) value_Renamed; + valSize = 32; + shift = 0; + return this; + } + + /// <summary> Initializes the token stream with the supplied <c>double</c> value.</summary> + /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens. 
+ /// </param> + /// <returns> this instance, because of this you can use it the following way: + /// <c>new Field(name, new NumericTokenStream(precisionStep).SetDoubleValue(value))</c> + /// </returns> + public NumericTokenStream SetDoubleValue(double value_Renamed) + { + this.value_Renamed = NumericUtils.DoubleToSortableLong(value_Renamed); + valSize = 64; + shift = 0; + return this; + } + + /// <summary> Initializes the token stream with the supplied <c>float</c> value.</summary> + /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens. + /// </param> + /// <returns> this instance, because of this you can use it the following way: + /// <c>new Field(name, new NumericTokenStream(precisionStep).SetFloatValue(value))</c> + /// </returns> + public NumericTokenStream SetFloatValue(float value_Renamed) + { + this.value_Renamed = (long) NumericUtils.FloatToSortableInt(value_Renamed); + valSize = 32; + shift = 0; + return this; + } + + // @Override + public override void Reset() + { + if (valSize == 0) + throw new System.SystemException("call set???Value() before usage"); + shift = 0; + } + + protected override void Dispose(bool disposing) + { + // Do nothing. 
+ } + + // @Override + public override bool IncrementToken() + { + if (valSize == 0) + throw new System.SystemException("call set???Value() before usage"); + if (shift >= valSize) + return false; + + ClearAttributes(); + char[] buffer; + switch (valSize) + { + + case 64: + buffer = termAtt.ResizeTermBuffer(NumericUtils.BUF_SIZE_LONG); + termAtt.SetTermLength(NumericUtils.LongToPrefixCoded(value_Renamed, shift, buffer)); + break; + + + case 32: + buffer = termAtt.ResizeTermBuffer(NumericUtils.BUF_SIZE_INT); + termAtt.SetTermLength(NumericUtils.IntToPrefixCoded((int) value_Renamed, shift, buffer)); + break; + + + default: + // should not happen + throw new System.ArgumentException("valSize must be 32 or 64"); + + } + + typeAtt.Type = (shift == 0)?TOKEN_TYPE_FULL_PREC:TOKEN_TYPE_LOWER_PREC; + posIncrAtt.PositionIncrement = (shift == 0)?1:0; + shift += precisionStep; + return true; + } + + // @Override + public override System.String ToString() + { + System.Text.StringBuilder sb = new System.Text.StringBuilder("(numeric,valSize=").Append(valSize); + sb.Append(",precisionStep=").Append(precisionStep).Append(')'); + return sb.ToString(); + } + + // members + private ITermAttribute termAtt; + private ITypeAttribute typeAtt; + private IPositionIncrementAttribute posIncrAtt; + + private int shift = 0, valSize = 0; // valSize==0 means not initialized + private readonly int precisionStep; + + private long value_Renamed = 0L; + } +}
\ No newline at end of file diff --git a/src/core/Analysis/PerFieldAnalyzerWrapper.cs b/src/core/Analysis/PerFieldAnalyzerWrapper.cs new file mode 100644 index 0000000..b1c43aa --- /dev/null +++ b/src/core/Analysis/PerFieldAnalyzerWrapper.cs @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System.Collections.Generic; +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis +{ + + /// <summary> This analyzer is used to facilitate scenarios where different + /// fields require different analysis techniques. Use <see cref="AddAnalyzer" /> + /// to add a non-default analyzer on a field name basis. + /// + /// <p/>Example usage: + /// + /// <code> + /// PerFieldAnalyzerWrapper aWrapper = + /// new PerFieldAnalyzerWrapper(new StandardAnalyzer()); + /// aWrapper.addAnalyzer("firstname", new KeywordAnalyzer()); + /// aWrapper.addAnalyzer("lastname", new KeywordAnalyzer()); + /// </code> + /// + /// <p/>In this example, StandardAnalyzer will be used for all fields except "firstname" + /// and "lastname", for which KeywordAnalyzer will be used. + /// + /// <p/>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing + /// and query parsing. 
+ /// </summary> + public class PerFieldAnalyzerWrapper:Analyzer + { + private readonly Analyzer defaultAnalyzer; + private readonly IDictionary<string, Analyzer> analyzerMap = new HashMap<string, Analyzer>(); + + + /// <summary> Constructs with default analyzer. + /// + /// </summary> + /// <param name="defaultAnalyzer">Any fields not specifically + /// defined to use a different analyzer will use the one provided here. + /// </param> + public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer) + : this(defaultAnalyzer, null) + { + } + + /// <summary> Constructs with default analyzer and a map of analyzers to use for + /// specific fields. + /// + /// </summary> + /// <param name="defaultAnalyzer">Any fields not specifically + /// defined to use a different analyzer will use the one provided here. + /// </param> + /// <param name="fieldAnalyzers">a Map (String field name to the Analyzer) to be + /// used for those fields + /// </param> + public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer, IEnumerable<KeyValuePair<string, Analyzer>> fieldAnalyzers) + { + this.defaultAnalyzer = defaultAnalyzer; + if (fieldAnalyzers != null) + { + foreach(var entry in fieldAnalyzers) + analyzerMap[entry.Key] = entry.Value; + } + SetOverridesTokenStreamMethod<PerFieldAnalyzerWrapper>(); + } + + + /// <summary> Defines an analyzer to use for the specified field. + /// + /// </summary> + /// <param name="fieldName">field name requiring a non-default analyzer + /// </param> + /// <param name="analyzer">non-default analyzer to use for field + /// </param> + public virtual void AddAnalyzer(System.String fieldName, Analyzer analyzer) + { + analyzerMap[fieldName] = analyzer; + } + + public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) + { + var analyzer = analyzerMap[fieldName] ?? 
defaultAnalyzer; + + return analyzer.TokenStream(fieldName, reader); + } + + public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader) + { + if (overridesTokenStreamMethod) + { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return TokenStream(fieldName, reader); + } + var analyzer = analyzerMap[fieldName] ?? defaultAnalyzer; + + return analyzer.ReusableTokenStream(fieldName, reader); + } + + /// <summary>Return the positionIncrementGap from the analyzer assigned to fieldName </summary> + public override int GetPositionIncrementGap(string fieldName) + { + var analyzer = analyzerMap[fieldName] ?? defaultAnalyzer; + return analyzer.GetPositionIncrementGap(fieldName); + } + + /// <summary> Return the offsetGap from the analyzer assigned to field </summary> + public override int GetOffsetGap(Documents.IFieldable field) + { + Analyzer analyzer = analyzerMap[field.Name] ?? defaultAnalyzer; + return analyzer.GetOffsetGap(field); + } + + public override System.String ToString() + { + // {{Aroush-2.9}} will 'analyzerMap.ToString()' work in the same way as Java's java.util.HashMap.toString()? + return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")"; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/PorterStemFilter.cs b/src/core/Analysis/PorterStemFilter.cs new file mode 100644 index 0000000..b7f1dbf --- /dev/null +++ b/src/core/Analysis/PorterStemFilter.cs @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using Lucene.Net.Analysis.Tokenattributes; + +namespace Lucene.Net.Analysis +{ + + /// <summary>Transforms the token stream as per the Porter stemming algorithm. + /// Note: the input to the stemming filter must already be in lower case, + /// so you will need to use LowerCaseFilter or LowerCaseTokenizer farther + /// down the Tokenizer chain in order for this to work properly! + /// <p/> + /// To use this filter with other analyzers, you'll want to write an + /// Analyzer class that sets up the TokenStream chain as you want it. 
+ /// To use this with LowerCaseTokenizer, for example, you'd write an + /// analyzer like this: + /// <p/> + /// <code> + /// class MyAnalyzer extends Analyzer { + /// public final TokenStream tokenStream(String fieldName, Reader reader) { + /// return new PorterStemFilter(new LowerCaseTokenizer(reader)); + /// } + /// } + /// </code> + /// </summary> + public sealed class PorterStemFilter:TokenFilter + { + private readonly PorterStemmer stemmer; + private readonly ITermAttribute termAtt; + + public PorterStemFilter(TokenStream in_Renamed):base(in_Renamed) + { + stemmer = new PorterStemmer(); + termAtt = AddAttribute<ITermAttribute>(); + } + + public override bool IncrementToken() + { + if (!input.IncrementToken()) + return false; + + if (stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength())) + termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength); + return true; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/PorterStemmer.cs b/src/core/Analysis/PorterStemmer.cs new file mode 100644 index 0000000..f47c5a7 --- /dev/null +++ b/src/core/Analysis/PorterStemmer.cs @@ -0,0 +1,746 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.  See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.  You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + +Porter stemmer in Java. The original paper is in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +See also http://www.tartarus.org/~martin/PorterStemmer/index.html + +Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. +The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] +is then outside the bounds of b. + +Similarly, + +Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. +'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and +b[j] is then outside the bounds of b. + +Release 3. + +[ This version is derived from Release 3, modified by Brian Goetz to +optimize for fewer object creations. ] +*/ +using System; +namespace Lucene.Net.Analysis +{ + + /// <summary> + /// Stemmer, implementing the Porter Stemming Algorithm + /// + /// The Stemmer class transforms a word into its root form. 
The input + /// word can be provided a character at time (by calling add()), or at once + /// by calling one of the various stem(something) methods. + /// </summary> + + class PorterStemmer + { + private char[] b; + private int i, j, k, k0; + private bool dirty = false; + private const int INC = 50; /* unit of size whereby b is increased */ + private const int EXTRA = 1; + + public PorterStemmer() + { + b = new char[INC]; + i = 0; + } + + /// <summary> reset() resets the stemmer so it can stem another word. If you invoke + /// the stemmer by calling add(char) and then stem(), you must call reset() + /// before starting another word. + /// </summary> + public virtual void Reset() + { + i = 0; dirty = false; + } + + /// <summary> Add a character to the word being stemmed. When you are finished + /// adding characters, you can call stem(void) to process the word. + /// </summary> + public virtual void Add(char ch) + { + if (b.Length <= i + EXTRA) + { + var new_b = new char[b.Length + INC]; + Array.Copy(b, 0, new_b, 0, b.Length); + b = new_b; + } + b[i++] = ch; + } + + /// <summary> After a word has been stemmed, it can be retrieved by toString(), + /// or a reference to the internal buffer can be retrieved by getResultBuffer + /// and getResultLength (which is generally more efficient.) + /// </summary> + public override System.String ToString() + { + return new System.String(b, 0, i); + } + + /// <summary> Returns the length of the word resulting from the stemming process.</summary> + public virtual int ResultLength + { + get { return i; } + } + + /// <summary> Returns a reference to a character buffer containing the results of + /// the stemming process. You also need to consult getResultLength() + /// to determine the length of the result. + /// </summary> + public virtual char[] ResultBuffer + { + get { return b; } + } + + /* cons(i) is true <=> b[i] is a consonant. 
*/ + + private bool Cons(int i) + { + switch (b[i]) + { + + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + return false; + + case 'y': + return (i == k0)?true:!Cons(i - 1); + + default: + return true; + + } + } + + /* m() measures the number of consonant sequences between k0 and j. if c is + a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + presence, + + <c><v> gives 0 + <c>vc<v> gives 1 + <c>vcvc<v> gives 2 + <c>vcvcvc<v> gives 3 + .... + */ + + private int M() + { + int n = 0; + int i = k0; + while (true) + { + if (i > j) + return n; + if (!Cons(i)) + break; + i++; + } + i++; + while (true) + { + while (true) + { + if (i > j) + return n; + if (Cons(i)) + break; + i++; + } + i++; + n++; + while (true) + { + if (i > j) + return n; + if (!Cons(i)) + break; + i++; + } + i++; + } + } + + /* vowelinstem() is true <=> k0,...j contains a vowel */ + + private bool Vowelinstem() + { + int i; + for (i = k0; i <= j; i++) + if (!Cons(i)) + return true; + return false; + } + + /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ + + private bool Doublec(int j) + { + if (j < k0 + 1) + return false; + if (b[j] != b[j - 1]) + return false; + return Cons(j); + } + + /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short word. e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. 
+ + */ + + private bool Cvc(int i) + { + if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2)) + return false; + else + { + int ch = b[i]; + if (ch == 'w' || ch == 'x' || ch == 'y') + return false; + } + return true; + } + + private bool Ends(System.String s) + { + int l = s.Length; + int o = k - l + 1; + if (o < k0) + return false; + for (int i = 0; i < l; i++) + if (b[o + i] != s[i]) + return false; + j = k - l; + return true; + } + + /* setto(s) sets (j+1),...k to the characters in the string s, readjusting + k. */ + + internal virtual void Setto(System.String s) + { + int l = s.Length; + int o = j + 1; + for (int i = 0; i < l; i++) + b[o + i] = s[i]; + k = j + l; + dirty = true; + } + + /* r(s) is used further down. */ + + internal virtual void R(System.String s) + { + if (M() > 0) + Setto(s); + } + + /* step1() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + + */ + + private void Step1() + { + if (b[k] == 's') + { + if (Ends("sses")) + k -= 2; + else if (Ends("ies")) + Setto("i"); + else if (b[k - 1] != 's') + k--; + } + if (Ends("eed")) + { + if (M() > 0) + k--; + } + else if ((Ends("ed") || Ends("ing")) && Vowelinstem()) + { + k = j; + if (Ends("at")) + Setto("ate"); + else if (Ends("bl")) + Setto("ble"); + else if (Ends("iz")) + Setto("ize"); + else if (Doublec(k)) + { + int ch = b[k--]; + if (ch == 'l' || ch == 's' || ch == 'z') + k++; + } + else if (M() == 1 && Cvc(k)) + Setto("e"); + } + } + + /* step2() turns terminal y to i when there is another vowel in the stem. */ + + private void Step2() + { + if (Ends("y") && Vowelinstem()) + { + b[k] = 'i'; + dirty = true; + } + } + + /* step3() maps double suffices to single ones. so -ization ( = -ize plus + -ation) maps to -ize etc. 
note that the string before the suffix must give + m() > 0. */ + + private void Step3() + { + if (k == k0) + return ; /* For Bug 1 */ + switch (b[k - 1]) + { + + case 'a': + if (Ends("ational")) + { + R("ate"); break; + } + if (Ends("tional")) + { + R("tion"); break; + } + break; + + case 'c': + if (Ends("enci")) + { + R("ence"); break; + } + if (Ends("anci")) + { + R("ance"); break; + } + break; + + case 'e': + if (Ends("izer")) + { + R("ize"); break; + } + break; + + case 'l': + if (Ends("bli")) + { + R("ble"); break; + } + if (Ends("alli")) + { + R("al"); break; + } + if (Ends("entli")) + { + R("ent"); break; + } + if (Ends("eli")) + { + R("e"); break; + } + if (Ends("ousli")) + { + R("ous"); break; + } + break; + + case 'o': + if (Ends("ization")) + { + R("ize"); break; + } + if (Ends("ation")) + { + R("ate"); break; + } + if (Ends("ator")) + { + R("ate"); break; + } + break; + + case 's': + if (Ends("alism")) + { + R("al"); break; + } + if (Ends("iveness")) + { + R("ive"); break; + } + if (Ends("fulness")) + { + R("ful"); break; + } + if (Ends("ousness")) + { + R("ous"); break; + } + break; + + case 't': + if (Ends("aliti")) + { + R("al"); break; + } + if (Ends("iviti")) + { + R("ive"); break; + } + if (Ends("biliti")) + { + R("ble"); break; + } + break; + + case 'g': + if (Ends("logi")) + { + R("log"); break; + } + break; + } + } + + /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ + + private void Step4() + { + switch (b[k]) + { + + case 'e': + if (Ends("icate")) + { + R("ic"); break; + } + if (Ends("ative")) + { + R(""); break; + } + if (Ends("alize")) + { + R("al"); break; + } + break; + + case 'i': + if (Ends("iciti")) + { + R("ic"); break; + } + break; + + case 'l': + if (Ends("ical")) + { + R("ic"); break; + } + if (Ends("ful")) + { + R(""); break; + } + break; + + case 's': + if (Ends("ness")) + { + R(""); break; + } + break; + } + } + + /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. 
*/ + + private void Step5() + { + if (k == k0) + return ; /* for Bug 1 */ + switch (b[k - 1]) + { + + case 'a': + if (Ends("al")) + break; + return ; + + case 'c': + if (Ends("ance")) + break; + if (Ends("ence")) + break; + return ; + + case 'e': + if (Ends("er")) + break; return ; + + case 'i': + if (Ends("ic")) + break; return ; + + case 'l': + if (Ends("able")) + break; + if (Ends("ible")) + break; return ; + + case 'n': + if (Ends("ant")) + break; + if (Ends("ement")) + break; + if (Ends("ment")) + break; + /* element etc. not stripped before the m */ + if (Ends("ent")) + break; + return ; + + case 'o': + if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) + break; + /* j >= 0 fixes Bug 2 */ + if (Ends("ou")) + break; + return ; + /* takes care of -ous */ + + case 's': + if (Ends("ism")) + break; + return ; + + case 't': + if (Ends("ate")) + break; + if (Ends("iti")) + break; + return ; + + case 'u': + if (Ends("ous")) + break; + return ; + + case 'v': + if (Ends("ive")) + break; + return ; + + case 'z': + if (Ends("ize")) + break; + return ; + + default: + return ; + + } + if (M() > 1) + k = j; + } + + /* step6() removes a final -e if m() > 1. */ + + private void Step6() + { + j = k; + if (b[k] == 'e') + { + int a = M(); + if (a > 1 || a == 1 && !Cvc(k - 1)) + k--; + } + if (b[k] == 'l' && Doublec(k) && M() > 1) + k--; + } + + + /// <summary> Stem a word provided as a String. Returns the result as a String.</summary> + public virtual System.String Stem(System.String s) + { + if (Stem(s.ToCharArray(), s.Length)) + { + return ToString(); + } + else + return s; + } + + /// <summary>Stem a word contained in a char[]. Returns true if the stemming process + /// resulted in a word different from the input. You can retrieve the + /// result with getResultLength()/getResultBuffer() or toString(). 
+ /// </summary> + public virtual bool Stem(char[] word) + { + return Stem(word, word.Length); + } + + /// <summary>Stem a word contained in a portion of a char[] array. Returns + /// true if the stemming process resulted in a word different from + /// the input. You can retrieve the result with + /// getResultLength()/getResultBuffer() or toString(). + /// </summary> + public virtual bool Stem(char[] wordBuffer, int offset, int wordLen) + { + Reset(); + if (b.Length < wordLen) + { + var new_b = new char[wordLen + EXTRA]; + b = new_b; + } + Array.Copy(wordBuffer, offset, b, 0, wordLen); + i = wordLen; + return Stem(0); + } + + /// <summary>Stem a word contained in a leading portion of a char[] array. + /// Returns true if the stemming process resulted in a word different + /// from the input. You can retrieve the result with + /// getResultLength()/getResultBuffer() or toString(). + /// </summary> + public virtual bool Stem(char[] word, int wordLen) + { + return Stem(word, 0, wordLen); + } + + /// <summary>Stem the word placed into the Stemmer buffer through calls to add(). + /// Returns true if the stemming process resulted in a word different + /// from the input. You can retrieve the result with + /// getResultLength()/getResultBuffer() or toString(). + /// </summary> + public virtual bool Stem() + { + return Stem(0); + } + + public virtual bool Stem(int i0) + { + k = i - 1; + k0 = i0; + if (k > k0 + 1) + { + Step1(); Step2(); Step3(); Step4(); Step5(); Step6(); + } + // Also, a word is considered dirty if we lopped off letters + // Thanks to Ifigenia Vairelles for pointing this out. + if (i != k + 1) + dirty = true; + i = k + 1; + return dirty; + } + + /// <summary>Test program for demonstrating the Stemmer. It reads a file and + /// stems each word, writing the result to standard out. 
+ /// Usage: Stemmer file-name + /// </summary> + [STAThread] + public static void Main(System.String[] args) + { + var s = new PorterStemmer(); + + for (int i = 0; i < args.Length; i++) + { + try + { + System.IO.Stream in_Renamed = new System.IO.FileStream(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read); + var buffer = new byte[1024]; + + int bufferLen = in_Renamed.Read(buffer, 0, buffer.Length); + int offset = 0; + s.Reset(); + + while (true) + { + int ch; + if (offset < bufferLen) + ch = buffer[offset++]; + else + { + bufferLen = in_Renamed.Read(buffer, 0, buffer.Length); + offset = 0; + if (bufferLen < 0) + ch = - 1; + else + ch = buffer[offset++]; + } + + if (Char.IsLetter((char) ch)) + { + s.Add(Char.ToLower((char) ch)); + } + else + { + s.Stem(); + Console.Out.Write(s.ToString()); + s.Reset(); + if (ch < 0) + break; + else + { + System.Console.Out.Write((char) ch); + } + } + } + + in_Renamed.Close(); + } + catch (System.IO.IOException) + { + Console.Out.WriteLine("error reading " + args[i]); + } + } + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/SimpleAnalyzer.cs b/src/core/Analysis/SimpleAnalyzer.cs new file mode 100644 index 0000000..b84f470 --- /dev/null +++ b/src/core/Analysis/SimpleAnalyzer.cs @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace Lucene.Net.Analysis +{ + + /// <summary>An <see cref="Analyzer" /> that filters <see cref="LetterTokenizer" /> + /// with <see cref="LowerCaseFilter" /> + /// </summary> + + public sealed class SimpleAnalyzer : Analyzer + { + public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) + { + return new LowerCaseTokenizer(reader); + } + + public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader) + { + var tokenizer = (Tokenizer) PreviousTokenStream; + if (tokenizer == null) + { + tokenizer = new LowerCaseTokenizer(reader); + PreviousTokenStream = tokenizer; + } + else + tokenizer.Reset(reader); + return tokenizer; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/Standard/StandardAnalyzer.cs b/src/core/Analysis/Standard/StandardAnalyzer.cs new file mode 100644 index 0000000..347d026 --- /dev/null +++ b/src/core/Analysis/Standard/StandardAnalyzer.cs @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections; +using System.Collections.Generic; +using Lucene.Net.Analysis; +using Lucene.Net.Util; +using Version = Lucene.Net.Util.Version; + +namespace Lucene.Net.Analysis.Standard +{ + + /// <summary> Filters <see cref="StandardTokenizer" /> with <see cref="StandardFilter" />, + /// <see cref="LowerCaseFilter" /> and <see cref="StopFilter" />, using a list of English stop + /// words. 
+ /// + /// <a name="version"/> + /// <p/> + /// You must specify the required <see cref="Version" /> compatibility when creating + /// StandardAnalyzer: + /// <list type="bullet"> + /// <item>As of 2.9, StopFilter preserves position increments</item> + /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see + /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item> + /// </list> + /// </summary> + public class StandardAnalyzer : Analyzer + { + private ISet<string> stopSet; + + /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type. + /// See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">https://issues.apache.org/jira/browse/LUCENE-1068</a> + /// </summary> + private bool replaceInvalidAcronym, enableStopPositionIncrements; + + /// <summary>An unmodifiable set containing some common English words that are usually not + /// useful for searching. + /// </summary> + public static readonly ISet<string> STOP_WORDS_SET; + private Version matchVersion; + + /// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET" />). 
+ /// </summary> + /// <param name="matchVersion">Lucene version to match see <see cref="Version">above</see></param> + public StandardAnalyzer(Version matchVersion) + : this(matchVersion, STOP_WORDS_SET) + { } + + /// <summary>Builds an analyzer with the given stop words.</summary> + /// <param name="matchVersion">Lucene version to match See <see cref="Version">above</see> /> + /// + /// </param> + /// <param name="stopWords">stop words + /// </param> + public StandardAnalyzer(Version matchVersion, ISet<string> stopWords) + { + stopSet = stopWords; + SetOverridesTokenStreamMethod<StandardAnalyzer>(); + enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion); + replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24); + this.matchVersion = matchVersion; + } + + /// <summary>Builds an analyzer with the stop words from the given file.</summary> + /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"> + /// </seealso> + /// <param name="matchVersion">Lucene version to match See <see cref="Version">above</see> /> + /// + /// </param> + /// <param name="stopwords">File to read stop words from + /// </param> + public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords) + : this (matchVersion, WordlistLoader.GetWordSet(stopwords)) + { + } + + /// <summary>Builds an analyzer with the stop words from the given reader.</summary> + /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"> + /// </seealso> + /// <param name="matchVersion">Lucene version to match See <see cref="Version">above</see> /> + /// + /// </param> + /// <param name="stopwords">Reader to read stop words from + /// </param> + public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords) + : this(matchVersion, WordlistLoader.GetWordSet(stopwords)) + { } + + /// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" /> + ///, a <see cref="LowerCaseFilter" /> and 
a <see cref="StopFilter" />. + /// </summary> + public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) + { + StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader); + tokenStream.MaxTokenLength = maxTokenLength; + TokenStream result = new StandardFilter(tokenStream); + result = new LowerCaseFilter(result); + result = new StopFilter(enableStopPositionIncrements, result, stopSet); + return result; + } + + private sealed class SavedStreams + { + internal StandardTokenizer tokenStream; + internal TokenStream filteredTokenStream; + } + + /// <summary>Default maximum allowed token length </summary> + public const int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + + /// <summary> Set maximum allowed token length. If a token is seen + /// that exceeds this length then it is discarded. This + /// setting only takes effect the next time tokenStream or + /// reusableTokenStream is called. + /// </summary> + public virtual int MaxTokenLength + { + get { return maxTokenLength; } + set { maxTokenLength = value; } + } + + public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader) + { + if (overridesTokenStreamMethod) + { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return TokenStream(fieldName, reader); + } + SavedStreams streams = (SavedStreams) PreviousTokenStream; + if (streams == null) + { + streams = new SavedStreams(); + PreviousTokenStream = streams; + streams.tokenStream = new StandardTokenizer(matchVersion, reader); + streams.filteredTokenStream = new StandardFilter(streams.tokenStream); + streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); + streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements, + streams.filteredTokenStream, stopSet); + } + else + { + 
streams.tokenStream.Reset(reader); + } + streams.tokenStream.MaxTokenLength = maxTokenLength; + + streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym); + + return streams.filteredTokenStream; + } + static StandardAnalyzer() + { + STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + } + } +}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Token = Lucene.Net.Analysis.Token;
using TokenFilter = Lucene.Net.Analysis.TokenFilter;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary>Normalizes tokens extracted with <see cref="StandardTokenizer" />. </summary>
    public sealed class StandardFilter : TokenFilter
    {
        // Both type strings come from StandardTokenizerImpl.TOKEN_TYPES, and the
        // type attribute is populated from that same array, so comparing by
        // reference (below) is sufficient and matches the original Java port.
        private static readonly System.String APOSTROPHE_TYPE;
        private static readonly System.String ACRONYM_TYPE;

        // This filter inspects the term text and the token type of each token.
        private ITypeAttribute typeAtt;
        private ITermAttribute termAtt;

        static StandardFilter()
        {
            APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
            ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
        }

        /// <summary>Construct filtering <i>in</i>. </summary>
        public StandardFilter(TokenStream in_Renamed) : base(in_Renamed)
        {
            termAtt = AddAttribute<ITermAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p/>Removes <tt>'s</tt> from the end of words.
        /// <p/>Removes dots from acronyms.
        /// </summary>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
                return false;

            char[] buffer = termAtt.TermBuffer();
            int length = termAtt.TermLength();
            System.String tokenType = typeAtt.Type;

            bool endsWithPossessive = length >= 2
                && buffer[length - 2] == '\''
                && (buffer[length - 1] == 's' || buffer[length - 1] == 'S');

            if (ReferenceEquals(tokenType, APOSTROPHE_TYPE) && endsWithPossessive)
            {
                // Drop the trailing 's / 'S.
                termAtt.SetTermLength(length - 2);
            }
            else if (ReferenceEquals(tokenType, ACRONYM_TYPE))
            {
                // Compact the buffer in place, skipping every dot.
                int dst = 0;
                for (int src = 0; src < length; src++)
                {
                    if (buffer[src] != '.')
                        buffer[dst++] = buffer[src];
                }
                termAtt.SetTermLength(dst);
            }

            return true;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using CharReader = Lucene.Net.Analysis.CharReader;
using Token = Lucene.Net.Analysis.Token;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary>A grammar-based tokenizer constructed with JFlex
    ///
    /// <p/> This should be a good tokenizer for most European-language documents:
    ///
    /// <list type="bullet">
    /// <item>Splits words at punctuation characters, removing punctuation. However, a
    /// dot that's not followed by whitespace is considered part of a token.</item>
    /// <item>Splits words at hyphens, unless there's a number in the token, in which case
    /// the whole token is interpreted as a product number and is not split.</item>
    /// <item>Recognizes email addresses and internet hostnames as one token.</item>
    /// </list>
    ///
    /// <p/>Many applications have specific tokenizer needs. If this tokenizer does
    /// not suit your application, please consider copying this source code
    /// directory to your project and maintaining your own grammar-based tokenizer.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StandardAnalyzer:
    /// <list type="bullet">
    /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
    /// </list>
    /// </summary>
    public sealed class StandardTokenizer : Tokenizer
    {
        /// <summary>A private instance of the JFlex-constructed scanner </summary>
        private readonly StandardTokenizerImpl scanner;

        public const int ALPHANUM = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM = 2;
        public const int COMPANY = 3;
        public const int EMAIL = 4;
        public const int HOST = 5;
        public const int NUM = 6;
        public const int CJ = 7;

        /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
        /// as ACRONYMs.
        /// </deprecated>
        [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
        public const int ACRONYM_DEP = 8;

        /// <summary>String token types that correspond to token type int constants </summary>
        public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};

        private bool replaceInvalidAcronym;

        // Field initializer replaces the former InitBlock() indirection.
        private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

        // This tokenizer generates four attributes: term, offset,
        // positionIncrement and type.
        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private IPositionIncrementAttribute posIncrAtt;
        private ITypeAttribute typeAtt;

        /// <summary>Set the max allowed token length.  Any token longer
        /// than this is skipped.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { this.maxTokenLength = value; }
        }

        /// <summary> Creates a new instance of the
        /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" />. Attaches
        /// the <c>input</c> to the newly created JFlex scanner.
        /// </summary>
        /// <param name="matchVersion">Lucene version to match; see <a href="#version">above</a></param>
        /// <param name="input">The input reader
        ///
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        public StandardTokenizer(Version matchVersion, System.IO.TextReader input) : base()
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary> Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.</summary>
        public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input) : base(source)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary> Creates a new StandardTokenizer with a given
        /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />
        /// </summary>
        public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input) : base(factory)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary>Shared constructor tail: wires the input and registers attributes.</summary>
        private void Init(System.IO.TextReader input, Version matchVersion)
        {
            // As of 2.4, tokens mischaracterized as acronyms are corrected (LUCENE-1068).
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
            this.input = input;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        ///<summary>
        /// (non-Javadoc)
        /// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
        ///</summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    posIncrAtt.PositionIncrement = posIncr;
                    scanner.GetText(termAtt);
                    int start = scanner.Yychar();
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                            termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                        }
                        else
                        {
                            typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                        }
                    }
                    else
                    {
                        typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
                    }
                    return true;
                }
                // When we skip a too-long term, we still increment the
                // position increment
                else
                    posIncr++;
            }
        }

        public override void End()
        {
            // set final offset
            int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        public override void Reset(System.IO.TextReader reader)
        {
            base.Reset(reader);
            scanner.Reset(reader);
        }

        /// <summary>
        /// Remove in 3.X and make true the only valid value
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
        /// </param>
        [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
        public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }
    }
}
\ No newline at end of file diff --git a/src/core/Analysis/Standard/StandardTokenizerImpl.cs b/src/core/Analysis/Standard/StandardTokenizerImpl.cs new file mode 100644 index 0000000..cb4bf5f --- /dev/null +++ b/src/core/Analysis/Standard/StandardTokenizerImpl.cs @@ -0,0 +1,707 @@ +/* The following code was generated by JFlex 1.4.1 on 9/4/08 6:49 PM */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate the tokenizer, + the tokenizer, only use Java 1.4 !!! + This grammar currently uses constructs (eg :digit:, :letter:) whose + meaning can vary according to the JRE used to run jflex. See + https://issues.apache.org/jira/browse/LUCENE-1126 for details. + For current backwards compatibility it is needed to support + only Java 1.4 - this will change in Lucene 3.1. 
+*/ + +using System; +using Lucene.Net.Analysis.Tokenattributes; +using Token = Lucene.Net.Analysis.Token; + +namespace Lucene.Net.Analysis.Standard +{ + + + /// <summary> This class is a scanner generated by + /// <a href="http://www.jflex.de/">JFlex</a> 1.4.1 + /// on 9/4/08 6:49 PM from the specification file + /// <tt>/tango/mike/src/lucene.standarddigit/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt> + /// </summary> + class StandardTokenizerImpl + { + + /// <summary>This character denotes the end of file </summary> + public const int YYEOF = - 1; + + /// <summary>initial size of the lookahead buffer </summary> + private const int ZZ_BUFFERSIZE = 16384; + + /// <summary>lexical states </summary> + public const int YYINITIAL = 0; + + /// <summary> Translates characters to character classes</summary> + private const System.String ZZ_CMAP_PACKED = "\x0009\x0000\x0001\x0000\x0001\x000D\x0001\x0000\x0001\x0000\x0001\x000C\x0012\x0000\x0001\x0000\x0005\x0000\x0001\x0005" + "\x0001\x0003\x0004\x0000\x0001\x0009\x0001\x0007\x0001\x0004\x0001\x0009\x000A\x0002\x0006\x0000\x0001\x0006\x001A\x000A" + "\x0004\x0000\x0001\x0008\x0001\x0000\x001A\x000A\x002F\x0000\x0001\x000A\x000A\x0000\x0001\x000A\x0004\x0000\x0001\x000A" + "\x0005\x0000\x0017\x000A\x0001\x0000\x001F\x000A\x0001\x0000\u0128\x000A\x0002\x0000\x0012\x000A\x001C\x0000\x005E\x000A" + "\x0002\x0000\x0009\x000A\x0002\x0000\x0007\x000A\x000E\x0000\x0002\x000A\x000E\x0000\x0005\x000A\x0009\x0000\x0001\x000A" + "\x008B\x0000\x0001\x000A\x000B\x0000\x0001\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0014\x000A" + "\x0001\x0000\x002C\x000A\x0001\x0000\x0008\x000A\x0002\x0000\x001A\x000A\x000C\x0000\x0082\x000A\x000A\x0000\x0039\x000A" + "\x0002\x0000\x0002\x000A\x0002\x0000\x0002\x000A\x0003\x0000\x0026\x000A\x0002\x0000\x0002\x000A\x0037\x0000\x0026\x000A" + 
"\x0002\x0000\x0001\x000A\x0007\x0000\x0027\x000A\x0048\x0000\x001B\x000A\x0005\x0000\x0003\x000A\x002E\x0000\x001A\x000A" + "\x0005\x0000\x000B\x000A\x0015\x0000\x000A\x0002\x0007\x0000\x0063\x000A\x0001\x0000\x0001\x000A\x000F\x0000\x0002\x000A" + "\x0009\x0000\x000A\x0002\x0003\x000A\x0013\x0000\x0001\x000A\x0001\x0000\x001B\x000A\x0053\x0000\x0026\x000A\u015f\x0000" + "\x0035\x000A\x0003\x0000\x0001\x000A\x0012\x0000\x0001\x000A\x0007\x0000\x000A\x000A\x0004\x0000\x000A\x0002\x0015\x0000" + "\x0008\x000A\x0002\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0001\x000A\x0003\x0000" + "\x0004\x000A\x0022\x0000\x0002\x000A\x0001\x0000\x0003\x000A\x0004\x0000\x000A\x0002\x0002\x000A\x0013\x0000\x0006\x000A" + "\x0004\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0002\x000A\x0001\x0000\x0002\x000A" + + "\x0001\x0000\x0002\x000A\x001F\x0000\x0004\x000A\x0001\x0000\x0001\x000A\x0007\x0000\x000A\x0002\x0002\x0000\x0003\x000A" + "\x0010\x0000\x0007\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0016\x000A\x0001\x0000\x0007\x000A" + "\x0001\x0000\x0002\x000A\x0001\x0000\x0005\x000A\x0003\x0000\x0001\x000A\x0012\x0000\x0001\x000A\x000F\x0000\x0001\x000A" + "\x0005\x0000\x000A\x0002\x0015\x0000\x0008\x000A\x0002\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A" + "\x0001\x0000\x0002\x000A\x0002\x0000\x0004\x000A\x0003\x0000\x0001\x000A\x001E\x0000\x0002\x000A\x0001\x0000\x0003\x000A" + "\x0004\x0000\x000A\x0002\x0015\x0000\x0006\x000A\x0003\x0000\x0003\x000A\x0001\x0000\x0004\x000A\x0003\x0000\x0002\x000A" + "\x0001\x0000\x0001\x000A\x0001\x0000\x0002\x000A\x0003\x0000\x0002\x000A\x0003\x0000\x0003\x000A\x0003\x0000\x0008\x000A" + "\x0001\x0000\x0003\x000A\x002D\x0000\x0009\x0002\x0015\x0000\x0008\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A" + "\x0001\x0000\x000A\x000A\x0001\x0000\x0005\x000A\x0026\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0008\x000A" 
+ "\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A\x0001\x0000\x000A\x000A\x0001\x0000\x0005\x000A\x0024\x0000\x0001\x000A" + "\x0001\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0008\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A" + "\x0001\x0000\x0010\x000A\x0026\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0012\x000A\x0003\x0000\x0018\x000A" + "\x0001\x0000\x0009\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0007\x000A\x0039\x0000\x0001\x0001\x0030\x000A\x0001\x0001" + "\x0002\x000A\x000C\x0001\x0007\x000A\x0009\x0001\x000A\x0002\x0027\x0000\x0002\x000A\x0001\x0000\x0001\x000A\x0002\x0000" + "\x0002\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0001\x000A\x0006\x0000\x0004\x000A\x0001\x0000\x0007\x000A\x0001\x0000" + "\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0002\x000A\x0001\x0000\x0004\x000A\x0001\x0000" + + "\x0002\x000A\x0009\x0000\x0001\x000A\x0002\x0000\x0005\x000A\x0001\x0000\x0001\x000A\x0009\x0000\x000A\x0002\x0002\x0000" + "\x0002\x000A\x0022\x0000\x0001\x000A\x001F\x0000\x000A\x0002\x0016\x0000\x0008\x000A\x0001\x0000\x0022\x000A\x001D\x0000" + "\x0004\x000A\x0074\x0000\x0022\x000A\x0001\x0000\x0005\x000A\x0001\x0000\x0002\x000A\x0015\x0000\x000A\x0002\x0006\x0000" + "\x0006\x000A\x004A\x0000\x0026\x000A\x000A\x0000\x0027\x000A\x0009\x0000\x005A\x000A\x0005\x0000\x0044\x000A\x0005\x0000" + "\x0052\x000A\x0006\x0000\x0007\x000A\x0001\x0000\x003F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000" + "\x0007\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0027\x000A\x0001\x0000\x0001\x000A\x0001\x0000" + "\x0004\x000A\x0002\x0000\x001F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0017\x000A\x0001\x0000" + 
"\x001F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000\x0027\x000A\x0001\x0000" + "\x0013\x000A\x000E\x0000\x0009\x0002\x002E\x0000\x0055\x000A\x000C\x0000\u026c\x000A\x0002\x0000\x0008\x000A\x000A\x0000" + "\x001A\x000A\x0005\x0000\x004B\x000A\x0095\x0000\x0034\x000A\x002C\x0000\x000A\x0002\x0026\x0000\x000A\x0002\x0006\x0000" + "\x0058\x000A\x0008\x0000\x0029\x000A\u0557\x0000\x009C\x000A\x0004\x0000\x005A\x000A\x0006\x0000\x0016\x000A\x0002\x0000" + "\x0006\x000A\x0002\x0000\x0026\x000A\x0002\x0000\x0006\x000A\x0002\x0000\x0008\x000A\x0001\x0000\x0001\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x001F\x000A\x0002\x0000\x0035\x000A\x0001\x0000\x0007\x000A\x0001\x0000" + "\x0001\x000A\x0003\x0000\x0003\x000A\x0001\x0000\x0007\x000A\x0003\x0000\x0004\x000A\x0002\x0000\x0006\x000A\x0004\x0000" + "\x000D\x000A\x0005\x0000\x0003\x000A\x0001\x0000\x0007\x000A\x0082\x0000\x0001\x000A\x0082\x0000\x0001\x000A\x0004\x0000" + + "\x0001\x000A\x0002\x0000\x000A\x000A\x0001\x0000\x0001\x000A\x0003\x0000\x0005\x000A\x0006\x0000\x0001\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0007\x000A\u0ecb\x0000" + "\x0002\x000A\x002A\x0000\x0005\x000A\x000A\x0000\x0001\x000B\x0054\x000B\x0008\x000B\x0002\x000B\x0002\x000B\x005A\x000B" + "\x0001\x000B\x0003\x000B\x0006\x000B\x0028\x000B\x0003\x000B\x0001\x0000\x005E\x000A\x0011\x0000\x0018\x000A\x0038\x0000" + "\x0010\x000B\u0100\x0000\x0080\x000B\x0080\x0000\u19b6\x000B\x000A\x000B\x0040\x0000\u51a6\x000B\x005A\x000B\u048d\x000A" + "\u0773\x0000\u2ba4\x000A\u215c\x0000\u012e\x000B\x00D2\x000B\x0007\x000A\x000C\x0000\x0005\x000A\x0005\x0000\x0001\x000A" + "\x0001\x0000\x000A\x000A\x0001\x0000\x000D\x000A\x0001\x0000\x0005\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0002\x000A" + "\x0001\x0000\x0002\x000A\x0001\x0000\x006C\x000A\x0021\x0000\u016b\x000A\x0012\x0000\x0040\x000A\x0002\x0000\x0036\x000A" 
+ "\x0028\x0000\x000C\x000A\x0074\x0000\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0087\x000A\x0013\x0000\x000A\x0002" + "\x0007\x0000\x001A\x000A\x0006\x0000\x001A\x000A\x000A\x0000\x0001\x000B\x003A\x000B\x001F\x000A\x0003\x0000\x0006\x000A" + "\x0002\x0000\x0006\x000A\x0002\x0000\x0006\x000A\x0002\x0000\x0003\x000A\x0023\x0000"; + + /// <summary> Translates characters to character classes</summary> + private static readonly char[] ZZ_CMAP = ZzUnpackCMap(ZZ_CMAP_PACKED); + + /// <summary> Translates DFA states to action switch labels.</summary> + private static readonly int[] ZZ_ACTION = ZzUnpackAction(); + + private const System.String ZZ_ACTION_PACKED_0 = "\x0001\x0000\x0001\x0001\x0003\x0002\x0001\x0003\x0001\x0001\x000B\x0000\x0001\x0002\x0003\x0004" + "\x0002\x0000\x0001\x0005\x0001\x0000\x0001\x0005\x0003\x0004\x0006\x0005\x0001\x0006\x0001\x0004" + "\x0002\x0007\x0001\x0008\x0001\x0000\x0001\x0008\x0003\x0000\x0002\x0008\x0001\x0009\x0001\x000A" + "\x0001\x0004"; + + private static int[] ZzUnpackAction() + { + int[] result = new int[51]; + int offset = 0; + offset = ZzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int ZzUnpackAction(System.String packed, int offset, int[] result) + { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.Length; + while (i < l) + { + int count = packed[i++]; + int value_Renamed = packed[i++]; + do + result[j++] = value_Renamed; + while (--count > 0); + } + return j; + } + + + /// <summary> Translates a state to a row index in the transition table</summary> + private static readonly int[] ZZ_ROWMAP = ZzUnpackRowMap(); + + private const System.String ZZ_ROWMAP_PACKED_0 = "\x0000\x0000\x0000\x000E\x0000\x001C\x0000\x002A\x0000\x0038\x0000\x000E\x0000\x0046\x0000\x0054" + "\x0000\x0062\x0000\x0070\x0000\x007E\x0000\x008C\x0000\x009A\x0000\x00A8\x0000\x00B6\x0000\x00C4" + 
"\x0000\x00D2\x0000\x00E0\x0000\x00EE\x0000\x00FC\x0000\u010a\x0000\u0118\x0000\u0126\x0000\u0134" + "\x0000\u0142\x0000\u0150\x0000\u015e\x0000\u016c\x0000\u017a\x0000\u0188\x0000\u0196\x0000\u01a4" + "\x0000\u01b2\x0000\u01c0\x0000\u01ce\x0000\u01dc\x0000\u01ea\x0000\u01f8\x0000\x00D2\x0000\u0206" + "\x0000\u0214\x0000\u0222\x0000\u0230\x0000\u023e\x0000\u024c\x0000\u025a\x0000\x0054\x0000\x008C" + "\x0000\u0268\x0000\u0276\x0000\u0284"; + + private static int[] ZzUnpackRowMap() + { + int[] result = new int[51]; + int offset = 0; + offset = ZzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int ZzUnpackRowMap(System.String packed, int offset, int[] result) + { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.Length; + while (i < l) + { + int high = packed[i++] << 16; + result[j++] = high | packed[i++]; + } + return j; + } + + /// <summary> The transition table of the DFA</summary> + private static readonly int[] ZZ_TRANS = ZzUnpackTrans(); + + private const System.String ZZ_TRANS_PACKED_0 = "\x0001\x0002\x0001\x0003\x0001\x0004\x0007\x0002\x0001\x0005\x0001\x0006\x0001\x0007\x0001\x0002" + "\x000F\x0000\x0002\x0003\x0001\x0000\x0001\x0008\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B" + "\x0001\x0003\x0004\x0000\x0001\x0003\x0001\x0004\x0001\x0000\x0001\x000C\x0001\x0000\x0001\x0009" + "\x0002\x000D\x0001\x000E\x0001\x0004\x0004\x0000\x0001\x0003\x0001\x0004\x0001\x000F\x0001\x0010" + "\x0001\x0011\x0001\x0012\x0002\x000A\x0001\x000B\x0001\x0013\x0010\x0000\x0001\x0002\x0001\x0000" + "\x0001\x0014\x0001\x0015\x0007\x0000\x0001\x0016\x0004\x0000\x0002\x0017\x0007\x0000\x0001\x0017" + "\x0004\x0000\x0001\x0018\x0001\x0019\x0007\x0000\x0001\x001A\x0005\x0000\x0001\x001B\x0007\x0000" + "\x0001\x000B\x0004\x0000\x0001\x001C\x0001\x001D\x0007\x0000\x0001\x001E\x0004\x0000\x0001\x001F" + 
"\x0001\x0020\x0007\x0000\x0001\x0021\x0004\x0000\x0001\x0022\x0001\x0023\x0007\x0000\x0001\x0024" + "\x000D\x0000\x0001\x0025\x0004\x0000\x0001\x0014\x0001\x0015\x0007\x0000\x0001\x0026\x000D\x0000" + "\x0001\x0027\x0004\x0000\x0002\x0017\x0007\x0000\x0001\x0028\x0004\x0000\x0001\x0003\x0001\x0004" + "\x0001\x000F\x0001\x0008\x0001\x0011\x0001\x0012\x0002\x000A\x0001\x000B\x0001\x0013\x0004\x0000" + "\x0002\x0014\x0001\x0000\x0001\x0029\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0014" + "\x0004\x0000\x0001\x0014\x0001\x0015\x0001\x0000\x0001\x002B\x0001\x0000\x0001\x0009\x0002\x002C" + "\x0001\x002D\x0001\x0015\x0004\x0000\x0001\x0014\x0001\x0015\x0001\x0000\x0001\x0029\x0001\x0000" + "\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0016\x0004\x0000\x0002\x0017\x0001\x0000\x0001\x002E" + "\x0002\x0000\x0001\x002E\x0002\x0000\x0001\x0017\x0004\x0000\x0002\x0018\x0001\x0000\x0001\x002A" + "\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0018\x0004\x0000\x0001\x0018\x0001\x0019" + "\x0001\x0000\x0001\x002C\x0001\x0000\x0001\x0009\x0002\x002C\x0001\x002D\x0001\x0019\x0004\x0000" + + "\x0001\x0018\x0001\x0019\x0001\x0000\x0001\x002A\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000" + "\x0001\x001A\x0005\x0000\x0001\x001B\x0001\x0000\x0001\x002D\x0002\x0000\x0003\x002D\x0001\x001B" + "\x0004\x0000\x0002\x001C\x0001\x0000\x0001\x002F\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B" + "\x0001\x001C\x0004\x0000\x0001\x001C\x0001\x001D\x0001\x0000\x0001\x0030\x0001\x0000\x0001\x0009" + "\x0002\x000D\x0001\x000E\x0001\x001D\x0004\x0000\x0001\x001C\x0001\x001D\x0001\x0000\x0001\x002F" + "\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B\x0001\x001E\x0004\x0000\x0002\x001F\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B\x0001\x001F\x0004\x0000\x0001\x001F" + "\x0001\x0020\x0001\x0000\x0001\x000D\x0001\x0000\x0001\x0009\x0002\x000D\x0001\x000E\x0001\x0020" + 
"\x0004\x0000\x0001\x001F\x0001\x0020\x0001\x0000\x0001\x000A\x0001\x0000\x0001\x0009\x0002\x000A" + "\x0001\x000B\x0001\x0021\x0004\x0000\x0002\x0022\x0001\x0000\x0001\x000B\x0002\x0000\x0003\x000B" + "\x0001\x0022\x0004\x0000\x0001\x0022\x0001\x0023\x0001\x0000\x0001\x000E\x0002\x0000\x0003\x000E" + "\x0001\x0023\x0004\x0000\x0001\x0022\x0001\x0023\x0001\x0000\x0001\x000B\x0002\x0000\x0003\x000B" + "\x0001\x0024\x0006\x0000\x0001\x000F\x0006\x0000\x0001\x0025\x0004\x0000\x0001\x0014\x0001\x0015" + "\x0001\x0000\x0001\x0031\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0016\x0004\x0000" + "\x0002\x0017\x0001\x0000\x0001\x002E\x0002\x0000\x0001\x002E\x0002\x0000\x0001\x0028\x0004\x0000" + "\x0002\x0014\x0007\x0000\x0001\x0014\x0004\x0000\x0002\x0018\x0007\x0000\x0001\x0018\x0004\x0000" + "\x0002\x001C\x0007\x0000\x0001\x001C\x0004\x0000\x0002\x001F\x0007\x0000\x0001\x001F\x0004\x0000" + "\x0002\x0022\x0007\x0000\x0001\x0022\x0004\x0000\x0002\x0032\x0007\x0000\x0001\x0032\x0004\x0000" + "\x0002\x0014\x0007\x0000\x0001\x0033\x0004\x0000\x0002\x0032\x0001\x0000\x0001\x002E\x0002\x0000" + "\x0001\x002E\x0002\x0000\x0001\x0032\x0004\x0000\x0002\x0014\x0001\x0000\x0001\x0031\x0001\x0000" + + "\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0014\x0003\x0000"; + + private static int[] ZzUnpackTrans() + { + int[] result = new int[658]; + int offset = 0; + offset = ZzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int ZzUnpackTrans(System.String packed, int offset, int[] result) + { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.Length; + while (i < l) + { + int count = packed[i++]; + int value_Renamed = packed[i++]; + value_Renamed--; + do + result[j++] = value_Renamed; + while (--count > 0); + } + return j; + } + + + /* error codes */ + private const int ZZ_UNKNOWN_ERROR = 0; + private const int ZZ_NO_MATCH = 1; + private const int ZZ_PUSHBACK_2BIG = 2; + + /* error 
messages for the codes above */ + private static readonly System.String[] ZZ_ERROR_MSG = new System.String[]{"Unkown internal scanner error", "Error: could not match input", "Error: pushback value was too large"}; + + /// <summary> ZZ_ATTRIBUTE[aState] contains the attributes of state <c>aState</c></summary> + private static readonly int[] ZZ_ATTRIBUTE = ZzUnpackAttribute(); + + private const System.String ZZ_ATTRIBUTE_PACKED_0 = "\x0001\x0000\x0001\x0009\x0003\x0001\x0001\x0009\x0001\x0001\x000B\x0000\x0004\x0001\x0002\x0000" + "\x0001\x0001\x0001\x0000\x000F\x0001\x0001\x0000\x0001\x0001\x0003\x0000\x0005\x0001"; + + private static int[] ZzUnpackAttribute() + { + int[] result = new int[51]; + int offset = 0; + offset = ZzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int ZzUnpackAttribute(System.String packed, int offset, int[] result) + { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.Length; + while (i < l) + { + int count = packed[i++]; + int value_Renamed = packed[i++]; + do + result[j++] = value_Renamed; + while (--count > 0); + } + return j; + } + + /// <summary>the input device </summary> + private System.IO.TextReader zzReader; + + /// <summary>the current state of the DFA </summary> + private int zzState; + + /// <summary>the current lexical state </summary> + private int zzLexicalState = YYINITIAL; + + /// <summary>this buffer contains the current text to be matched and is + /// the source of the yytext() string + /// </summary> + private char[] zzBuffer = new char[ZZ_BUFFERSIZE]; + + /// <summary>the textposition at the last accepting state </summary> + private int zzMarkedPos; + + /// <summary>the textposition at the last state to be included in yytext </summary> + private int zzPushbackPos; + + /// <summary>the current text position in the buffer </summary> + private int zzCurrentPos; + + /// <summary>startRead marks the beginning of the 
yytext() string in the buffer </summary> + private int zzStartRead; + + /// <summary>endRead marks the last character in the buffer, that has been read + /// from input + /// </summary> + private int zzEndRead; + + /// <summary>number of newlines encountered up to the start of the matched text </summary> + private int yyline; + + /// <summary>the number of characters up to the start of the matched text </summary> + private int yychar; + + /// <summary> the number of characters from the last newline up to the start of the + /// matched text + /// </summary> + private int yycolumn; + + /// <summary> zzAtBOL == true <=> the scanner is currently at the beginning of a line</summary> + private bool zzAtBOL = true; + + /// <summary>zzAtEOF == true <=> the scanner is at the EOF </summary> + private bool zzAtEOF; + + /* user code: */ + + public static readonly int ALPHANUM; + public static readonly int APOSTROPHE; + public static readonly int ACRONYM; + public static readonly int COMPANY; + public static readonly int EMAIL; + public static readonly int HOST; + public static readonly int NUM; + public static readonly int CJ; + /// <deprecated> this solves a bug where HOSTs that end with '.' are identified + /// as ACRONYMs. + /// </deprecated> + [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs")] + public static readonly int ACRONYM_DEP; + + public static readonly System.String[] TOKEN_TYPES; + + public int Yychar() + { + return yychar; + } + + /* + * Resets the Tokenizer to a new Reader. 
+ */ + internal void Reset(System.IO.TextReader r) + { + // reset to default buffer size, if buffer has grown + if (zzBuffer.Length > ZZ_BUFFERSIZE) + { + zzBuffer = new char[ZZ_BUFFERSIZE]; + } + Yyreset(r); + } + + /// <summary> Fills Lucene token with the current token text.</summary> + internal void GetText(Token t) + { + t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + } + + /// <summary> Fills TermAttribute with the current token text.</summary> + internal void GetText(ITermAttribute t) + { + t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + } + + + /// <summary> Creates a new scanner + /// There is also a java.io.InputStream version of this constructor. + /// + /// </summary> + /// <param name="in_Renamed"> the java.io.Reader to read input from. + /// </param> + internal StandardTokenizerImpl(System.IO.TextReader in_Renamed) + { + this.zzReader = in_Renamed; + } + + /// <summary> Creates a new scanner. + /// There is also java.io.Reader version of this constructor. + /// + /// </summary> + /// <param name="in_Renamed"> the java.io.Inputstream to read input from. + /// </param> + internal StandardTokenizerImpl(System.IO.Stream in_Renamed):this(new System.IO.StreamReader(in_Renamed, System.Text.Encoding.Default)) + { + } + + /// <summary> Unpacks the compressed character translation table. + /// + /// </summary> + /// <param name="packed"> the packed character translation table + /// </param> + /// <returns> the unpacked character translation table + /// </returns> + private static char[] ZzUnpackCMap(System.String packed) + { + char[] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 1154) + { + int count = packed[i++]; + char value_Renamed = packed[i++]; + do + map[j++] = value_Renamed; + while (--count > 0); + } + return map; + } + + + /// <summary> Refills the input buffer. + /// </summary> + /// <returns><c>false</c>, iff there was new input. 
+ /// + /// </returns> + /// <exception cref="System.IO.IOException"> if any I/O-Error occurs + /// </exception> + private bool ZzRefill() + { + + /* first: make room (if you can) */ + if (zzStartRead > 0) + { + Array.Copy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead - zzStartRead); + + /* translate stored positions */ + zzEndRead -= zzStartRead; + zzCurrentPos -= zzStartRead; + zzMarkedPos -= zzStartRead; + zzPushbackPos -= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.Length) + { + /* if not: blow it up */ + char[] newBuffer = new char[zzCurrentPos * 2]; + Array.Copy(zzBuffer, 0, newBuffer, 0, zzBuffer.Length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.Read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead); + + if (numRead <= 0) + { + return true; + } + else + { + zzEndRead += numRead; + return false; + } + } + + + /// <summary> Closes the input stream.</summary> + public void Yyclose() + { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.Close(); + } + + + /// <summary> Resets the scanner to read from a new input stream. + /// Does not close the old reader. + /// + /// All internal variables are reset, the old input stream + /// <b>cannot</b> be reused (internal buffer is discarded and lost). + /// Lexical state is set to <tt>ZZ_INITIAL</tt>. 
+ /// + /// </summary> + /// <param name="reader"> the new input stream + /// </param> + public void Yyreset(System.IO.TextReader reader) + { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = zzPushbackPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /// <summary> Returns the current lexical state.</summary> + public int Yystate() + { + return zzLexicalState; + } + + + /// <summary> Enters a new lexical state + /// + /// </summary> + /// <param name="newState">the new lexical state + /// </param> + public void Yybegin(int newState) + { + zzLexicalState = newState; + } + + + /// <summary> Returns the text matched by the current regular expression.</summary> + public System.String Yytext() + { + return new System.String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + } + + + /// <summary> Returns the character at position <tt>pos</tt> from the + /// matched text. + /// + /// It is equivalent to yytext().charAt(pos), but faster + /// + /// </summary> + /// <param name="pos">the position of the character to fetch. + /// A value from 0 to yylength()-1. + /// + /// </param> + /// <returns> the character at position pos + /// </returns> + public char Yycharat(int pos) + { + return zzBuffer[zzStartRead + pos]; + } + + + /// <summary> Returns the length of the matched text region.</summary> + public int Yylength() + { + return zzMarkedPos - zzStartRead; + } + + + /// <summary> Reports an error that occured while scanning. + /// + /// In a wellformed scanner (no or only correct usage of + /// yypushback(int) and a match-all fallback rule) this method + /// will only be called with things that "Can't Possibly Happen". + /// If this method is called, something is seriously wrong + /// (e.g. a JFlex bug producing a faulty scanner etc.). + /// + /// Usual syntax/scanner level error handling should be done + /// in error fallback rules. 
+ /// + /// </summary> + /// <param name="errorCode"> the code of the errormessage to display + /// </param> + private void ZzScanError(int errorCode) + { + System.String message; + try + { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (System.IndexOutOfRangeException) + { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new System.ApplicationException(message); + } + + + /// <summary> Pushes the specified amount of characters back into the input stream. + /// + /// They will be read again by then next call of the scanning method + /// + /// </summary> + /// <param name="number"> the number of characters to be read again. + /// This number must not be greater than yylength()! + /// </param> + public virtual void Yypushback(int number) + { + if (number > Yylength()) + ZzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /// <summary> Resumes scanning until the next regular expression is matched, + /// the end of input is encountered or an I/O-Error occurs. + /// + /// </summary> + /// <returns> the next token + /// </returns> + /// <exception cref="System.IO.IOException"> if any I/O-Error occurs + /// </exception> + public virtual int GetNextToken() + { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char[] zzBufferL = zzBuffer; + char[] zzCMapL = ZZ_CMAP; + + int[] zzTransL = ZZ_TRANS; + int[] zzRowMapL = ZZ_ROWMAP; + int[] zzAttrL = ZZ_ATTRIBUTE; + + while (true) + { + zzMarkedPosL = zzMarkedPos; + + yychar += zzMarkedPosL - zzStartRead; + + zzAction = - 1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = zzLexicalState; + + + { + while (true) + { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) + { + zzInput = YYEOF; + goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place + } + else + { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos 
= zzMarkedPosL; + bool eof = ZzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) + { + zzInput = YYEOF; + goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place + } + else + { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]]; + if (zzNext == - 1) + { + goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place + } + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ((zzAttributes & 1) == 1) + { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ((zzAttributes & 8) == 8) + { + goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place + } + } + } + } + +zzForAction_brk: ; // {{Aroush-2.9}} this 'lable' maybe in the wrong place + + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0?zzAction:ZZ_ACTION[zzAction]) + { + + case 4: + { + return HOST; + } + + case 11: break; + + case 9: + { + return ACRONYM; + } + + case 12: break; + + case 8: + { + return ACRONYM_DEP; + } + + case 13: break; + + case 1: + { + /* ignore */ + } + goto case 14; + + case 14: break; + + case 5: + { + return NUM; + } + + case 15: break; + + case 3: + { + return CJ; + } + + case 16: break; + + case 2: + { + return ALPHANUM; + } + + case 17: break; + + case 7: + { + return COMPANY; + } + + case 18: break; + + case 6: + { + return APOSTROPHE; + } + + case 19: break; + + case 10: + { + return EMAIL; + } + + case 20: break; + + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) + { + zzAtEOF = true; + return YYEOF; + } + else + { + ZzScanError(ZZ_NO_MATCH); + } + break; + + } + } + } + static StandardTokenizerImpl() + { + ALPHANUM = StandardTokenizer.ALPHANUM; + APOSTROPHE = StandardTokenizer.APOSTROPHE; + ACRONYM = StandardTokenizer.ACRONYM; + COMPANY = 
StandardTokenizer.COMPANY; + EMAIL = StandardTokenizer.EMAIL; + HOST = StandardTokenizer.HOST; + NUM = StandardTokenizer.NUM; + CJ = StandardTokenizer.CJ; + ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; + TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; + } + } +}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System.Collections.Generic;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis
{
    /// <summary> Filters <see cref="LetterTokenizer" /> with <see cref="LowerCaseFilter" /> and
    /// <see cref="StopFilter" />.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StopAnalyzer:
    /// <list type="bullet">
    /// <item>As of 2.9, position increments are preserved</item>
    /// </list>
    /// </summary>
    public sealed class StopAnalyzer : Analyzer
    {
        // Stop set used by every TokenStream this analyzer produces.
        private readonly ISet<string> stopWords;
        // Snapshot of the version-dependent default taken at construction time.
        private readonly bool enablePositionIncrements;

        /// <summary>An unmodifiable set containing some common English words that are not usually useful
        /// for searching.
        /// </summary>
        // readonly: the field is assigned exactly once, in the static constructor below;
        // without readonly any caller could silently replace the shared set.
        public static readonly ISet<string> ENGLISH_STOP_WORDS_SET;

        /// <summary> Builds an analyzer which removes words in <see cref="ENGLISH_STOP_WORDS_SET"/>.</summary>
        /// <param name="matchVersion">See <a href="#version">above</a></param>
        public StopAnalyzer(Version matchVersion)
        {
            stopWords = ENGLISH_STOP_WORDS_SET;
            enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
        }

        /// <summary>Builds an analyzer with the stop words from the given set.</summary>
        /// <param name="matchVersion">See <a href="#version">above</a></param>
        /// <param name="stopWords">Set of stop words to remove</param>
        public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
        {
            this.stopWords = stopWords;
            enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
        }

        /// <summary> Builds an analyzer with the stop words from the given file.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"/>
        /// <param name="matchVersion">See <a href="#version">above</a></param>
        /// <param name="stopwordsFile">File to load stop words from</param>
        public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
        {
            stopWords = WordlistLoader.GetWordSet(stopwordsFile);
            enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
        }

        /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
        /// <param name="matchVersion">See <a href="#version">above</a></param>
        /// <param name="stopwords">Reader to load stop words from</param>
        public StopAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
        {
            stopWords = WordlistLoader.GetWordSet(stopwords);
            enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
        }

        /// <summary>Filters LowerCaseTokenizer with StopFilter.</summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
        }

        /// <summary>Per-thread cached tokenizer/filter pair reused by <see cref="ReusableTokenStream"/>.</summary>
        // Plain data holder; the enclosing-instance back-reference of the original
        // port was never read and has been removed.
        private class SavedStreams
        {
            internal Tokenizer source;
            internal TokenStream result;
        }

        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            var streams = (SavedStreams) PreviousTokenStream;
            if (streams == null)
            {
                // First use on this thread: build the chain once and cache it.
                streams = new SavedStreams { source = new LowerCaseTokenizer(reader) };
                streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
                PreviousTokenStream = streams;
            }
            else
            {
                // Subsequent use: just point the cached tokenizer at the new reader.
                streams.source.Reset(reader);
            }
            return streams.result;
        }

        static StopAnalyzer()
        {
            var stopWords = new System.String[]
            {
                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
                "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
                "that", "the", "their", "then", "there", "these", "they", "this",
                "to", "was", "will", "with"
            };
            var stopSet = new CharArraySet(stopWords.Length, false);
            stopSet.AddAll(stopWords);
            ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet);
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using QueryParser = Lucene.Net.QueryParsers.QueryParser;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis
{
    /// <summary> Removes stop words from a token stream.</summary>
    public sealed class StopFilter : TokenFilter
    {
        // The stop set consulted for every token; always a CharArraySet so lookups
        // can run against the raw term buffer without allocating strings.
        private readonly CharArraySet stopWords;
        private bool enablePositionIncrements = false;

        private readonly ITermAttribute termAtt;
        private readonly IPositionIncrementAttribute posIncrAtt;

        /// <summary> Construct a token stream filtering the given input.
        /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
        /// <c>makeStopSet()</c> was used to construct the set) it will be directly used
        /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
        /// directly controls case sensitivity.
        /// <p/>
        /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
        /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
        /// used to specify the case sensitivity of that set.
        /// </summary>
        /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
        /// <param name="input">Input TokenStream</param>
        /// <param name="stopWords">A Set of strings or strings or char[] or any other ToString()-able set representing the stopwords</param>
        /// <param name="ignoreCase">if true, all words are lower cased first</param>
        public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
            : base(input)
        {
            var asCharArraySet = stopWords as CharArraySet;
            if (asCharArraySet != null)
            {
                // Already the right representation; use it directly (ignoreCase is
                // ignored because the set carries its own case behaviour).
                this.stopWords = asCharArraySet;
            }
            else
            {
                var copied = new CharArraySet(stopWords.Count, ignoreCase);
                copied.AddAll(stopWords);
                this.stopWords = copied;
            }
            this.enablePositionIncrements = enablePositionIncrements;
            termAtt = AddAttribute<ITermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        }

        /// <summary> Constructs a filter which removes words from the input
        /// TokenStream that are named in the Set.
        /// </summary>
        /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
        /// <param name="in">Input stream</param>
        /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
        /// <seealso cref="MakeStopSet(String[])"/>
        public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords)
            : this(enablePositionIncrements, @in, stopWords, false)
        {
        }

        /// <summary> Builds a Set from an array of stop words,
        /// appropriate for passing into the StopFilter constructor.
        /// This permits this stopWords construction to be cached once when
        /// an Analyzer is constructed.
        /// </summary>
        /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
        public static ISet<string> MakeStopSet(params string[] stopWords)
        {
            return MakeStopSet(stopWords, false);
        }

        /// <summary> Builds a Set from an array of stop words,
        /// appropriate for passing into the StopFilter constructor.
        /// This permits this stopWords construction to be cached once when
        /// an Analyzer is constructed.
        /// </summary>
        /// <param name="stopWords">A list of strings or char[] or any other ToString()-able list representing the stop words</param>
        /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
        public static ISet<string> MakeStopSet(IList<object> stopWords)
        {
            return MakeStopSet(stopWords, false);
        }

        /// <summary>Builds a case-aware stop set from an array of words.</summary>
        /// <param name="stopWords">An array of stopwords</param>
        /// <param name="ignoreCase">If true, all words are lower cased first.</param>
        /// <returns> a Set containing the words</returns>
        public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
        {
            var result = new CharArraySet(stopWords.Length, ignoreCase);
            result.AddAll(stopWords);
            return result;
        }

        /// <summary>Builds a case-aware stop set from a list of ToString()-able items.</summary>
        /// <param name="stopWords">A List of Strings or char[] or any other toString()-able list representing the stopwords </param>
        /// <param name="ignoreCase">if true, all words are lower cased first</param>
        /// <returns>A Set (<see cref="CharArraySet"/>)containing the words</returns>
        public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
        {
            var result = new CharArraySet(stopWords.Count, ignoreCase);
            foreach (var entry in stopWords)
            {
                result.Add(entry.ToString());
            }
            return result;
        }

        /// <summary> Returns the next input Token whose term() is not a stop word.</summary>
        public override bool IncrementToken()
        {
            // Position increments of every dropped stop word accumulate here and,
            // when enabled, are folded into the next surviving token.
            int pendingIncrement = 0;
            while (input.IncrementToken())
            {
                bool isStopWord = stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength());
                if (!isStopWord)
                {
                    if (enablePositionIncrements)
                    {
                        posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + pendingIncrement;
                    }
                    return true;
                }
                pendingIncrement += posIncrAtt.PositionIncrement;
            }
            // Input exhausted without finding a surviving token.
            return false;
        }

        /// <summary> Returns version-dependent default for enablePositionIncrements. Analyzers
        /// that embed StopFilter use this method when creating the StopFilter. Prior
        /// to 2.9, this returns false. On 2.9 or later, it returns true.
        /// </summary>
        public static bool GetEnablePositionIncrementsVersionDefault(Version matchVersion)
        {
            return matchVersion.OnOrAfter(Version.LUCENE_29);
        }

        /// <summary> If <c>true</c>, this StopFilter will preserve
        /// positions of the incoming tokens (ie, accumulate and
        /// set position increments of the removed stop tokens).
        /// Generally, <c>true</c> is best as it does not
        /// lose information (positions of the original tokens)
        /// during indexing.
        ///
        /// <p/> When set, when a token is stopped
        /// (omitted), the position increment of the following
        /// token is incremented.
        ///
        /// <p/> <b>NOTE</b>: be sure to also
        /// set <see cref="QueryParser.EnablePositionIncrements" /> if
        /// you use QueryParser to create queries.
        /// </summary>
        public bool EnablePositionIncrements
        {
            get { return enablePositionIncrements; }
            set { enablePositionIncrements = value; }
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using Attribute = Lucene.Net.Util.Attribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;

namespace Lucene.Net.Analysis
{
    /// <summary> This TokenFilter provides the ability to set aside attribute states
    /// that have already been analyzed. This is useful in situations where multiple fields share
    /// many common analysis steps and then go their separate ways.
    /// <p/>
    /// It is also useful for doing things like entity extraction or proper noun analysis as
    /// part of the analysis workflow and saving off those tokens for use in another field.
    /// <p/>
    /// It is important that tees are consumed before sinks; if you are not sure which
    /// stream is consumed first you can pass all tokens to the sinks at once using
    /// <see cref="ConsumeAllTokens" />, after which this TokenFilter is exhausted.
    /// Sinks are held through weak references, so an unreferenced sink simply stops
    /// receiving states once it has been garbage collected.
    /// </summary>
    public sealed class TeeSinkTokenFilter : TokenFilter
    {
        /// <summary>Filter that accepts every state; backs <see cref="NewSinkTokenStream()"/>.</summary>
        public class AnonymousClassSinkFilter : SinkFilter
        {
            public override bool Accept(AttributeSource source)
            {
                return true;
            }
        }

        // Weak references let abandoned sinks be collected; dead entries are pruned
        // lazily while iterating (see IncrementToken/End) so the list cannot grow
        // without bound when sinks are created and discarded repeatedly.
        private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>();

        /// <summary> Instantiates a new TeeSinkTokenFilter.</summary>
        public TeeSinkTokenFilter(TokenStream input) : base(input)
        {
        }

        /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream.</summary>
        public SinkTokenStream NewSinkTokenStream()
        {
            return NewSinkTokenStream(ACCEPT_ALL_FILTER);
        }

        /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream
        /// that pass the supplied filter.
        /// </summary>
        /// <seealso cref="SinkFilter"/>
        public SinkTokenStream NewSinkTokenStream(SinkFilter filter)
        {
            var sink = new SinkTokenStream(this.CloneAttributes(), filter);
            sinks.AddLast(new WeakReference(sink));
            return sink;
        }

        /// <summary> Adds a <see cref="SinkTokenStream" /> created by another <c>TeeSinkTokenFilter</c>
        /// to this one. The supplied stream will also receive all consumed tokens.
        /// This method can be used to pass tokens from two different tees to one sink.
        /// </summary>
        /// <exception cref="System.ArgumentException">if the sink was built with an incompatible attribute factory</exception>
        public void AddSinkTokenStream(SinkTokenStream sink)
        {
            // check that sink has correct factory
            if (!this.Factory.Equals(sink.Factory))
            {
                throw new System.ArgumentException("The supplied sink is not compatible to this tee");
            }
            // add eventually missing attribute impls to the existing sink
            foreach (var impl in this.CloneAttributes().GetAttributeImplsIterator())
            {
                sink.AddAttributeImpl(impl);
            }
            sinks.AddLast(new WeakReference(sink));
        }

        /// <summary> <c>TeeSinkTokenFilter</c> passes all tokens to the added sinks
        /// when itself is consumed. To be sure that all tokens from the input
        /// stream are passed to the sinks, you can call this method.
        /// This instance is exhausted after this, but all sinks are instantly available.
        /// </summary>
        public void ConsumeAllTokens()
        {
            while (IncrementToken())
            {
            }
        }

        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            // capture state lazily - maybe no SinkFilter accepts this state
            State state = null;
            var node = sinks.First;
            while (node != null)
            {
                var next = node.Next; // remember before a potential Remove
                var sink = (SinkTokenStream) node.Value.Target;
                if (sink == null)
                {
                    // Sink was garbage collected: drop its node so the list stays small.
                    sinks.Remove(node);
                }
                else if (sink.Accept(this))
                {
                    if (state == null)
                    {
                        state = this.CaptureState();
                    }
                    sink.AddState(state);
                }
                node = next;
            }
            return true;
        }

        public override void End()
        {
            base.End();
            State finalState = CaptureState();
            var node = sinks.First;
            while (node != null)
            {
                var next = node.Next; // remember before a potential Remove
                var sink = (SinkTokenStream) node.Value.Target;
                if (sink == null)
                {
                    // Prune collected sinks here too.
                    sinks.Remove(node);
                }
                else
                {
                    sink.SetFinalState(finalState);
                }
                node = next;
            }
        }

        /// <summary> A filter that decides which <see cref="AttributeSource" /> states to store in the sink.</summary>
        public abstract class SinkFilter
        {
            /// <summary> Returns true, iff the current state of the passed-in <see cref="AttributeSource" /> shall be stored
            /// in the sink.
            /// </summary>
            public abstract bool Accept(AttributeSource source);

            /// <summary> Called by <see cref="SinkTokenStream.Reset()" />. This method does nothing by default
            /// and can optionally be overridden.
            /// </summary>
            public virtual void Reset()
            {
                // nothing to do; can be overridden
            }
        }

        /// <summary>A TokenStream that replays the attribute states a tee stored into it.</summary>
        public sealed class SinkTokenStream : TokenStream
        {
            private readonly LinkedList<State> cachedStates = new LinkedList<State>();
            private State finalState;
            // Non-null once consumption has started; guards against late AddState calls.
            private IEnumerator<AttributeSource.State> it = null;
            private readonly SinkFilter filter;

            internal SinkTokenStream(AttributeSource source, SinkFilter filter)
                : base(source)
            {
                this.filter = filter;
            }

            internal /*private*/ bool Accept(AttributeSource source)
            {
                return filter.Accept(source);
            }

            internal /*private*/ void AddState(AttributeSource.State state)
            {
                if (it != null)
                {
                    throw new System.SystemException("The tee must be consumed before sinks are consumed.");
                }
                cachedStates.AddLast(state);
            }

            internal /*private*/ void SetFinalState(AttributeSource.State finalState)
            {
                this.finalState = finalState;
            }

            public override bool IncrementToken()
            {
                // lazy init the iterator
                if (it == null)
                {
                    it = cachedStates.GetEnumerator();
                }

                if (!it.MoveNext())
                {
                    return false;
                }

                State state = it.Current;
                RestoreState(state);
                return true;
            }

            public override void End()
            {
                if (finalState != null)
                {
                    RestoreState(finalState);
                }
            }

            public override void Reset()
            {
                it = cachedStates.GetEnumerator();
            }

            protected override void Dispose(bool disposing)
            {
                // Do nothing: this stream owns no unmanaged resources.
            }
        }

        // Shared accept-everything filter; safe to share because it is stateless.
        private static readonly SinkFilter ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter();
    }
}
\ No newline at end of file diff --git a/src/core/Analysis/Token.cs b/src/core/Analysis/Token.cs new file mode 100644 index 0000000..3357f34 --- /dev/null +++ b/src/core/Analysis/Token.cs @@ -0,0 +1,852 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using Lucene.Net.Analysis.Tokenattributes; +using Lucene.Net.Support; +using Lucene.Net.Util; +using Payload = Lucene.Net.Index.Payload; +using TermPositions = Lucene.Net.Index.TermPositions; +using ArrayUtil = Lucene.Net.Util.ArrayUtil; +using Attribute = Lucene.Net.Util.Attribute; + +namespace Lucene.Net.Analysis +{ + + /// <summary>A Token is an occurrence of a term from the text of a field. It consists of + /// a term's text, the start and end offset of the term in the text of the field, + /// and a type string. + /// <p/> + /// The start and end offsets permit applications to re-associate a token with + /// its source text, e.g., to display highlighted query terms in a document + /// browser, or to show matching text fragments in a <abbr + /// title="KeyWord In Context">KWIC</abbr> display, etc. + /// <p/> + /// The type is a string, assigned by a lexical analyzer + /// (a.k.a. 
tokenizer), naming the lexical or syntactic class that the token + /// belongs to. For example an end of sentence marker token might be implemented + /// with type "eos". The default token type is "word". + /// <p/> + /// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable + /// length byte array. Use <see cref="TermPositions.PayloadLength" /> and + /// <see cref="TermPositions.GetPayload(byte[], int)" /> to retrieve the payloads from the index. + /// </summary> + /// <summary><br/><br/> + /// </summary> + /// <summary><p/><b>NOTE:</b> As of 2.9, Token implements all <see cref="IAttribute" /> interfaces + /// that are part of core Lucene and can be found in the <see cref="Lucene.Net.Analysis.Tokenattributes"/> namespace. + /// Even though it is not necessary to use Token anymore, with the new TokenStream API it can + /// be used as convenience class that implements all <see cref="IAttribute" />s, which is especially useful + /// to easily switch from the old to the new TokenStream API. + /// <br/><br/> + /// <p/>Tokenizers and TokenFilters should try to re-use a Token instance when + /// possible for best performance, by implementing the + /// <see cref="TokenStream.IncrementToken()" /> API. + /// Failing that, to create a new Token you should first use + /// one of the constructors that starts with null text. To load + /// the token from a char[] use <see cref="SetTermBuffer(char[], int, int)" />. + /// To load from a String use <see cref="SetTermBuffer(String)" /> or <see cref="SetTermBuffer(String, int, int)" />. + /// Alternatively you can get the Token's termBuffer by calling either <see cref="TermBuffer()" />, + /// if you know that your text is shorter than the capacity of the termBuffer + /// or <see cref="ResizeTermBuffer(int)" />, if there is any possibility + /// that you may need to grow the buffer. 
Fill in the characters of your term into this + /// buffer, with <see cref="string.ToCharArray(int, int)" /> if loading from a string, + /// or with <see cref="Array.Copy(Array, long, Array, long, long)" />, and finally call <see cref="SetTermLength(int)" /> to + /// set the length of the term text. See <a target="_top" + /// href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a> + /// for details.<p/> + /// <p/>Typical Token reuse patterns: + /// <list type="bullet"> + /// <item> Copying text from a string (type is reset to <see cref="DEFAULT_TYPE" /> if not + /// specified):<br/> + /// <code> + /// return reusableToken.reinit(string, startOffset, endOffset[, type]); + /// </code> + /// </item> + /// <item> Copying some text from a string (type is reset to <see cref="DEFAULT_TYPE" /> + /// if not specified):<br/> + /// <code> + /// return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]); + /// </code> + /// </item> + /// <item> Copying text from char[] buffer (type is reset to <see cref="DEFAULT_TYPE" /> + /// if not specified):<br/> + /// <code> + /// return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]); + /// </code> + /// </item> + /// <item> Copying some text from a char[] buffer (type is reset to + /// <see cref="DEFAULT_TYPE" /> if not specified):<br/> + /// <code> + /// return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]); + /// </code> + /// </item> + /// <item> Copying from one one Token to another (type is reset to + /// <see cref="DEFAULT_TYPE" /> if not specified):<br/> + /// <code> + /// return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]); + /// </code> + /// </item> + /// </list> + /// A few things to note: + /// <list type="bullet"> + /// <item>clear() initializes all of the fields to default values. 
This was changed in contrast to Lucene 2.4, but should affect no one.</item> + /// <item>Because <c>TokenStreams</c> can be chained, one cannot assume that the <c>Token's</c> current type is correct.</item> + /// <item>The startOffset and endOffset represent the start and offset in the + /// source text, so be careful in adjusting them.</item> + /// <item>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</item> + /// </list> + /// <p/> + /// </summary> + /// <seealso cref="Lucene.Net.Index.Payload"> + /// </seealso> + [Serializable] + public class Token : Attribute, ITermAttribute, ITypeAttribute, IPositionIncrementAttribute, IFlagsAttribute, IOffsetAttribute, IPayloadAttribute + { + public const String DEFAULT_TYPE = "word"; + + private const int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer; + private int termLength; + private int startOffset, endOffset; + private string type = DEFAULT_TYPE; + private int flags; + private Payload payload; + private int positionIncrement = 1; + + /// <summary>Constructs a Token will null text. </summary> + public Token() + { + } + + /// <summary>Constructs a Token with null text and start & end + /// offsets. + /// </summary> + /// <param name="start">start offset in the source text</param> + /// <param name="end">end offset in the source text</param> + public Token(int start, int end) + { + startOffset = start; + endOffset = end; + } + + /// <summary>Constructs a Token with null text and start & end + /// offsets plus the Token type. + /// </summary> + /// <param name="start">start offset in the source text</param> + /// <param name="end">end offset in the source text</param> + /// <param name="typ">the lexical type of this Token</param> + public Token(int start, int end, String typ) + { + startOffset = start; + endOffset = end; + type = typ; + } + + /// <summary> Constructs a Token with null text and start & end + /// offsets plus flags. 
NOTE: flags is EXPERIMENTAL. + /// </summary> + /// <param name="start">start offset in the source text</param> + /// <param name="end">end offset in the source text</param> + /// <param name="flags">The bits to set for this token</param> + public Token(int start, int end, int flags) + { + startOffset = start; + endOffset = end; + this.flags = flags; + } + + /// <summary>Constructs a Token with the given term text, and start + /// & end offsets. The type defaults to "word." + /// <b>NOTE:</b> for better indexing speed you should + /// instead use the char[] termBuffer methods to set the + /// term text. + /// </summary> + /// <param name="text">term text</param> + /// <param name="start">start offset</param> + /// <param name="end">end offset</param> + public Token(String text, int start, int end) + { + SetTermBuffer(text); + startOffset = start; + endOffset = end; + } + + /// <summary>Constructs a Token with the given text, start and end + /// offsets, & type. <b>NOTE:</b> for better indexing + /// speed you should instead use the char[] termBuffer + /// methods to set the term text. + /// </summary> + /// <param name="text">term text</param> + /// <param name="start">start offset</param> + /// <param name="end">end offset</param> + /// <param name="typ">token type</param> + public Token(System.String text, int start, int end, System.String typ) + { + SetTermBuffer(text); + startOffset = start; + endOffset = end; + type = typ; + } + + /// <summary> Constructs a Token with the given text, start and end + /// offsets, & type. <b>NOTE:</b> for better indexing + /// speed you should instead use the char[] termBuffer + /// methods to set the term text. 
+ /// </summary> + /// <param name="text"></param> + /// <param name="start"></param> + /// <param name="end"></param> + /// <param name="flags">token type bits</param> + public Token(System.String text, int start, int end, int flags) + { + SetTermBuffer(text); + startOffset = start; + endOffset = end; + this.flags = flags; + } + + /// <summary> Constructs a Token with the given term buffer (offset + /// & length), start and end + /// offsets + /// </summary> + /// <param name="startTermBuffer"></param> + /// <param name="termBufferOffset"></param> + /// <param name="termBufferLength"></param> + /// <param name="start"></param> + /// <param name="end"></param> + public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) + { + SetTermBuffer(startTermBuffer, termBufferOffset, termBufferLength); + startOffset = start; + endOffset = end; + } + + /// <summary>Set the position increment. This determines the position of this token + /// relative to the previous Token in a <see cref="TokenStream" />, used in phrase + /// searching. + /// + /// <p/>The default value is one. + /// + /// <p/>Some common uses for this are:<list> + /// + /// <item>Set it to zero to put multiple terms in the same position. This is + /// useful if, e.g., a word has multiple stems. Searches for phrases + /// including either stem will match. In this case, all but the first stem's + /// increment should be set to zero: the increment of the first instance + /// should be one. Repeating a token with an increment of zero can also be + /// used to boost the scores of matches on that token.</item> + /// + /// <item>Set it to values greater than one to inhibit exact phrase matches. + /// If, for example, one does not want phrases to match across removed stop + /// words, then one could build a stop word filter that removes stop words and + /// also sets the increment to the number of stop words removed before each + /// non-stop word. 
Then exact phrase queries will only match when the terms + /// occur with no intervening stop words.</item> + /// + /// </list> + /// </summary> + /// <value> the distance from the prior term </value> + /// <seealso cref="Lucene.Net.Index.TermPositions"> + /// </seealso> + public virtual int PositionIncrement + { + set + { + if (value < 0) + throw new System.ArgumentException("Increment must be zero or greater: " + value); + this.positionIncrement = value; + } + get { return positionIncrement; } + } + + /// <summary>Returns the Token's term text. + /// + /// This method has a performance penalty + /// because the text is stored internally in a char[]. If + /// possible, use <see cref="TermBuffer()" /> and <see cref="TermLength()"/> + /// directly instead. If you really need a + /// String, use this method, which is nothing more than + /// a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b> + /// </summary> + public string Term + { + get + { + InitTermBuffer(); + return new System.String(termBuffer, 0, termLength); + } + } + + /// <summary>Copies the contents of buffer, starting at offset for + /// length characters, into the termBuffer array. 
+ /// </summary> + /// <param name="buffer">the buffer to copy</param> + /// <param name="offset">the index in the buffer of the first character to copy</param> + /// <param name="length">the number of characters to copy</param> + public void SetTermBuffer(char[] buffer, int offset, int length) + { + GrowTermBuffer(length); + Array.Copy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + /// <summary>Copies the contents of buffer into the termBuffer array.</summary> + /// <param name="buffer">the buffer to copy + /// </param> + public void SetTermBuffer(System.String buffer) + { + int length = buffer.Length; + GrowTermBuffer(length); + TextSupport.GetCharsFromString(buffer, 0, length, termBuffer, 0); + termLength = length; + } + + /// <summary>Copies the contents of buffer, starting at offset and continuing + /// for length characters, into the termBuffer array. + /// </summary> + /// <param name="buffer">the buffer to copy + /// </param> + /// <param name="offset">the index in the buffer of the first character to copy + /// </param> + /// <param name="length">the number of characters to copy + /// </param> + public void SetTermBuffer(System.String buffer, int offset, int length) + { + System.Diagnostics.Debug.Assert(offset <= buffer.Length); + System.Diagnostics.Debug.Assert(offset + length <= buffer.Length); + GrowTermBuffer(length); + TextSupport.GetCharsFromString(buffer, offset, offset + length, termBuffer, 0); + termLength = length; + } + + /// <summary>Returns the internal termBuffer character array which + /// you can then directly alter. If the array is too + /// small for your token, use <see cref="ResizeTermBuffer(int)" /> + /// to increase it. After + /// altering the buffer be sure to call <see cref="SetTermLength" /> + /// to record the number of valid + /// characters that were placed into the termBuffer. 
+ /// </summary> + public char[] TermBuffer() + { + InitTermBuffer(); + return termBuffer; + } + + /// <summary>Grows the termBuffer to at least size newSize, preserving the + /// existing content. Note: If the next operation is to change + /// the contents of the term buffer use + /// <see cref="SetTermBuffer(char[], int, int)" />, + /// <see cref="SetTermBuffer(String)" />, or + /// <see cref="SetTermBuffer(String, int, int)" /> + /// to optimally combine the resize with the setting of the termBuffer. + /// </summary> + /// <param name="newSize">minimum size of the new termBuffer + /// </param> + /// <returns> newly created termBuffer with length >= newSize + /// </returns> + public virtual char[] ResizeTermBuffer(int newSize) + { + if (termBuffer == null) + { + termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)]; + } + else + { + if (termBuffer.Length < newSize) + { + // Not big enough; create a new array with slight + // over allocation and preserve content + var newCharBuffer = new char[ArrayUtil.GetNextSize(newSize)]; + Array.Copy(termBuffer, 0, newCharBuffer, 0, termBuffer.Length); + termBuffer = newCharBuffer; + } + } + return termBuffer; + } + + /// <summary>Allocates a buffer char[] of at least newSize, without preserving the existing content. 
+ /// its always used in places that set the content + /// </summary> + /// <param name="newSize">minimum size of the buffer + /// </param> + private void GrowTermBuffer(int newSize) + { + if (termBuffer == null) + { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE?MIN_BUFFER_SIZE:newSize)]; + } + else + { + if (termBuffer.Length < newSize) + { + // Not big enough; create a new array with slight + // over allocation: + termBuffer = new char[ArrayUtil.GetNextSize(newSize)]; + } + } + } + + private void InitTermBuffer() + { + if (termBuffer == null) + { + termBuffer = new char[ArrayUtil.GetNextSize(MIN_BUFFER_SIZE)]; + termLength = 0; + } + } + + /// <summary>Return number of valid characters (length of the term) + /// in the termBuffer array. + /// </summary> + public int TermLength() + { + InitTermBuffer(); + return termLength; + } + + /// <summary>Set number of valid characters (length of the term) in + /// the termBuffer array. Use this to truncate the termBuffer + /// or to synchronize with external manipulation of the termBuffer. + /// Note: to grow the size of the array, + /// use <see cref="ResizeTermBuffer(int)" /> first. + /// </summary> + /// <param name="length">the truncated length + /// </param> + public void SetTermLength(int length) + { + InitTermBuffer(); + if (length > termBuffer.Length) + throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")"); + termLength = length; + } + + /// <summary>Gets or sets this Token's starting offset, the position of the first character + /// corresponding to this token in the source text. + /// Note that the difference between endOffset() and startOffset() may not be + /// equal to <see cref="TermLength"/>, as the term text may have been altered by a + /// stemmer or some other filter. 
+ /// </summary> + public virtual int StartOffset + { + get { return startOffset; } + set { this.startOffset = value; } + } + + /// <summary>Gets or sets this Token's ending offset, one greater than the position of the + /// last character corresponding to this token in the source text. The length + /// of the token in the source text is (endOffset - startOffset). + /// </summary> + public virtual int EndOffset + { + get { return endOffset; } + set { this.endOffset = value; } + } + + /// <summary>Set the starting and ending offset. + /// See StartOffset() and EndOffset() + /// </summary> + public virtual void SetOffset(int startOffset, int endOffset) + { + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + /// <summary>Returns this Token's lexical type. Defaults to "word". </summary> + public string Type + { + get { return type; } + set { this.type = value; } + } + + /// <summary> EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. + /// <p/> + /// + /// Get the bitset for any bits that have been set. This is completely distinct from <see cref="Type()" />, although they do share similar purposes. + /// The flags can be used to encode information about the token for use by other <see cref="TokenFilter"/>s. 
+ /// + /// + /// </summary> + /// <value> The bits </value> + public virtual int Flags + { + get { return flags; } + set { flags = value; } + } + + /// <summary> Returns this Token's payload.</summary> + public virtual Payload Payload + { + get { return payload; } + set { payload = value; } + } + + public override String ToString() + { + var sb = new System.Text.StringBuilder(); + sb.Append('('); + InitTermBuffer(); + if (termBuffer == null) + sb.Append("null"); + else + sb.Append(termBuffer, 0, termLength); + sb.Append(',').Append(startOffset).Append(',').Append(endOffset); + if (!type.Equals("word")) + sb.Append(",type=").Append(type); + if (positionIncrement != 1) + sb.Append(",posIncr=").Append(positionIncrement); + sb.Append(')'); + return sb.ToString(); + } + + /// <summary>Resets the term text, payload, flags, and positionIncrement, + /// startOffset, endOffset and token type to default. + /// </summary> + public override void Clear() + { + payload = null; + // Leave termBuffer to allow re-use + termLength = 0; + positionIncrement = 1; + flags = 0; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; + } + + public override System.Object Clone() + { + var t = (Token) base.Clone(); + // Do a deep clone + if (termBuffer != null) + { + t.termBuffer = new char[termBuffer.Length]; + termBuffer.CopyTo(t.termBuffer, 0); + } + if (payload != null) + { + t.payload = (Payload) payload.Clone(); + } + return t; + } + + /// <summary>Makes a clone, but replaces the term buffer & + /// start/end offset in the process. This is more + /// efficient than doing a full clone (and then calling + /// setTermBuffer) because it saves a wasted copy of the old + /// termBuffer. 
+ /// </summary> + public virtual Token Clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) + { + var t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset) + {positionIncrement = positionIncrement, flags = flags, type = type}; + if (payload != null) + t.payload = (Payload) payload.Clone(); + return t; + } + + public override bool Equals(Object obj) + { + if (obj == this) + return true; + + var other = obj as Token; + if (other == null) + return false; + + InitTermBuffer(); + other.InitTermBuffer(); + + if (termLength == other.termLength && startOffset == other.startOffset && endOffset == other.endOffset && + flags == other.flags && positionIncrement == other.positionIncrement && SubEqual(type, other.type) && + SubEqual(payload, other.payload)) + { + for (int i = 0; i < termLength; i++) + if (termBuffer[i] != other.termBuffer[i]) + return false; + return true; + } + return false; + } + + private bool SubEqual(System.Object o1, System.Object o2) + { + if (o1 == null) + return o2 == null; + return o1.Equals(o2); + } + + public override int GetHashCode() + { + InitTermBuffer(); + int code = termLength; + code = code * 31 + startOffset; + code = code * 31 + endOffset; + code = code * 31 + flags; + code = code * 31 + positionIncrement; + code = code * 31 + type.GetHashCode(); + code = (payload == null?code:code * 31 + payload.GetHashCode()); + code = code * 31 + ArrayUtil.HashCode(termBuffer, 0, termLength); + return code; + } + + // like clear() but doesn't clear termBuffer/text + private void ClearNoTermBuffer() + { + payload = null; + positionIncrement = 1; + flags = 0; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; + } + + /// <summary>Shorthand for calling <see cref="Clear" />, + /// <see cref="SetTermBuffer(char[], int, int)" />, + /// <see cref="StartOffset" />, + /// <see cref="EndOffset" />, + /// <see cref="Type" /> + /// </summary> + /// <returns> this Token instance 
+ /// </returns> + public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType) + { + ClearNoTermBuffer(); + payload = null; + positionIncrement = 1; + SetTermBuffer(newTermBuffer, newTermOffset, newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = newType; + return this; + } + + /// <summary>Shorthand for calling <see cref="Clear" />, + /// <see cref="SetTermBuffer(char[], int, int)" />, + /// <see cref="StartOffset" />, + /// <see cref="EndOffset" /> + /// <see cref="Type" /> on Token.DEFAULT_TYPE + /// </summary> + /// <returns> this Token instance + /// </returns> + public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) + { + ClearNoTermBuffer(); + SetTermBuffer(newTermBuffer, newTermOffset, newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = DEFAULT_TYPE; + return this; + } + + /// <summary>Shorthand for calling <see cref="Clear" />, + /// <see cref="SetTermBuffer(String)" />, + /// <see cref="StartOffset" />, + /// <see cref="EndOffset" /> + /// <see cref="Type" /> + /// </summary> + /// <returns> this Token instance + /// </returns> + public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset, System.String newType) + { + ClearNoTermBuffer(); + SetTermBuffer(newTerm); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = newType; + return this; + } + + /// <summary>Shorthand for calling <see cref="Clear" />, + /// <see cref="SetTermBuffer(String, int, int)" />, + /// <see cref="StartOffset" />, + /// <see cref="EndOffset" /> + /// <see cref="Type" /> + /// </summary> + /// <returns> this Token instance + /// </returns> + public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType) + { + ClearNoTermBuffer(); 
+ SetTermBuffer(newTerm, newTermOffset, newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = newType; + return this; + } + + /// <summary>Shorthand for calling <see cref="Clear" />, + /// <see cref="SetTermBuffer(String)" />, + /// <see cref="StartOffset" />, + /// <see cref="EndOffset" /> + /// <see cref="Type" /> on Token.DEFAULT_TYPE + /// </summary> + /// <returns> this Token instance + /// </returns> + public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset) + { + ClearNoTermBuffer(); + SetTermBuffer(newTerm); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = DEFAULT_TYPE; + return this; + } + + /// <summary>Shorthand for calling <see cref="Clear" />, + /// <see cref="SetTermBuffer(String, int, int)" />, + /// <see cref="StartOffset" />, + /// <see cref="EndOffset" /> + /// <see cref="Type" /> on Token.DEFAULT_TYPE + /// </summary> + /// <returns> this Token instance + /// </returns> + public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) + { + ClearNoTermBuffer(); + SetTermBuffer(newTerm, newTermOffset, newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = DEFAULT_TYPE; + return this; + } + + /// <summary> Copy the prototype token's fields into this one. Note: Payloads are shared.</summary> + /// <param name="prototype"> + /// </param> + public virtual void Reinit(Token prototype) + { + prototype.InitTermBuffer(); + SetTermBuffer(prototype.termBuffer, 0, prototype.termLength); + positionIncrement = prototype.positionIncrement; + flags = prototype.flags; + startOffset = prototype.startOffset; + endOffset = prototype.endOffset; + type = prototype.type; + payload = prototype.payload; + } + + /// <summary> Copy the prototype token's fields into this one, with a different term. 
Note: Payloads are shared.</summary> + /// <param name="prototype"> + /// </param> + /// <param name="newTerm"> + /// </param> + public virtual void Reinit(Token prototype, System.String newTerm) + { + SetTermBuffer(newTerm); + positionIncrement = prototype.positionIncrement; + flags = prototype.flags; + startOffset = prototype.startOffset; + endOffset = prototype.endOffset; + type = prototype.type; + payload = prototype.payload; + } + + /// <summary> Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.</summary> + /// <param name="prototype"> + /// </param> + /// <param name="newTermBuffer"> + /// </param> + /// <param name="offset"> + /// </param> + /// <param name="length"> + /// </param> + public virtual void Reinit(Token prototype, char[] newTermBuffer, int offset, int length) + { + SetTermBuffer(newTermBuffer, offset, length); + positionIncrement = prototype.positionIncrement; + flags = prototype.flags; + startOffset = prototype.startOffset; + endOffset = prototype.endOffset; + type = prototype.type; + payload = prototype.payload; + } + + public override void CopyTo(Attribute target) + { + if (target is Token) + { + var to = (Token) target; + to.Reinit(this); + // reinit shares the payload, so clone it: + if (payload != null) + { + to.payload = (Payload) payload.Clone(); + } + } + else + { + InitTermBuffer(); + ((ITermAttribute) target).SetTermBuffer(termBuffer, 0, termLength); + ((IOffsetAttribute) target).SetOffset(startOffset, endOffset); + ((IPositionIncrementAttribute) target).PositionIncrement = positionIncrement; + ((IPayloadAttribute) target).Payload = (payload == null)?null:(Payload) payload.Clone(); + ((IFlagsAttribute) target).Flags = flags; + ((ITypeAttribute) target).Type = type; + } + } + + ///<summary> + /// Convenience factory that returns <c>Token</c> as implementation for the basic + /// attributes and return the default impl (with "Impl" appended) for all other + /// attributes. 
+ /// @since 3.0 + /// </summary> + public static AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY = + new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + + /// <summary> + /// <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes + /// and for all other attributes calls the given delegate factory. + /// </summary> + public class TokenAttributeFactory : AttributeSource.AttributeFactory + { + + private readonly AttributeSource.AttributeFactory _delegateFactory; + + /// <summary> + /// <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes + /// and for all other attributes calls the given delegate factory. + /// </summary> + public TokenAttributeFactory(AttributeSource.AttributeFactory delegateFactory) + { + this._delegateFactory = delegateFactory; + } + + public override Attribute CreateAttributeInstance<T>() + { + return typeof(T).IsAssignableFrom(typeof(Token)) + ? new Token() + : _delegateFactory.CreateAttributeInstance<T>(); + } + + public override bool Equals(Object other) + { + if (this == other) return true; + + var af = other as TokenAttributeFactory; + return af != null && _delegateFactory.Equals(af._delegateFactory); + } + + public override int GetHashCode() + { + return _delegateFactory.GetHashCode() ^ 0x0a45aa31; + } + } + } +}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace Lucene.Net.Analysis
{
    /// <summary> A TokenFilter is a TokenStream whose input is another TokenStream.
    /// <p/>
    /// This is an abstract class; subclasses must override <see cref="TokenStream.IncrementToken()" />.
    /// </summary>
    /// <seealso cref="TokenStream">
    /// </seealso>
    public abstract class TokenFilter : TokenStream
    {
        /// <summary>The source of tokens for this filter. </summary>
        protected internal TokenStream input;

        // Guards against double-dispose.
        private bool disposed;

        /// <summary>Construct a token stream filtering the given input. </summary>
        protected internal TokenFilter(TokenStream input)
            : base(input)
        {
            this.input = input;
        }

        /// <summary>Performs end-of-stream operations, if any, and then calls <c>End()</c> on the
        /// input TokenStream.<p/>
        /// <b>NOTE:</b> Be sure to call <c>base.End()</c> first when overriding this method.
        /// </summary>
        public override void End()
        {
            input.End();
        }

        protected override void Dispose(bool disposing)
        {
            if (disposed)
            {
                return;
            }

            // Only touch managed state when disposing deterministically.
            if (disposing && input != null)
            {
                input.Close();
            }

            //input = null;
            disposed = true;
        }

        /// <summary>Reset the filter as well as the input TokenStream. </summary>
        public override void Reset()
        {
            input.Reset();
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using AttributeSource = Lucene.Net.Util.AttributeSource;

namespace Lucene.Net.Analysis
{

    /// <summary> A <c>TokenStream</c> enumerates the sequence of tokens, either from
    /// <see cref="Field" />s of a <see cref="Document" /> or from query text.
    /// <p/>
    /// This is an abstract class. Concrete subclasses are:
    /// <list type="bullet">
    /// <item><see cref="Tokenizer" />, a <c>TokenStream</c> whose input is a Reader; and</item>
    /// <item><see cref="TokenFilter" />, a <c>TokenStream</c> whose input is another
    /// <c>TokenStream</c>.</item>
    /// </list>
    /// <c>TokenStream</c> extends <see cref="AttributeSource" />, which provides
    /// access to all of the token <see cref="IAttribute" />s for the <c>TokenStream</c>.
    /// Note that only one instance per <see cref="Util.Attribute" /> is created and reused
    /// for every token.
    /// <p/>
    /// <b>The workflow of the <c>TokenStream</c> API is as follows:</b>
    /// <list type="bullet">
    /// <item>Instantiation of <c>TokenStream</c>/<see cref="TokenFilter" />s which add/get
    /// attributes to/from the <see cref="AttributeSource" />.</item>
    /// <item>The consumer calls <see cref="TokenStream.Reset()" />.</item>
    /// <item>The consumer retrieves attributes from the stream and stores local
    /// references to all attributes it wants to access.</item>
    /// <item>The consumer calls <see cref="IncrementToken()" /> until it returns false and
    /// consumes the attributes after each call.</item>
    /// <item>The consumer calls <see cref="End()" /> so that any end-of-stream operations
    /// can be performed.</item>
    /// <item>The consumer calls <see cref="Close()" /> to release any resource when finished
    /// using the <c>TokenStream</c>.</item>
    /// </list>
    /// To make sure that filters and consumers know which attributes are available,
    /// the attributes must be added during instantiation. Filters and consumers are
    /// not required to check for availability of attributes in
    /// <see cref="IncrementToken()" />.
    /// <p/>
    /// To capture a current state of a <c>TokenStream</c> for buffering purposes
    /// (see <see cref="CachingTokenFilter" />, <see cref="TeeSinkTokenFilter" />),
    /// <see cref="AttributeSource.CaptureState" /> and <see cref="AttributeSource.RestoreState" />
    /// can be used.
    /// </summary>
    public abstract class TokenStream : AttributeSource, IDisposable
    {
        /// <summary> A TokenStream using the default attribute factory.</summary>
        protected internal TokenStream()
        { }

        /// <summary> A TokenStream that uses the same attributes as the supplied one.</summary>
        protected internal TokenStream(AttributeSource input)
            : base(input)
        { }

        /// <summary> A TokenStream using the supplied AttributeFactory for creating new <see cref="IAttribute" /> instances.</summary>
        protected internal TokenStream(AttributeFactory factory)
            : base(factory)
        { }

        /// <summary> Consumers (i.e., <see cref="IndexWriter" />) use this method to advance the stream to
        /// the next token. Implementing classes must implement this method and update
        /// the appropriate <see cref="Util.Attribute" />s with the attributes of the next
        /// token.
        ///
        /// The producer must make no assumptions about the attributes after the
        /// method has been returned: the caller may arbitrarily change it. If the
        /// producer needs to preserve the state for subsequent calls, it can use
        /// <see cref="AttributeSource.CaptureState" /> to create a copy of the current attribute state.
        ///
        /// This method is called for every token of a document, so an efficient
        /// implementation is crucial for good performance. To avoid calls to
        /// <see cref="AttributeSource.AddAttribute{T}()" /> and <see cref="AttributeSource.GetAttribute{T}()" />,
        /// references to all <see cref="Util.Attribute" />s that this stream uses should be
        /// retrieved during instantiation.
        /// </summary>
        /// <returns> false for end of stream; true otherwise</returns>
        public abstract bool IncrementToken();

        /// <summary> This method is called by the consumer after the last token has been
        /// consumed, after <see cref="IncrementToken" /> returned <c>false</c>.
        /// <p/>
        /// This method can be used to perform any end-of-stream operations, such as
        /// setting the final offset of a stream. The final offset of a stream might
        /// differ from the offset of the last token, e.g. in case one or more whitespaces
        /// followed after the last token, but a <see cref="WhitespaceTokenizer" /> was used.
        /// </summary>
        /// <throws> IOException </throws>
        public virtual void End()
        {
            // do nothing by default
        }

        /// <summary> Resets this stream to the beginning. This is an optional operation, so
        /// subclasses may or may not implement this method. <see cref="Reset()" /> is not needed for
        /// the standard indexing process. However, if the tokens of a
        /// <c>TokenStream</c> are intended to be consumed more than once, it is
        /// necessary to implement <see cref="Reset()" />. Note that if your TokenStream
        /// caches tokens and feeds them back again after a reset, it is imperative
        /// that you clone the tokens when you store them away (on the first pass) as
        /// well as when you return them (on future passes after <see cref="Reset()" />).
        /// </summary>
        public virtual void Reset()
        {
        }

        /// <summary>Releases resources associated with this stream. </summary>
        [Obsolete("Use Dispose() instead")]
        public void Close()
        {
            Dispose();
        }

        public void Dispose()
        {
            Dispose(true);
            // Canonical dispose pattern (CA1816): tell the GC no finalization is needed
            // once the stream has been disposed deterministically.
            GC.SuppressFinalize(this);
        }

        protected abstract void Dispose(bool disposing);
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Attribute = Lucene.Net.Util.Attribute;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> This attribute can be used to pass different flags down the tokenizer chain,
    /// e.g. from one TokenFilter to another one.
    /// </summary>
    [Serializable]
    public class FlagsAttribute : Util.Attribute, IFlagsAttribute, System.ICloneable
    {
        private int flags = 0;

        /// <summary> EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
        /// <p/>
        ///
        /// Gets or sets the bitset for any bits that have been set. This is completely distinct from
        /// <see cref="ITypeAttribute.Type()" />, although they do share similar purposes.
        /// The flags can be used to encode information about the token for use by other
        /// <see cref="Lucene.Net.Analysis.TokenFilter" />s.
        /// </summary>
        /// <value> The bits </value>
        public virtual int Flags
        {
            get { return flags; }
            set { this.flags = value; }
        }

        /// <summary>Resets the flags to their default (no bits set).</summary>
        public override void Clear()
        {
            flags = 0;
        }

        public override bool Equals(System.Object other)
        {
            if (ReferenceEquals(this, other))
            {
                return true;
            }

            // Equal iff the other object is also a FlagsAttribute with the same bits.
            var that = other as FlagsAttribute;
            return that != null && that.flags == flags;
        }

        public override int GetHashCode()
        {
            return flags;
        }

        /// <summary>Copies the flag bits into the target attribute.</summary>
        public override void CopyTo(Attribute target)
        {
            var t = (IFlagsAttribute) target;
            t.Flags = flags;
        }

        public override System.Object Clone()
        {
            var copy = new FlagsAttribute();
            copy.flags = this.flags;
            return copy;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> This attribute can be used to pass different flags down the <see cref="Tokenizer" /> chain,
    /// e.g. from one TokenFilter to another one.
    /// </summary>
    public interface IFlagsAttribute:IAttribute
    {
        /// <summary> EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
        /// <p/>
        ///
        /// Gets or sets the bitset for any bits that have been set. This is completely distinct from
        /// <see cref="ITypeAttribute.Type()" />, although they do share similar purposes.
        /// The flags can be used to encode information about the token for use by other
        /// <see cref="Lucene.Net.Analysis.TokenFilter" />s.
        /// </summary>
        /// <value> The bits </value>
        int Flags { get; set; }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> The start and end character offset of a Token. </summary>
    public interface IOffsetAttribute : IAttribute
    {
        /// <summary>Returns this Token's starting offset, the position of the first character
        /// corresponding to this token in the source text.
        /// Note that the difference between <see cref="EndOffset" /> and <see cref="StartOffset" /> may not be
        /// equal to the term's length, as the term text may have been altered by a
        /// stemmer or some other filter.
        /// </summary>
        int StartOffset { get; }


        /// <summary>Sets the starting and ending offset in one call.
        /// See <see cref="StartOffset" /> and <see cref="EndOffset" />.
        /// </summary>
        /// <param name="startOffset">position of the token's first character in the source text</param>
        /// <param name="endOffset">one greater than the position of the token's last character</param>
        void SetOffset(int startOffset, int endOffset);


        /// <summary>Returns this Token's ending offset, one greater than the position of the
        /// last character corresponding to this token in the source text. The length
        /// of the token in the source text is (endOffset - startOffset).
        /// </summary>
        int EndOffset { get; }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;
using Payload = Lucene.Net.Index.Payload;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> The payload of a Token. See also <see cref="Payload" />.</summary>
    public interface IPayloadAttribute:IAttribute
    {
        /// <summary> Gets or sets this Token's payload. May be null when the token carries no payload.</summary>
        Payload Payload { get; set; }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary>The positionIncrement determines the position of this token
    /// relative to the previous Token in a TokenStream, used in phrase
    /// searching.
    ///
    /// <p/>The default value is one.
    ///
    /// <p/>Some common uses for this are:<list>
    ///
    /// <item>Set it to zero to put multiple terms in the same position. This is
    /// useful if, e.g., a word has multiple stems. Searches for phrases
    /// including either stem will match. In this case, all but the first stem's
    /// increment should be set to zero: the increment of the first instance
    /// should be one. Repeating a token with an increment of zero can also be
    /// used to boost the scores of matches on that token.</item>
    ///
    /// <item>Set it to values greater than one to inhibit exact phrase matches.
    /// If, for example, one does not want phrases to match across removed stop
    /// words, then one could build a stop word filter that removes stop words and
    /// also sets the increment to the number of stop words removed before each
    /// non-stop word. Then exact phrase queries will only match when the terms
    /// occur with no intervening stop words.</item>
    ///
    /// </list>
    ///
    /// </summary>
    /// <seealso cref="Lucene.Net.Index.TermPositions">
    /// </seealso>
    public interface IPositionIncrementAttribute:IAttribute
    {
        /// <summary>Gets or sets the position increment. The default value is one.
        ///
        /// </summary>
        /// <value> the distance from the prior term </value>
        int PositionIncrement { set; get; }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> The term text of a Token.</summary>
    public interface ITermAttribute:IAttribute
    {
        /// <summary>Returns the Token's term text.
        ///
        /// This method has a performance penalty
        /// because the text is stored internally in a char[]. If
        /// possible, use <see cref="TermBuffer()" /> and <see cref="TermLength()" />
        /// directly instead. If you really need a
        /// String, use this method, which is nothing more than
        /// a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
        /// </summary>
        string Term { get; }

        /// <summary>Copies the contents of buffer, starting at offset for
        /// length characters, into the termBuffer array.
        /// </summary>
        /// <param name="buffer">the buffer to copy
        /// </param>
        /// <param name="offset">the index in the buffer of the first character to copy
        /// </param>
        /// <param name="length">the number of characters to copy
        /// </param>
        void SetTermBuffer(char[] buffer, int offset, int length);

        /// <summary>Copies the contents of buffer into the termBuffer array.</summary>
        /// <param name="buffer">the buffer to copy
        /// </param>
        void SetTermBuffer(System.String buffer);

        /// <summary>Copies the contents of buffer, starting at offset and continuing
        /// for length characters, into the termBuffer array.
        /// </summary>
        /// <param name="buffer">the buffer to copy
        /// </param>
        /// <param name="offset">the index in the buffer of the first character to copy
        /// </param>
        /// <param name="length">the number of characters to copy
        /// </param>
        void SetTermBuffer(System.String buffer, int offset, int length);

        /// <summary>Returns the internal termBuffer character array which
        /// you can then directly alter. If the array is too
        /// small for your token, use <see cref="ResizeTermBuffer(int)" />
        /// to increase it. After
        /// altering the buffer be sure to call <see cref="SetTermLength" />
        /// to record the number of valid
        /// characters that were placed into the termBuffer.
        /// </summary>
        char[] TermBuffer();

        /// <summary>Grows the termBuffer to at least size newSize, preserving the
        /// existing content. Note: If the next operation is to change
        /// the contents of the term buffer use
        /// <see cref="SetTermBuffer(char[], int, int)" />,
        /// <see cref="SetTermBuffer(String)" />, or
        /// <see cref="SetTermBuffer(String, int, int)" />
        /// to optimally combine the resize with the setting of the termBuffer.
        /// </summary>
        /// <param name="newSize">minimum size of the new termBuffer
        /// </param>
        /// <returns> newly created termBuffer with length >= newSize
        /// </returns>
        char[] ResizeTermBuffer(int newSize);

        /// <summary>Return number of valid characters (length of the term)
        /// in the termBuffer array.
        /// </summary>
        int TermLength();

        /// <summary>Set number of valid characters (length of the term) in
        /// the termBuffer array. Use this to truncate the termBuffer
        /// or to synchronize with external manipulation of the termBuffer.
        /// Note: to grow the size of the array,
        /// use <see cref="ResizeTermBuffer(int)" /> first.
        /// </summary>
        /// <param name="length">the truncated length
        /// </param>
        void SetTermLength(int length);
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> A Token's lexical type. The Default value is "word". </summary>
    public interface ITypeAttribute:IAttribute
    {
        /// <summary>Gets or sets this Token's lexical type. Defaults to "word". </summary>
        string Type { get; set; }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Attribute = Lucene.Net.Util.Attribute;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> The start and end character offset of a Token. </summary>
    [Serializable]
    public class OffsetAttribute : Attribute, IOffsetAttribute, System.ICloneable
    {
        private int startOffset;
        private int endOffset;

        /// <summary>Returns this Token's starting offset, the position of the first character
        /// corresponding to this token in the source text.
        /// Note that the difference between EndOffset and StartOffset may not be
        /// equal to the term's length, as the term text may have been altered by a
        /// stemmer or some other filter.
        /// </summary>
        public virtual int StartOffset
        {
            get { return startOffset; }
        }


        /// <summary>Sets the starting and ending offset in one call.
        /// See <see cref="StartOffset" /> and <see cref="EndOffset" />.
        /// </summary>
        public virtual void SetOffset(int startOffset, int endOffset)
        {
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }


        /// <summary>Returns this Token's ending offset, one greater than the position of the
        /// last character corresponding to this token in the source text. The length
        /// of the token in the source text is (endOffset - startOffset).
        /// </summary>
        public virtual int EndOffset
        {
            get { return endOffset; }
        }


        /// <summary>Resets both offsets to zero.</summary>
        public override void Clear()
        {
            startOffset = 0;
            endOffset = 0;
        }

        public override bool Equals(System.Object other)
        {
            if (ReferenceEquals(other, this))
            {
                return true;
            }

            // Equal iff the other object is an OffsetAttribute with identical offsets.
            var that = other as OffsetAttribute;
            return that != null
                && that.startOffset == startOffset
                && that.endOffset == endOffset;
        }

        public override int GetHashCode()
        {
            // 31 * start + end, matching the usual two-field hash combination.
            return startOffset * 31 + endOffset;
        }

        /// <summary>Copies both offsets into the target attribute.</summary>
        public override void CopyTo(Attribute target)
        {
            var t = (IOffsetAttribute) target;
            t.SetOffset(startOffset, endOffset);
        }

        public override System.Object Clone()
        {
            var copy = new OffsetAttribute();
            copy.startOffset = startOffset;
            copy.endOffset = endOffset;
            return copy;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Attribute = Lucene.Net.Util.Attribute;
using Payload = Lucene.Net.Index.Payload;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary> The payload of a Token. See also <see cref="Payload" />.</summary>
    [Serializable]
    public class PayloadAttribute : Attribute, IPayloadAttribute, System.ICloneable
    {
        private Payload payload;

        /// <summary> Initialize this attribute with no payload.</summary>
        public PayloadAttribute()
        {
        }

        /// <summary> Initialize this attribute with the given payload. </summary>
        public PayloadAttribute(Payload payload)
        {
            this.payload = payload;
        }

        /// <summary> Gets or sets this Token's payload. May be null.</summary>
        public virtual Payload Payload
        {
            get { return this.payload; }
            set { this.payload = value; }
        }

        public override void Clear()
        {
            payload = null;
        }

        /// <summary>Deep clone: the payload itself is cloned too, so the copy is independent.</summary>
        public override System.Object Clone()
        {
            var clone = (PayloadAttribute) base.Clone();
            if (payload != null)
            {
                clone.payload = (Payload) payload.Clone();
            }
            return clone;
        }

        public override bool Equals(System.Object other)
        {
            if (other == this)
            {
                return true;
            }

            // FIX: test the concrete type before casting. The previous code tested
            // "other is IPayloadAttribute" but cast to PayloadAttribute, which threw
            // InvalidCastException for any other IPayloadAttribute implementation.
            if (other is PayloadAttribute)
            {
                PayloadAttribute o = (PayloadAttribute) other;
                if (o.payload == null || payload == null)
                {
                    // Equal only when both payloads are absent.
                    return o.payload == null && payload == null;
                }

                return o.payload.Equals(payload);
            }

            return false;
        }

        public override int GetHashCode()
        {
            return (payload == null) ? 0 : payload.GetHashCode();
        }

        /// <summary>Copies a clone of the payload (or null) into the target attribute.</summary>
        public override void CopyTo(Attribute target)
        {
            IPayloadAttribute t = (IPayloadAttribute) target;
            t.Payload = (payload == null) ? null : (Payload) payload.Clone();
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Attribute = Lucene.Net.Util.Attribute;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace Lucene.Net.Analysis.Tokenattributes
{

    /// <summary>The positionIncrement determines the position of this token
    /// relative to the previous Token in a <see cref="TokenStream" />, used in phrase
    /// searching.
    ///
    /// <p/>The default value is one.
    ///
    /// <p/>Some common uses for this are:<list>
    ///
    /// <item>Set it to zero to put multiple terms in the same position. This is
    /// useful if, e.g., a word has multiple stems. Searches for phrases
    /// including either stem will match. In this case, all but the first stem's
    /// increment should be set to zero: the increment of the first instance
    /// should be one. Repeating a token with an increment of zero can also be
    /// used to boost the scores of matches on that token.</item>
    ///
    /// <item>Set it to values greater than one to inhibit exact phrase matches.
    /// If, for example, one does not want phrases to match across removed stop
    /// words, then one could build a stop word filter that removes stop words and
    /// also sets the increment to the number of stop words removed before each
    /// non-stop word. Then exact phrase queries will only match when the terms
    /// occur with no intervening stop words.</item>
    ///
    /// </list>
    /// </summary>
    [Serializable]
    public class PositionIncrementAttribute : Attribute, IPositionIncrementAttribute, System.ICloneable
    {
        private int positionIncrement = 1;

        /// <summary>Gets or sets the position increment. The default value is one.
        ///
        /// </summary>
        /// <value> the distance from the prior term </value>
        /// <exception cref="System.ArgumentException">when a negative value is assigned</exception>
        public virtual int PositionIncrement
        {
            get { return positionIncrement; }
            set
            {
                // Negative increments are meaningless; reject them up front.
                if (value < 0)
                    throw new System.ArgumentException("Increment must be zero or greater: " + value);
                this.positionIncrement = value;
            }
        }

        /// <summary>Restores the default increment of one.</summary>
        public override void Clear()
        {
            this.positionIncrement = 1;
        }

        public override bool Equals(System.Object other)
        {
            if (ReferenceEquals(other, this))
            {
                return true;
            }

            var that = other as PositionIncrementAttribute;
            return that != null && that.positionIncrement == positionIncrement;
        }

        public override int GetHashCode()
        {
            return positionIncrement;
        }

        /// <summary>Copies the increment into the target attribute.</summary>
        public override void CopyTo(Attribute target)
        {
            var t = (IPositionIncrementAttribute) target;
            t.PositionIncrement = positionIncrement;
        }

        public override System.Object Clone()
        {
            var copy = new PositionIncrementAttribute();
            copy.positionIncrement = positionIncrement;
            return copy;
        }
    }
}
\ No newline at end of file diff --git a/src/core/Analysis/Tokenattributes/TermAttribute.cs b/src/core/Analysis/Tokenattributes/TermAttribute.cs new file mode 100644 index 0000000..f95402c --- /dev/null +++ b/src/core/Analysis/Tokenattributes/TermAttribute.cs @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using Lucene.Net.Support; +using ArrayUtil = Lucene.Net.Util.ArrayUtil; +using Attribute = Lucene.Net.Util.Attribute; + +namespace Lucene.Net.Analysis.Tokenattributes +{ + + /// <summary> The term text of a Token.</summary> + [Serializable] + public class TermAttribute:Attribute, ITermAttribute, System.ICloneable + { + private static int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer; + private int termLength; + + /// <summary>Returns the Token's term text. + /// + /// This method has a performance penalty + /// because the text is stored internally in a char[]. If + /// possible, use <see cref="TermBuffer()" /> and + /// <see cref="TermLength()" /> directly instead. 
If you + /// really need a String, use this method, which is nothing more than + /// a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b> + /// </summary> + public virtual string Term + { + get + { + InitTermBuffer(); + return new System.String(termBuffer, 0, termLength); + } + } + + /// <summary>Copies the contents of buffer, starting at offset for + /// length characters, into the termBuffer array. + /// </summary> + /// <param name="buffer">the buffer to copy + /// </param> + /// <param name="offset">the index in the buffer of the first character to copy + /// </param> + /// <param name="length">the number of characters to copy + /// </param> + public virtual void SetTermBuffer(char[] buffer, int offset, int length) + { + GrowTermBuffer(length); + Array.Copy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + /// <summary>Copies the contents of buffer into the termBuffer array.</summary> + /// <param name="buffer">the buffer to copy + /// </param> + public virtual void SetTermBuffer(System.String buffer) + { + int length = buffer.Length; + GrowTermBuffer(length); + TextSupport.GetCharsFromString(buffer, 0, length, termBuffer, 0); + termLength = length; + } + + /// <summary>Copies the contents of buffer, starting at offset and continuing + /// for length characters, into the termBuffer array. 
+ /// </summary> + /// <param name="buffer">the buffer to copy + /// </param> + /// <param name="offset">the index in the buffer of the first character to copy + /// </param> + /// <param name="length">the number of characters to copy + /// </param> + public virtual void SetTermBuffer(System.String buffer, int offset, int length) + { + System.Diagnostics.Debug.Assert(offset <= buffer.Length); + System.Diagnostics.Debug.Assert(offset + length <= buffer.Length); + GrowTermBuffer(length); + TextSupport.GetCharsFromString(buffer, offset, offset + length, termBuffer, 0); + termLength = length; + } + + /// <summary>Returns the internal termBuffer character array which + /// you can then directly alter. If the array is too + /// small for your token, use <see cref="ResizeTermBuffer(int)" /> + /// to increase it. After + /// altering the buffer be sure to call <see cref="SetTermLength" /> + /// to record the number of valid + /// characters that were placed into the termBuffer. + /// </summary> + public virtual char[] TermBuffer() + { + InitTermBuffer(); + return termBuffer; + } + + /// <summary>Grows the termBuffer to at least size newSize, preserving the + /// existing content. Note: If the next operation is to change + /// the contents of the term buffer use + /// <see cref="SetTermBuffer(char[], int, int)" />, + /// <see cref="SetTermBuffer(String)" />, or + /// <see cref="SetTermBuffer(String, int, int)" /> + /// to optimally combine the resize with the setting of the termBuffer. 
+ /// </summary> + /// <param name="newSize">minimum size of the new termBuffer + /// </param> + /// <returns> newly created termBuffer with length >= newSize + /// </returns> + public virtual char[] ResizeTermBuffer(int newSize) + { + if (termBuffer == null) + { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE?MIN_BUFFER_SIZE:newSize)]; + } + else + { + if (termBuffer.Length < newSize) + { + // Not big enough; create a new array with slight + // over allocation and preserve content + char[] newCharBuffer = new char[ArrayUtil.GetNextSize(newSize)]; + Array.Copy(termBuffer, 0, newCharBuffer, 0, termBuffer.Length); + termBuffer = newCharBuffer; + } + } + return termBuffer; + } + + + /// <summary>Allocates a buffer char[] of at least newSize, without preserving the existing content. + /// its always used in places that set the content + /// </summary> + /// <param name="newSize">minimum size of the buffer + /// </param> + private void GrowTermBuffer(int newSize) + { + if (termBuffer == null) + { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE?MIN_BUFFER_SIZE:newSize)]; + } + else + { + if (termBuffer.Length < newSize) + { + // Not big enough; create a new array with slight + // over allocation: + termBuffer = new char[ArrayUtil.GetNextSize(newSize)]; + } + } + } + + private void InitTermBuffer() + { + if (termBuffer == null) + { + termBuffer = new char[ArrayUtil.GetNextSize(MIN_BUFFER_SIZE)]; + termLength = 0; + } + } + + /// <summary>Return number of valid characters (length of the term) + /// in the termBuffer array. + /// </summary> + public virtual int TermLength() + { + return termLength; + } + + /// <summary>Set number of valid characters (length of the term) in + /// the termBuffer array. Use this to truncate the termBuffer + /// or to synchronize with external manipulation of the termBuffer. 
+ /// Note: to grow the size of the array, + /// use <see cref="ResizeTermBuffer(int)" /> first. + /// </summary> + /// <param name="length">the truncated length + /// </param> + public virtual void SetTermLength(int length) + { + InitTermBuffer(); + if (length > termBuffer.Length) + throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")"); + termLength = length; + } + + public override int GetHashCode() + { + InitTermBuffer(); + int code = termLength; + code = code * 31 + ArrayUtil.HashCode(termBuffer, 0, termLength); + return code; + } + + public override void Clear() + { + termLength = 0; + } + + public override System.Object Clone() + { + TermAttribute t = (TermAttribute) base.Clone(); + // Do a deep clone + if (termBuffer != null) + { + t.termBuffer = new char[termBuffer.Length]; + termBuffer.CopyTo(t.termBuffer, 0); + } + return t; + } + + public override bool Equals(System.Object other) + { + if (other == this) + { + return true; + } + + if (other is ITermAttribute) + { + InitTermBuffer(); + TermAttribute o = ((TermAttribute) other); + o.InitTermBuffer(); + + if (termLength != o.termLength) + return false; + for (int i = 0; i < termLength; i++) + { + if (termBuffer[i] != o.termBuffer[i]) + { + return false; + } + } + return true; + } + + return false; + } + + public override System.String ToString() + { + InitTermBuffer(); + return "term=" + new System.String(termBuffer, 0, termLength); + } + + public override void CopyTo(Attribute target) + { + InitTermBuffer(); + ITermAttribute t = (ITermAttribute) target; + t.SetTermBuffer(termBuffer, 0, termLength); + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/Tokenattributes/TypeAttribute.cs b/src/core/Analysis/Tokenattributes/TypeAttribute.cs new file mode 100644 index 0000000..1da1c50 --- /dev/null +++ b/src/core/Analysis/Tokenattributes/TypeAttribute.cs @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using Attribute = Lucene.Net.Util.Attribute; + +namespace Lucene.Net.Analysis.Tokenattributes +{ + + /// <summary> A Token's lexical type. The Default value is "word". </summary> + [Serializable] + public class TypeAttribute:Attribute, ITypeAttribute, System.ICloneable + { + private System.String type; + public const System.String DEFAULT_TYPE = "word"; + + public TypeAttribute():this(DEFAULT_TYPE) + { + } + + public TypeAttribute(System.String type) + { + this.type = type; + } + + /// <summary>Returns this Token's lexical type. Defaults to "word". 
</summary> + public virtual string Type + { + get { return type; } + set { this.type = value; } + } + + public override void Clear() + { + type = DEFAULT_TYPE; + } + + public override bool Equals(System.Object other) + { + if (other == this) + { + return true; + } + + if (other is TypeAttribute) + { + return type.Equals(((TypeAttribute) other).type); + } + + return false; + } + + public override int GetHashCode() + { + return type.GetHashCode(); + } + + public override void CopyTo(Attribute target) + { + ITypeAttribute t = (ITypeAttribute) target; + t.Type = type; + } + + override public System.Object Clone() + { + TypeAttribute impl = new TypeAttribute(); + impl.type = type; + return impl; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/Tokenizer.cs b/src/core/Analysis/Tokenizer.cs new file mode 100644 index 0000000..5ab741e --- /dev/null +++ b/src/core/Analysis/Tokenizer.cs @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using AttributeSource = Lucene.Net.Util.AttributeSource; + +namespace Lucene.Net.Analysis +{ + + /// <summary> A Tokenizer is a TokenStream whose input is a Reader. + /// <p/> + /// This is an abstract class; subclasses must override <see cref="TokenStream.IncrementToken()" /> + /// <p/> + /// NOTE: Subclasses overriding <see cref="TokenStream.IncrementToken()" /> must call + /// <see cref="AttributeSource.ClearAttributes()" /> before setting attributes. + /// </summary> + + public abstract class Tokenizer:TokenStream + { + /// <summary>The text source for this Tokenizer. </summary> + protected internal System.IO.TextReader input; + + private bool isDisposed; + + /// <summary>Construct a tokenizer with null input. </summary> + protected internal Tokenizer() + { + } + + /// <summary>Construct a token stream processing the given input. 
</summary> + protected internal Tokenizer(System.IO.TextReader input) + { + this.input = CharReader.Get(input); + } + + /// <summary>Construct a tokenizer with null input using the given AttributeFactory. </summary> + protected internal Tokenizer(AttributeFactory factory):base(factory) + { + } + + /// <summary>Construct a token stream processing the given input using the given AttributeFactory. </summary> + protected internal Tokenizer(AttributeFactory factory, System.IO.TextReader input):base(factory) + { + this.input = CharReader.Get(input); + } + + /// <summary>Construct a token stream processing the given input using the given AttributeSource. </summary> + protected internal Tokenizer(AttributeSource source):base(source) + { + } + + /// <summary>Construct a token stream processing the given input using the given AttributeSource. </summary> + protected internal Tokenizer(AttributeSource source, System.IO.TextReader input):base(source) + { + this.input = CharReader.Get(input); + } + + protected override void Dispose(bool disposing) + { + if (isDisposed) return; + + if (disposing) + { + if (input != null) + { + input.Close(); + } + } + + // LUCENE-2387: don't hold onto Reader after close, so + // GC can reclaim + input = null; + isDisposed = true; + } + + /// <summary>Return the corrected offset. If <see cref="input" /> is a <see cref="CharStream" /> subclass + /// this method calls <see cref="CharStream.CorrectOffset" />, else returns <c>currentOff</c>. + /// </summary> + /// <param name="currentOff">offset as seen in the output + /// </param> + /// <returns> corrected offset based on the input + /// </returns> + /// <seealso cref="CharStream.CorrectOffset"> + /// </seealso> + protected internal int CorrectOffset(int currentOff) + { + return (input is CharStream)?((CharStream) input).CorrectOffset(currentOff):currentOff; + } + + /// <summary>Expert: Reset the tokenizer to a new reader. 
Typically, an + /// analyzer (in its reusableTokenStream method) will use + /// this to re-use a previously created tokenizer. + /// </summary> + public virtual void Reset(System.IO.TextReader input) + { + this.input = input; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/WhitespaceAnalyzer.cs b/src/core/Analysis/WhitespaceAnalyzer.cs new file mode 100644 index 0000000..77dbaa3 --- /dev/null +++ b/src/core/Analysis/WhitespaceAnalyzer.cs @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace Lucene.Net.Analysis +{ + + /// <summary>An Analyzer that uses <see cref="WhitespaceTokenizer" />. </summary> + + public sealed class WhitespaceAnalyzer:Analyzer + { + public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) + { + return new WhitespaceTokenizer(reader); + } + + public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader) + { + var tokenizer = (Tokenizer) PreviousTokenStream; + if (tokenizer == null) + { + tokenizer = new WhitespaceTokenizer(reader); + PreviousTokenStream = tokenizer; + } + else + tokenizer.Reset(reader); + return tokenizer; + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/WhitespaceTokenizer.cs b/src/core/Analysis/WhitespaceTokenizer.cs new file mode 100644 index 0000000..c96ad50 --- /dev/null +++ b/src/core/Analysis/WhitespaceTokenizer.cs @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using AttributeSource = Lucene.Net.Util.AttributeSource; + +namespace Lucene.Net.Analysis +{ + + /// <summary>A WhitespaceTokenizer is a tokenizer that divides text at whitespace. + /// Adjacent sequences of non-Whitespace characters form tokens. + /// </summary> + + public class WhitespaceTokenizer:CharTokenizer + { + /// <summary>Construct a new WhitespaceTokenizer. </summary> + public WhitespaceTokenizer(System.IO.TextReader @in) + : base(@in) + { + } + + /// <summary>Construct a new WhitespaceTokenizer using a given <see cref="AttributeSource" />. </summary> + public WhitespaceTokenizer(AttributeSource source, System.IO.TextReader @in) + : base(source, @in) + { + } + + /// <summary>Construct a new WhitespaceTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. 
</summary> + public WhitespaceTokenizer(AttributeFactory factory, System.IO.TextReader @in) + : base(factory, @in) + { + } + + /// <summary>Collects only characters which do not satisfy + /// <see cref="char.IsWhiteSpace(char)" />. + /// </summary> + protected internal override bool IsTokenChar(char c) + { + return !System.Char.IsWhiteSpace(c); + } + } +}
\ No newline at end of file diff --git a/src/core/Analysis/WordlistLoader.cs b/src/core/Analysis/WordlistLoader.cs new file mode 100644 index 0000000..bfd1b07 --- /dev/null +++ b/src/core/Analysis/WordlistLoader.cs @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System.Collections.Generic; + +namespace Lucene.Net.Analysis +{ + + /// <summary> Loader for text files that represent a list of stopwords.</summary> + public class WordlistLoader + { + + /// <summary> Loads a text file and adds every line as an entry to a HashSet (omitting + /// leading and trailing whitespace). Every line of the file should contain only + /// one word. The words need to be in lowercase if you make use of an + /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer). 
+ /// </summary> + /// <param name="wordfile">File containing the wordlist</param> + /// <returns> A HashSet with the file's words</returns> + public static ISet<string> GetWordSet(System.IO.FileInfo wordfile) + { + using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default)) + { + return GetWordSet(reader); + } + } + + /// <summary> Loads a text file and adds every non-comment line as an entry to a HashSet (omitting + /// leading and trailing whitespace). Every line of the file should contain only + /// one word. The words need to be in lowercase if you make use of an + /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + /// </summary> + /// <param name="wordfile">File containing the wordlist</param> + /// <param name="comment">The comment string to ignore</param> + /// <returns> A HashSet with the file's words</returns> + public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, System.String comment) + { + using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default)) + { + return GetWordSet(reader, comment); + } + } + + + /// <summary> Reads lines from a Reader and adds every line as an entry to a HashSet (omitting + /// leading and trailing whitespace). Every line of the Reader should contain only + /// one word. The words need to be in lowercase if you make use of an + /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer). 
+ /// </summary> + /// <param name="reader">Reader containing the wordlist</param> + /// <returns>A HashSet with the reader's words</returns> + public static ISet<string> GetWordSet(System.IO.TextReader reader) + { + var result = Support.Compatibility.SetFactory.CreateHashSet<string>(); + + System.String word; + while ((word = reader.ReadLine()) != null) + { + result.Add(word.Trim()); + } + + return result; + } + + /// <summary> Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting + /// leading and trailing whitespace). Every line of the Reader should contain only + /// one word. The words need to be in lowercase if you make use of an + /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + /// + /// </summary> + /// <param name="reader">Reader containing the wordlist + /// </param> + /// <param name="comment">The string representing a comment. + /// </param> + /// <returns> A HashSet with the reader's words + /// </returns> + public static ISet<string> GetWordSet(System.IO.TextReader reader, System.String comment) + { + var result = Support.Compatibility.SetFactory.CreateHashSet<string>(); + + System.String word = null; + while ((word = reader.ReadLine()) != null) + { + if (word.StartsWith(comment) == false) + { + result.Add(word.Trim()); + } + } + + return result; + } + + + + /// <summary> Reads a stem dictionary. Each line contains: + /// <c>word<b>\t</b>stem</c> + /// (i.e. 
two tab seperated words) + /// + /// </summary> + /// <returns> stem dictionary that overrules the stemming algorithm + /// </returns> + /// <throws> IOException </throws> + public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile) + { + if (wordstemfile == null) + throw new System.NullReferenceException("wordstemfile may not be null"); + var result = new Dictionary<string, string>(); + System.IO.StreamReader br = null; + System.IO.StreamReader fr = null; + try + { + fr = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default); + br = new System.IO.StreamReader(fr.BaseStream, fr.CurrentEncoding); + System.String line; + char[] tab = {'\t'}; + while ((line = br.ReadLine()) != null) + { + System.String[] wordstem = line.Split(tab, 2); + result[wordstem[0]] = wordstem[1]; + } + } + finally + { + if (fr != null) + fr.Close(); + if (br != null) + br.Close(); + } + return result; + } + } +}
\ No newline at end of file |