Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/Lucene.Net.Light.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/Analysis')
-rw-r--r--src/core/Analysis/ASCIIFoldingFilter.cs3285
-rw-r--r--src/core/Analysis/Analyzer.cs171
-rw-r--r--src/core/Analysis/BaseCharFilter.cs105
-rw-r--r--src/core/Analysis/CachingTokenFilter.cs86
-rw-r--r--src/core/Analysis/CharArraySet.cs517
-rw-r--r--src/core/Analysis/CharFilter.cs95
-rw-r--r--src/core/Analysis/CharReader.cs94
-rw-r--r--src/core/Analysis/CharStream.cs45
-rw-r--r--src/core/Analysis/CharTokenizer.cs135
-rw-r--r--src/core/Analysis/ISOLatin1AccentFilter.cs344
-rw-r--r--src/core/Analysis/KeywordAnalyzer.cs54
-rw-r--r--src/core/Analysis/KeywordTokenizer.cs99
-rw-r--r--src/core/Analysis/LengthFilter.cs60
-rw-r--r--src/core/Analysis/LetterTokenizer.cs57
-rw-r--r--src/core/Analysis/LowerCaseFilter.cs49
-rw-r--r--src/core/Analysis/LowerCaseTokenizer.cs60
-rw-r--r--src/core/Analysis/MappingCharFilter.cs166
-rw-r--r--src/core/Analysis/NormalizeCharMap.cs68
-rw-r--r--src/core/Analysis/NumericTokenStream.cs270
-rw-r--r--src/core/Analysis/PerFieldAnalyzerWrapper.cs135
-rw-r--r--src/core/Analysis/PorterStemFilter.cs62
-rw-r--r--src/core/Analysis/PorterStemmer.cs746
-rw-r--r--src/core/Analysis/SimpleAnalyzer.cs45
-rw-r--r--src/core/Analysis/Standard/StandardAnalyzer.cs174
-rw-r--r--src/core/Analysis/Standard/StandardFilter.cs88
-rw-r--r--src/core/Analysis/Standard/StandardTokenizer.cs232
-rw-r--r--src/core/Analysis/Standard/StandardTokenizerImpl.cs707
-rw-r--r--src/core/Analysis/StopAnalyzer.cs141
-rw-r--r--src/core/Analysis/StopFilter.cs178
-rw-r--r--src/core/Analysis/TeeSinkTokenFilter.cs266
-rw-r--r--src/core/Analysis/Token.cs852
-rw-r--r--src/core/Analysis/TokenFilter.cs72
-rw-r--r--src/core/Analysis/TokenStream.cs162
-rw-r--r--src/core/Analysis/Tokenattributes/FlagsAttribute.cs85
-rw-r--r--src/core/Analysis/Tokenattributes/IFlagsAttribute.cs41
-rw-r--r--src/core/Analysis/Tokenattributes/IOffsetAttribute.cs48
-rw-r--r--src/core/Analysis/Tokenattributes/IPayloadAttribute.cs31
-rw-r--r--src/core/Analysis/Tokenattributes/IPositionIncrementAttribute.cs59
-rw-r--r--src/core/Analysis/Tokenattributes/ITermAttribute.cs104
-rw-r--r--src/core/Analysis/Tokenattributes/ITypeAttribute.cs30
-rw-r--r--src/core/Analysis/Tokenattributes/OffsetAttribute.cs106
-rw-r--r--src/core/Analysis/Tokenattributes/PayloadAttribute.cs100
-rw-r--r--src/core/Analysis/Tokenattributes/PositionIncrementAttribute.cs107
-rw-r--r--src/core/Analysis/Tokenattributes/TermAttribute.cs268
-rw-r--r--src/core/Analysis/Tokenattributes/TypeAttribute.cs85
-rw-r--r--src/core/Analysis/Tokenizer.cs112
-rw-r--r--src/core/Analysis/WhitespaceAnalyzer.cs43
-rw-r--r--src/core/Analysis/WhitespaceTokenizer.cs55
-rw-r--r--src/core/Analysis/WordlistLoader.cs146
49 files changed, 11040 insertions, 0 deletions
diff --git a/src/core/Analysis/ASCIIFoldingFilter.cs b/src/core/Analysis/ASCIIFoldingFilter.cs
new file mode 100644
index 0000000..6133870
--- /dev/null
+++ b/src/core/Analysis/ASCIIFoldingFilter.cs
@@ -0,0 +1,3285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+using ArrayUtil = Lucene.Net.Util.ArrayUtil;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> This class converts alphabetic, numeric, and symbolic Unicode characters
+ /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
+ /// block) into their ASCII equivalents, if one exists.
+ ///
+ /// Characters from the following Unicode blocks are converted; however, only
+ /// those characters with reasonable ASCII alternatives are converted:
+ ///
+ /// <list type="bullet">
+ /// <item>C1 Controls and Latin-1 Supplement: <a href="http://www.unicode.org/charts/PDF/U0080.pdf">http://www.unicode.org/charts/PDF/U0080.pdf</a></item>
+ /// <item>Latin Extended-A: <a href="http://www.unicode.org/charts/PDF/U0100.pdf">http://www.unicode.org/charts/PDF/U0100.pdf</a></item>
+ /// <item>Latin Extended-B: <a href="http://www.unicode.org/charts/PDF/U0180.pdf">http://www.unicode.org/charts/PDF/U0180.pdf</a></item>
+ /// <item>Latin Extended Additional: <a href="http://www.unicode.org/charts/PDF/U1E00.pdf">http://www.unicode.org/charts/PDF/U1E00.pdf</a></item>
+ /// <item>Latin Extended-C: <a href="http://www.unicode.org/charts/PDF/U2C60.pdf">http://www.unicode.org/charts/PDF/U2C60.pdf</a></item>
+ /// <item>Latin Extended-D: <a href="http://www.unicode.org/charts/PDF/UA720.pdf">http://www.unicode.org/charts/PDF/UA720.pdf</a></item>
+ /// <item>IPA Extensions: <a href="http://www.unicode.org/charts/PDF/U0250.pdf">http://www.unicode.org/charts/PDF/U0250.pdf</a></item>
+ /// <item>Phonetic Extensions: <a href="http://www.unicode.org/charts/PDF/U1D00.pdf">http://www.unicode.org/charts/PDF/U1D00.pdf</a></item>
+ /// <item>Phonetic Extensions Supplement: <a href="http://www.unicode.org/charts/PDF/U1D80.pdf">http://www.unicode.org/charts/PDF/U1D80.pdf</a></item>
+ /// <item>General Punctuation: <a href="http://www.unicode.org/charts/PDF/U2000.pdf">http://www.unicode.org/charts/PDF/U2000.pdf</a></item>
+ /// <item>Superscripts and Subscripts: <a href="http://www.unicode.org/charts/PDF/U2070.pdf">http://www.unicode.org/charts/PDF/U2070.pdf</a></item>
+ /// <item>Enclosed Alphanumerics: <a href="http://www.unicode.org/charts/PDF/U2460.pdf">http://www.unicode.org/charts/PDF/U2460.pdf</a></item>
+ /// <item>Dingbats: <a href="http://www.unicode.org/charts/PDF/U2700.pdf">http://www.unicode.org/charts/PDF/U2700.pdf</a></item>
+ /// <item>Supplemental Punctuation: <a href="http://www.unicode.org/charts/PDF/U2E00.pdf">http://www.unicode.org/charts/PDF/U2E00.pdf</a></item>
+ /// <item>Alphabetic Presentation Forms: <a href="http://www.unicode.org/charts/PDF/UFB00.pdf">http://www.unicode.org/charts/PDF/UFB00.pdf</a></item>
+ /// <item>Halfwidth and Fullwidth Forms: <a href="http://www.unicode.org/charts/PDF/UFF00.pdf">http://www.unicode.org/charts/PDF/UFF00.pdf</a></item>
+ /// </list>
+ ///
+ /// See: <a href="http://en.wikipedia.org/wiki/Latin_characters_in_Unicode">http://en.wikipedia.org/wiki/Latin_characters_in_Unicode</a>
+ ///
+ /// The set of character conversions supported by this class is a superset of
+ /// those supported by Lucene's <see cref="ISOLatin1AccentFilter" /> which strips
+ /// accents from Latin1 characters. For example, '&#192;' will be replaced by
+ /// 'a'.
+ /// </summary>
+ public sealed class ASCIIFoldingFilter : TokenFilter
+ {
+ /// <summary> Creates a filter that folds non-ASCII characters in tokens read
+ /// from <paramref name="input"/> into their ASCII equivalents. </summary>
+ /// <param name="input">The upstream <see cref="TokenStream"/> to read tokens from.</param>
+ public ASCIIFoldingFilter(TokenStream input):base(input)
+ {
+     termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ // Scratch buffer holding the folded output; grown on demand in FoldToASCII.
+ private char[] output = new char[512];
+ // Number of valid characters currently written into 'output'.
+ private int outputPos;
+ // Term attribute of the current token, shared with the attribute source.
+ private ITermAttribute termAtt;
+
+ /// <summary> Advances to the next token from the input stream, folding any
+ /// characters at or above U+0080 in its term text to their ASCII equivalents
+ /// via <see cref="FoldToASCII"/>. </summary>
+ /// <returns>true if a token was produced; false when the input is exhausted.</returns>
+ public override bool IncrementToken()
+ {
+     if (input.IncrementToken())
+     {
+         char[] buffer = termAtt.TermBuffer();
+         int length = termAtt.TermLength();
+
+         // If no characters actually require rewriting then we
+         // just return token as-is: scan for the first character
+         // outside Basic Latin and only then pay for the fold.
+         for (int i = 0; i < length; ++i)
+         {
+             char c = buffer[i];
+             if (c >= '\u0080')
+             {
+                 FoldToASCII(buffer, length);
+                 // Replace the term text with the folded copy accumulated
+                 // in 'output' (outputPos chars were written).
+                 termAtt.SetTermBuffer(output, 0, outputPos);
+                 break;
+             }
+         }
+         return true;
+     }
+     else
+     {
+         return false;
+     }
+ }
+
+ /// <summary> Converts characters above ASCII to their ASCII equivalents. For example,
+ /// accents are removed from accented characters.
+ /// </summary>
+ /// <param name="input">The string to fold
+ /// </param>
+ /// <param name="length">The number of characters in the input string
+ /// </param>
+ public void FoldToASCII(char[] input, int length)
+ {
+ // Worst-case length required:
+ int maxSizeNeeded = 4 * length;
+ if (output.Length < maxSizeNeeded)
+ {
+ output = new char[ArrayUtil.GetNextSize(maxSizeNeeded)];
+ }
+
+ outputPos = 0;
+
+ for (int pos = 0; pos < length; ++pos)
+ {
+ char c = input[pos];
+
+ // Quick test: if it's not in range then just keep current character
+ if (c < '\u0080')
+ {
+ output[outputPos++] = c;
+ }
+ else
+ {
+ switch (c)
+ {
+
+ case '\u00C0':
+ // À [LATIN CAPITAL LETTER A WITH GRAVE]
+ case '\u00C1':
+ // Á [LATIN CAPITAL LETTER A WITH ACUTE]
+ case '\u00C2':
+ // Â [LATIN CAPITAL LETTER A WITH CIRCUMFLEX]
+ case '\u00C3':
+ // Ã [LATIN CAPITAL LETTER A WITH TILDE]
+ case '\u00C4':
+ // Ä [LATIN CAPITAL LETTER A WITH DIAERESIS]
+ case '\u00C5':
+ // Å [LATIN CAPITAL LETTER A WITH RING ABOVE]
+ case '\u0100':
+ // Ā [LATIN CAPITAL LETTER A WITH MACRON]
+ case '\u0102':
+ // Ă [LATIN CAPITAL LETTER A WITH BREVE]
+ case '\u0104':
+ // Ą [LATIN CAPITAL LETTER A WITH OGONEK]
+ case '\u018F':
+ // Ə http://en.wikipedia.org/wiki/Schwa [LATIN CAPITAL LETTER SCHWA]
+ case '\u01CD':
+ // Ǎ [LATIN CAPITAL LETTER A WITH CARON]
+ case '\u01DE':
+ // Çž [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON]
+ case '\u01E0':
+ // Ç  [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON]
+ case '\u01FA':
+ // Ǻ [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE]
+ case '\u0200':
+ // Ȁ [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE]
+ case '\u0202':
+ // È‚ [LATIN CAPITAL LETTER A WITH INVERTED BREVE]
+ case '\u0226':
+ // Ȧ [LATIN CAPITAL LETTER A WITH DOT ABOVE]
+ case '\u023A':
+ // Ⱥ [LATIN CAPITAL LETTER A WITH STROKE]
+ case '\u1D00':
+ // ᴀ [LATIN LETTER SMALL CAPITAL A]
+ case '\u1E00':
+ // Ḁ [LATIN CAPITAL LETTER A WITH RING BELOW]
+ case '\u1EA0':
+ // Ạ [LATIN CAPITAL LETTER A WITH DOT BELOW]
+ case '\u1EA2':
+ // Ả [LATIN CAPITAL LETTER A WITH HOOK ABOVE]
+ case '\u1EA4':
+ // Ấ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE]
+ case '\u1EA6':
+ // Ầ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE]
+ case '\u1EA8':
+ // Ẩ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
+ case '\u1EAA':
+ // Ẫ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE]
+ case '\u1EAC':
+ // Ậ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
+ case '\u1EAE':
+ // Ắ [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE]
+ case '\u1EB0':
+ // Ằ [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE]
+ case '\u1EB2':
+ // Ẳ [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE]
+ case '\u1EB4':
+ // Ẵ [LATIN CAPITAL LETTER A WITH BREVE AND TILDE]
+ case '\u1EB6':
+ // Ặ [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW]
+ case '\u24B6':
+ // â’¶ [CIRCLED LATIN CAPITAL LETTER A]
+ case '\uFF21': // Ａ [FULLWIDTH LATIN CAPITAL LETTER A]
+ output[outputPos++] = 'A';
+ break;
+
+ case '\u00E0':
+ // à [LATIN SMALL LETTER A WITH GRAVE]
+ case '\u00E1':
+ // á [LATIN SMALL LETTER A WITH ACUTE]
+ case '\u00E2':
+ // â [LATIN SMALL LETTER A WITH CIRCUMFLEX]
+ case '\u00E3':
+ // ã [LATIN SMALL LETTER A WITH TILDE]
+ case '\u00E4':
+ // ä [LATIN SMALL LETTER A WITH DIAERESIS]
+ case '\u00E5':
+ // å [LATIN SMALL LETTER A WITH RING ABOVE]
+ case '\u0101':
+ // ā [LATIN SMALL LETTER A WITH MACRON]
+ case '\u0103':
+ // ă [LATIN SMALL LETTER A WITH BREVE]
+ case '\u0105':
+ // Ä… [LATIN SMALL LETTER A WITH OGONEK]
+ case '\u01CE':
+ // ÇŽ [LATIN SMALL LETTER A WITH CARON]
+ case '\u01DF':
+ // ÇŸ [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON]
+ case '\u01E1':
+ // Ç¡ [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON]
+ case '\u01FB':
+ // Ç» [LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE]
+ case '\u0201':
+ // � [LATIN SMALL LETTER A WITH DOUBLE GRAVE]
+ case '\u0203':
+ // ȃ [LATIN SMALL LETTER A WITH INVERTED BREVE]
+ case '\u0227':
+ // ȧ [LATIN SMALL LETTER A WITH DOT ABOVE]
+ case '\u0250':
+ // � [LATIN SMALL LETTER TURNED A]
+ case '\u0259':
+ // É™ [LATIN SMALL LETTER SCHWA]
+ case '\u025A':
+ // Éš [LATIN SMALL LETTER SCHWA WITH HOOK]
+ case '\u1D8F':
+ // � [LATIN SMALL LETTER A WITH RETROFLEX HOOK]
+ case '\u1D95':
+ // ᶕ [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK]
+ case '\u1E01':
+ // ạ [LATIN SMALL LETTER A WITH RING BELOW]
+ case '\u1E9A':
+ // ả [LATIN SMALL LETTER A WITH RIGHT HALF RING]
+ case '\u1EA1':
+ // ạ [LATIN SMALL LETTER A WITH DOT BELOW]
+ case '\u1EA3':
+ // ả [LATIN SMALL LETTER A WITH HOOK ABOVE]
+ case '\u1EA5':
+ // ấ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE]
+ case '\u1EA7':
+ // ầ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE]
+ case '\u1EA9':
+ // ẩ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
+ case '\u1EAB':
+ // ẫ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE]
+ case '\u1EAD':
+ // ậ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
+ case '\u1EAF':
+ // ắ [LATIN SMALL LETTER A WITH BREVE AND ACUTE]
+ case '\u1EB1':
+ // ằ [LATIN SMALL LETTER A WITH BREVE AND GRAVE]
+ case '\u1EB3':
+ // ẳ [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE]
+ case '\u1EB5':
+ // ẵ [LATIN SMALL LETTER A WITH BREVE AND TILDE]
+ case '\u1EB7':
+ // ặ [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW]
+ case '\u2090':
+ // � [LATIN SUBSCRIPT SMALL LETTER A]
+ case '\u2094':
+ // �? [LATIN SUBSCRIPT SMALL LETTER SCHWA]
+ case '\u24D0':
+ // � [CIRCLED LATIN SMALL LETTER A]
+ case '\u2C65':
+ // â±¥ [LATIN SMALL LETTER A WITH STROKE]
+ case '\u2C6F':
+ // Ɐ [LATIN CAPITAL LETTER TURNED A]
+ case '\uFF41': // ａ [FULLWIDTH LATIN SMALL LETTER A]
+ output[outputPos++] = 'a';
+ break;
+
+ case '\uA732': // Ꜳ [LATIN CAPITAL LETTER AA]
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'A';
+ break;
+
+ case '\u00C6':
+ // Æ [LATIN CAPITAL LETTER AE]
+ case '\u01E2':
+ // Ǣ [LATIN CAPITAL LETTER AE WITH MACRON]
+ case '\u01FC':
+ // Ǽ [LATIN CAPITAL LETTER AE WITH ACUTE]
+ case '\u1D01': // á´� [LATIN LETTER SMALL CAPITAL AE]
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'E';
+ break;
+
+ case '\uA734': // Ꜵ [LATIN CAPITAL LETTER AO]
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'O';
+ break;
+
+ case '\uA736': // Ꜷ [LATIN CAPITAL LETTER AU]
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'U';
+ break;
+
+ case '\uA738':
+ // Ꜹ [LATIN CAPITAL LETTER AV]
+ case '\uA73A': // Ꜻ [LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR]
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'V';
+ break;
+
+ case '\uA73C': // Ꜽ [LATIN CAPITAL LETTER AY]
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'Y';
+ break;
+
+ case '\u249C': // ⒜ [PARENTHESIZED LATIN SMALL LETTER A]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'a';
+ output[outputPos++] = ')';
+ break;
+
+ case '\uA733': // ꜳ [LATIN SMALL LETTER AA]
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'a';
+ break;
+
+ case '\u00E6':
+ // æ [LATIN SMALL LETTER AE]
+ case '\u01E3':
+ // ǣ [LATIN SMALL LETTER AE WITH MACRON]
+ case '\u01FD':
+ // ǽ [LATIN SMALL LETTER AE WITH ACUTE]
+ case '\u1D02': // á´‚ [LATIN SMALL LETTER TURNED AE]
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\uA735': // ꜵ [LATIN SMALL LETTER AO]
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'o';
+ break;
+
+ case '\uA737': // ꜷ [LATIN SMALL LETTER AU]
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'u';
+ break;
+
+ case '\uA739':
+ // ꜹ [LATIN SMALL LETTER AV]
+ case '\uA73B': // ꜻ [LATIN SMALL LETTER AV WITH HORIZONTAL BAR]
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'v';
+ break;
+
+ case '\uA73D': // ꜽ [LATIN SMALL LETTER AY]
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'y';
+ break;
+
+ case '\u0181':
+ // � [LATIN CAPITAL LETTER B WITH HOOK]
+ case '\u0182':
+ // Æ‚ [LATIN CAPITAL LETTER B WITH TOPBAR]
+ case '\u0243':
+ // Ƀ [LATIN CAPITAL LETTER B WITH STROKE]
+ case '\u0299':
+ // Ê™ [LATIN LETTER SMALL CAPITAL B]
+ case '\u1D03':
+ // á´ƒ [LATIN LETTER SMALL CAPITAL BARRED B]
+ case '\u1E02':
+ // Ḃ [LATIN CAPITAL LETTER B WITH DOT ABOVE]
+ case '\u1E04':
+ // Ḅ [LATIN CAPITAL LETTER B WITH DOT BELOW]
+ case '\u1E06':
+ // Ḇ [LATIN CAPITAL LETTER B WITH LINE BELOW]
+ case '\u24B7':
+ // â’· [CIRCLED LATIN CAPITAL LETTER B]
+ case '\uFF22': // ï¼¢ [FULLWIDTH LATIN CAPITAL LETTER B]
+ output[outputPos++] = 'B';
+ break;
+
+ case '\u0180':
+ // ƀ [LATIN SMALL LETTER B WITH STROKE]
+ case '\u0183':
+ // ƃ [LATIN SMALL LETTER B WITH TOPBAR]
+ case '\u0253':
+ // É“ [LATIN SMALL LETTER B WITH HOOK]
+ case '\u1D6C':
+ // ᵬ [LATIN SMALL LETTER B WITH MIDDLE TILDE]
+ case '\u1D80':
+ // ᶀ [LATIN SMALL LETTER B WITH PALATAL HOOK]
+ case '\u1E03':
+ // ḃ [LATIN SMALL LETTER B WITH DOT ABOVE]
+ case '\u1E05':
+ // ḅ [LATIN SMALL LETTER B WITH DOT BELOW]
+ case '\u1E07':
+ // ḇ [LATIN SMALL LETTER B WITH LINE BELOW]
+ case '\u24D1':
+ // â“‘ [CIRCLED LATIN SMALL LETTER B]
+ case '\uFF42': // b [FULLWIDTH LATIN SMALL LETTER B]
+ output[outputPos++] = 'b';
+ break;
+
+ case '\u249D': // â’� [PARENTHESIZED LATIN SMALL LETTER B]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'b';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00C7':
+ // Ç [LATIN CAPITAL LETTER C WITH CEDILLA]
+ case '\u0106':
+ // Ć [LATIN CAPITAL LETTER C WITH ACUTE]
+ case '\u0108':
+ // Ĉ [LATIN CAPITAL LETTER C WITH CIRCUMFLEX]
+ case '\u010A':
+ // ÄŠ [LATIN CAPITAL LETTER C WITH DOT ABOVE]
+ case '\u010C':
+ // Č [LATIN CAPITAL LETTER C WITH CARON]
+ case '\u0187':
+ // Ƈ [LATIN CAPITAL LETTER C WITH HOOK]
+ case '\u023B':
+ // È» [LATIN CAPITAL LETTER C WITH STROKE]
+ case '\u0297':
+ // Ê— [LATIN LETTER STRETCHED C]
+ case '\u1D04':
+ // á´„ [LATIN LETTER SMALL CAPITAL C]
+ case '\u1E08':
+ // Ḉ [LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE]
+ case '\u24B8':
+ // â’¸ [CIRCLED LATIN CAPITAL LETTER C]
+ case '\uFF23': // ï¼£ [FULLWIDTH LATIN CAPITAL LETTER C]
+ output[outputPos++] = 'C';
+ break;
+
+ case '\u00E7':
+ // ç [LATIN SMALL LETTER C WITH CEDILLA]
+ case '\u0107':
+ // ć [LATIN SMALL LETTER C WITH ACUTE]
+ case '\u0109':
+ // ĉ [LATIN SMALL LETTER C WITH CIRCUMFLEX]
+ case '\u010B':
+ // Ä‹ [LATIN SMALL LETTER C WITH DOT ABOVE]
+ case '\u010D':
+ // � [LATIN SMALL LETTER C WITH CARON]
+ case '\u0188':
+ // ƈ [LATIN SMALL LETTER C WITH HOOK]
+ case '\u023C':
+ // ȼ [LATIN SMALL LETTER C WITH STROKE]
+ case '\u0255':
+ // É• [LATIN SMALL LETTER C WITH CURL]
+ case '\u1E09':
+ // ḉ [LATIN SMALL LETTER C WITH CEDILLA AND ACUTE]
+ case '\u2184':
+ // ↄ [LATIN SMALL LETTER REVERSED C]
+ case '\u24D2':
+ // â“’ [CIRCLED LATIN SMALL LETTER C]
+ case '\uA73E':
+ // Ꜿ [LATIN CAPITAL LETTER REVERSED C WITH DOT]
+ case '\uA73F':
+ // ꜿ [LATIN SMALL LETTER REVERSED C WITH DOT]
+ case '\uFF43': // c [FULLWIDTH LATIN SMALL LETTER C]
+ output[outputPos++] = 'c';
+ break;
+
+ case '\u249E': // â’ž [PARENTHESIZED LATIN SMALL LETTER C]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'c';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00D0':
+ // � [LATIN CAPITAL LETTER ETH]
+ case '\u010E':
+ // ÄŽ [LATIN CAPITAL LETTER D WITH CARON]
+ case '\u0110':
+ // � [LATIN CAPITAL LETTER D WITH STROKE]
+ case '\u0189':
+ // Ɖ [LATIN CAPITAL LETTER AFRICAN D]
+ case '\u018A':
+ // ÆŠ [LATIN CAPITAL LETTER D WITH HOOK]
+ case '\u018B':
+ // Æ‹ [LATIN CAPITAL LETTER D WITH TOPBAR]
+ case '\u1D05':
+ // á´… [LATIN LETTER SMALL CAPITAL D]
+ case '\u1D06':
+ // á´† [LATIN LETTER SMALL CAPITAL ETH]
+ case '\u1E0A':
+ // Ḋ [LATIN CAPITAL LETTER D WITH DOT ABOVE]
+ case '\u1E0C':
+ // Ḍ [LATIN CAPITAL LETTER D WITH DOT BELOW]
+ case '\u1E0E':
+ // Ḏ [LATIN CAPITAL LETTER D WITH LINE BELOW]
+ case '\u1E10':
+ // � [LATIN CAPITAL LETTER D WITH CEDILLA]
+ case '\u1E12':
+ // Ḓ [LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW]
+ case '\u24B9':
+ // â’¹ [CIRCLED LATIN CAPITAL LETTER D]
+ case '\uA779':
+ // � [LATIN CAPITAL LETTER INSULAR D]
+ case '\uFF24': // D [FULLWIDTH LATIN CAPITAL LETTER D]
+ output[outputPos++] = 'D';
+ break;
+
+ case '\u00F0':
+ // ð [LATIN SMALL LETTER ETH]
+ case '\u010F':
+ // � [LATIN SMALL LETTER D WITH CARON]
+ case '\u0111':
+ // Ä‘ [LATIN SMALL LETTER D WITH STROKE]
+ case '\u018C':
+ // ƌ [LATIN SMALL LETTER D WITH TOPBAR]
+ case '\u0221':
+ // È¡ [LATIN SMALL LETTER D WITH CURL]
+ case '\u0256':
+ // É– [LATIN SMALL LETTER D WITH TAIL]
+ case '\u0257':
+ // É— [LATIN SMALL LETTER D WITH HOOK]
+ case '\u1D6D':
+ // áµ­ [LATIN SMALL LETTER D WITH MIDDLE TILDE]
+ case '\u1D81':
+ // � [LATIN SMALL LETTER D WITH PALATAL HOOK]
+ case '\u1D91':
+ // ᶑ [LATIN SMALL LETTER D WITH HOOK AND TAIL]
+ case '\u1E0B':
+ // ḋ [LATIN SMALL LETTER D WITH DOT ABOVE]
+ case '\u1E0D':
+ // � [LATIN SMALL LETTER D WITH DOT BELOW]
+ case '\u1E0F':
+ // � [LATIN SMALL LETTER D WITH LINE BELOW]
+ case '\u1E11':
+ // ḑ [LATIN SMALL LETTER D WITH CEDILLA]
+ case '\u1E13':
+ // ḓ [LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW]
+ case '\u24D3':
+ // â““ [CIRCLED LATIN SMALL LETTER D]
+ case '\uA77A':
+ // � [LATIN SMALL LETTER INSULAR D]
+ case '\uFF44': // d [FULLWIDTH LATIN SMALL LETTER D]
+ output[outputPos++] = 'd';
+ break;
+
+ case '\u01C4':
+ // Ç„ [LATIN CAPITAL LETTER DZ WITH CARON]
+ case '\u01F1': // DZ [LATIN CAPITAL LETTER DZ]
+ output[outputPos++] = 'D';
+ output[outputPos++] = 'Z';
+ break;
+
+ case '\u01C5':
+ // Ç… [LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON]
+ case '\u01F2': // Dz [LATIN CAPITAL LETTER D WITH SMALL LETTER Z]
+ output[outputPos++] = 'D';
+ output[outputPos++] = 'z';
+ break;
+
+ case '\u249F': // â’Ÿ [PARENTHESIZED LATIN SMALL LETTER D]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'd';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0238': // ȸ [LATIN SMALL LETTER DB DIGRAPH]
+ output[outputPos++] = 'd';
+ output[outputPos++] = 'b';
+ break;
+
+ case '\u01C6':
+ // dž [LATIN SMALL LETTER DZ WITH CARON]
+ case '\u01F3':
+ // dz [LATIN SMALL LETTER DZ]
+ case '\u02A3':
+ // ʣ [LATIN SMALL LETTER DZ DIGRAPH]
+ case '\u02A5': // ʥ [LATIN SMALL LETTER DZ DIGRAPH WITH CURL]
+ output[outputPos++] = 'd';
+ output[outputPos++] = 'z';
+ break;
+
+ case '\u00C8':
+ // È [LATIN CAPITAL LETTER E WITH GRAVE]
+ case '\u00C9':
+ // É [LATIN CAPITAL LETTER E WITH ACUTE]
+ case '\u00CA':
+ // Ê [LATIN CAPITAL LETTER E WITH CIRCUMFLEX]
+ case '\u00CB':
+ // Ë [LATIN CAPITAL LETTER E WITH DIAERESIS]
+ case '\u0112':
+ // Ä’ [LATIN CAPITAL LETTER E WITH MACRON]
+ case '\u0114':
+ // �? [LATIN CAPITAL LETTER E WITH BREVE]
+ case '\u0116':
+ // Ä– [LATIN CAPITAL LETTER E WITH DOT ABOVE]
+ case '\u0118':
+ // Ę [LATIN CAPITAL LETTER E WITH OGONEK]
+ case '\u011A':
+ // Äš [LATIN CAPITAL LETTER E WITH CARON]
+ case '\u018E':
+ // ÆŽ [LATIN CAPITAL LETTER REVERSED E]
+ case '\u0190':
+ // � [LATIN CAPITAL LETTER OPEN E]
+ case '\u0204':
+ // È„ [LATIN CAPITAL LETTER E WITH DOUBLE GRAVE]
+ case '\u0206':
+ // Ȇ [LATIN CAPITAL LETTER E WITH INVERTED BREVE]
+ case '\u0228':
+ // Ȩ [LATIN CAPITAL LETTER E WITH CEDILLA]
+ case '\u0246':
+ // Ɇ [LATIN CAPITAL LETTER E WITH STROKE]
+ case '\u1D07':
+ // á´‡ [LATIN LETTER SMALL CAPITAL E]
+ case '\u1E14':
+ // �? [LATIN CAPITAL LETTER E WITH MACRON AND GRAVE]
+ case '\u1E16':
+ // Ḗ [LATIN CAPITAL LETTER E WITH MACRON AND ACUTE]
+ case '\u1E18':
+ // Ḙ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW]
+ case '\u1E1A':
+ // Ḛ [LATIN CAPITAL LETTER E WITH TILDE BELOW]
+ case '\u1E1C':
+ // Ḝ [LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE]
+ case '\u1EB8':
+ // Ẹ [LATIN CAPITAL LETTER E WITH DOT BELOW]
+ case '\u1EBA':
+ // Ẻ [LATIN CAPITAL LETTER E WITH HOOK ABOVE]
+ case '\u1EBC':
+ // Ẽ [LATIN CAPITAL LETTER E WITH TILDE]
+ case '\u1EBE':
+ // Ế [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE]
+ case '\u1EC0':
+ // Ề [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE]
+ case '\u1EC2':
+ // Ể [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE]
+ case '\u1EC4':
+ // Ễ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE]
+ case '\u1EC6':
+ // Ệ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW]
+ case '\u24BA':
+ // â’º [CIRCLED LATIN CAPITAL LETTER E]
+ case '\u2C7B':
+ // â±» [LATIN LETTER SMALL CAPITAL TURNED E]
+ case '\uFF25': // ï¼¥ [FULLWIDTH LATIN CAPITAL LETTER E]
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00E8':
+ // è [LATIN SMALL LETTER E WITH GRAVE]
+ case '\u00E9':
+ // é [LATIN SMALL LETTER E WITH ACUTE]
+ case '\u00EA':
+ // ê [LATIN SMALL LETTER E WITH CIRCUMFLEX]
+ case '\u00EB':
+ // ë [LATIN SMALL LETTER E WITH DIAERESIS]
+ case '\u0113':
+ // Ä“ [LATIN SMALL LETTER E WITH MACRON]
+ case '\u0115':
+ // Ä• [LATIN SMALL LETTER E WITH BREVE]
+ case '\u0117':
+ // Ä— [LATIN SMALL LETTER E WITH DOT ABOVE]
+ case '\u0119':
+ // Ä™ [LATIN SMALL LETTER E WITH OGONEK]
+ case '\u011B':
+ // Ä› [LATIN SMALL LETTER E WITH CARON]
+ case '\u01DD':
+ // � [LATIN SMALL LETTER TURNED E]
+ case '\u0205':
+ // È… [LATIN SMALL LETTER E WITH DOUBLE GRAVE]
+ case '\u0207':
+ // ȇ [LATIN SMALL LETTER E WITH INVERTED BREVE]
+ case '\u0229':
+ // È© [LATIN SMALL LETTER E WITH CEDILLA]
+ case '\u0247':
+ // ɇ [LATIN SMALL LETTER E WITH STROKE]
+ case '\u0258':
+ // ɘ [LATIN SMALL LETTER REVERSED E]
+ case '\u025B':
+ // É› [LATIN SMALL LETTER OPEN E]
+ case '\u025C':
+ // ɜ [LATIN SMALL LETTER REVERSED OPEN E]
+ case '\u025D':
+ // � [LATIN SMALL LETTER REVERSED OPEN E WITH HOOK]
+ case '\u025E':
+ // Éž [LATIN SMALL LETTER CLOSED REVERSED OPEN E]
+ case '\u029A':
+ // Êš [LATIN SMALL LETTER CLOSED OPEN E]
+ case '\u1D08':
+ // á´ˆ [LATIN SMALL LETTER TURNED OPEN E]
+ case '\u1D92':
+ // ᶒ [LATIN SMALL LETTER E WITH RETROFLEX HOOK]
+ case '\u1D93':
+ // ᶓ [LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK]
+ case '\u1D94':
+ // �? [LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK]
+ case '\u1E15':
+ // ḕ [LATIN SMALL LETTER E WITH MACRON AND GRAVE]
+ case '\u1E17':
+ // ḗ [LATIN SMALL LETTER E WITH MACRON AND ACUTE]
+ case '\u1E19':
+ // ḙ [LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW]
+ case '\u1E1B':
+ // ḛ [LATIN SMALL LETTER E WITH TILDE BELOW]
+ case '\u1E1D':
+ // � [LATIN SMALL LETTER E WITH CEDILLA AND BREVE]
+ case '\u1EB9':
+ // ẹ [LATIN SMALL LETTER E WITH DOT BELOW]
+ case '\u1EBB':
+ // ẻ [LATIN SMALL LETTER E WITH HOOK ABOVE]
+ case '\u1EBD':
+ // ẽ [LATIN SMALL LETTER E WITH TILDE]
+ case '\u1EBF':
+ // ế [LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE]
+ case '\u1EC1':
+ // � [LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE]
+ case '\u1EC3':
+ // ể [LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE]
+ case '\u1EC5':
+ // á»… [LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE]
+ case '\u1EC7':
+ // ệ [LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW]
+ case '\u2091':
+ // â‚‘ [LATIN SUBSCRIPT SMALL LETTER E]
+ case '\u24D4':
+ // �? [CIRCLED LATIN SMALL LETTER E]
+ case '\u2C78':
+ // ⱸ [LATIN SMALL LETTER E WITH NOTCH]
+ case '\uFF45': // ï½… [FULLWIDTH LATIN SMALL LETTER E]
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u24A0': // â’  [PARENTHESIZED LATIN SMALL LETTER E]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'e';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0191':
+ // Æ‘ [LATIN CAPITAL LETTER F WITH HOOK]
+ case '\u1E1E':
+ // Ḟ [LATIN CAPITAL LETTER F WITH DOT ABOVE]
+ case '\u24BB':
+ // â’» [CIRCLED LATIN CAPITAL LETTER F]
+ case '\uA730':
+ // ꜰ [LATIN LETTER SMALL CAPITAL F]
+ case '\uA77B':
+ // � [LATIN CAPITAL LETTER INSULAR F]
+ case '\uA7FB':
+ // ꟻ [LATIN EPIGRAPHIC LETTER REVERSED F]
+ case '\uFF26': // F [FULLWIDTH LATIN CAPITAL LETTER F]
+ output[outputPos++] = 'F';
+ break;
+
+ case '\u0192':
+ // Æ’ [LATIN SMALL LETTER F WITH HOOK]
+ case '\u1D6E':
+ // áµ® [LATIN SMALL LETTER F WITH MIDDLE TILDE]
+ case '\u1D82':
+ // ᶂ [LATIN SMALL LETTER F WITH PALATAL HOOK]
+ case '\u1E1F':
+ // ḟ [LATIN SMALL LETTER F WITH DOT ABOVE]
+ case '\u1E9B':
+ // ẛ [LATIN SMALL LETTER LONG S WITH DOT ABOVE]
+ case '\u24D5':
+ // â“• [CIRCLED LATIN SMALL LETTER F]
+ case '\uA77C':
+ // � [LATIN SMALL LETTER INSULAR F]
+ case '\uFF46': // f [FULLWIDTH LATIN SMALL LETTER F]
+ output[outputPos++] = 'f';
+ break;
+
+ case '\u24A1': // â’¡ [PARENTHESIZED LATIN SMALL LETTER F]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'f';
+ output[outputPos++] = ')';
+ break;
+
+ case '\uFB00': // ff [LATIN SMALL LIGATURE FF]
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'f';
+ break;
+
+ case '\uFB03': // ffi [LATIN SMALL LIGATURE FFI]
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'i';
+ break;
+
+ case '\uFB04': // ffl [LATIN SMALL LIGATURE FFL]
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'l';
+ break;
+
+ case '\uFB01': // � [LATIN SMALL LIGATURE FI]
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'i';
+ break;
+
+ case '\uFB02': // fl [LATIN SMALL LIGATURE FL]
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'l';
+ break;
+
+ case '\u011C':
+ // Ĝ [LATIN CAPITAL LETTER G WITH CIRCUMFLEX]
+ case '\u011E':
+ // Äž [LATIN CAPITAL LETTER G WITH BREVE]
+ case '\u0120':
+ // Ä  [LATIN CAPITAL LETTER G WITH DOT ABOVE]
+ case '\u0122':
+ // Ģ [LATIN CAPITAL LETTER G WITH CEDILLA]
+ case '\u0193':
+ // Æ“ [LATIN CAPITAL LETTER G WITH HOOK]
+ case '\u01E4':
+ // Ǥ [LATIN CAPITAL LETTER G WITH STROKE]
+ case '\u01E5':
+ // ǥ [LATIN SMALL LETTER G WITH STROKE]
+ case '\u01E6':
+ // Ǧ [LATIN CAPITAL LETTER G WITH CARON]
+ case '\u01E7':
+ // ǧ [LATIN SMALL LETTER G WITH CARON]
+ case '\u01F4':
+ // Ç´ [LATIN CAPITAL LETTER G WITH ACUTE]
+ case '\u0262':
+ // ɢ [LATIN LETTER SMALL CAPITAL G]
+ case '\u029B':
+ // Ê› [LATIN LETTER SMALL CAPITAL G WITH HOOK]
+ case '\u1E20':
+ // Ḡ [LATIN CAPITAL LETTER G WITH MACRON]
+ case '\u24BC':
+ // â’¼ [CIRCLED LATIN CAPITAL LETTER G]
+ case '\uA77D':
+ // � [LATIN CAPITAL LETTER INSULAR G]
+ case '\uA77E':
+ // � [LATIN CAPITAL LETTER TURNED INSULAR G]
+ case '\uFF27': // G [FULLWIDTH LATIN CAPITAL LETTER G]
+ output[outputPos++] = 'G';
+ break;
+
+ case '\u011D':
+ // � [LATIN SMALL LETTER G WITH CIRCUMFLEX]
+ case '\u011F':
+ // ÄŸ [LATIN SMALL LETTER G WITH BREVE]
+ case '\u0121':
+ // Ä¡ [LATIN SMALL LETTER G WITH DOT ABOVE]
+ case '\u0123':
+ // ģ [LATIN SMALL LETTER G WITH CEDILLA]
+ case '\u01F5':
+ // ǵ [LATIN SMALL LETTER G WITH ACUTE]
+ case '\u0260':
+ // É  [LATIN SMALL LETTER G WITH HOOK]
+ case '\u0261':
+ // É¡ [LATIN SMALL LETTER SCRIPT G]
+ case '\u1D77':
+ // áµ· [LATIN SMALL LETTER TURNED G]
+ case '\u1D79':
+ // áµ¹ [LATIN SMALL LETTER INSULAR G]
+ case '\u1D83':
+ // ᶃ [LATIN SMALL LETTER G WITH PALATAL HOOK]
+ case '\u1E21':
+ // ḡ [LATIN SMALL LETTER G WITH MACRON]
+ case '\u24D6':
+ // â“– [CIRCLED LATIN SMALL LETTER G]
+ case '\uA77F':
+ // � [LATIN SMALL LETTER TURNED INSULAR G]
+ case '\uFF47': // g [FULLWIDTH LATIN SMALL LETTER G]
+ output[outputPos++] = 'g';
+ break;
+
+ case '\u24A2': // â’¢ [PARENTHESIZED LATIN SMALL LETTER G]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'g';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0124':
+ // Ĥ [LATIN CAPITAL LETTER H WITH CIRCUMFLEX]
+ case '\u0126':
+ // Ħ [LATIN CAPITAL LETTER H WITH STROKE]
+ case '\u021E':
+ // Èž [LATIN CAPITAL LETTER H WITH CARON]
+ case '\u029C':
+ // ʜ [LATIN LETTER SMALL CAPITAL H]
+ case '\u1E22':
+ // Ḣ [LATIN CAPITAL LETTER H WITH DOT ABOVE]
+ case '\u1E24':
+ // Ḥ [LATIN CAPITAL LETTER H WITH DOT BELOW]
+ case '\u1E26':
+ // Ḧ [LATIN CAPITAL LETTER H WITH DIAERESIS]
+ case '\u1E28':
+ // Ḩ [LATIN CAPITAL LETTER H WITH CEDILLA]
+ case '\u1E2A':
+ // Ḫ [LATIN CAPITAL LETTER H WITH BREVE BELOW]
+ case '\u24BD':
+ // â’½ [CIRCLED LATIN CAPITAL LETTER H]
+ case '\u2C67':
+ // Ⱨ [LATIN CAPITAL LETTER H WITH DESCENDER]
+ case '\u2C75':
+ // â±µ [LATIN CAPITAL LETTER HALF H]
+ case '\uFF28': // H [FULLWIDTH LATIN CAPITAL LETTER H]
+ output[outputPos++] = 'H';
+ break;
+
+ case '\u0125':
+ // ĥ [LATIN SMALL LETTER H WITH CIRCUMFLEX]
+ case '\u0127':
+ // ħ [LATIN SMALL LETTER H WITH STROKE]
+ case '\u021F':
+ // ÈŸ [LATIN SMALL LETTER H WITH CARON]
+ case '\u0265':
+ // ɥ [LATIN SMALL LETTER TURNED H]
+ case '\u0266':
+ // ɦ [LATIN SMALL LETTER H WITH HOOK]
+ case '\u02AE':
+ // Ê® [LATIN SMALL LETTER TURNED H WITH FISHHOOK]
+ case '\u02AF':
+ // ʯ [LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL]
+ case '\u1E23':
+ // ḣ [LATIN SMALL LETTER H WITH DOT ABOVE]
+ case '\u1E25':
+ // ḥ [LATIN SMALL LETTER H WITH DOT BELOW]
+ case '\u1E27':
+ // ḧ [LATIN SMALL LETTER H WITH DIAERESIS]
+ case '\u1E29':
+ // ḩ [LATIN SMALL LETTER H WITH CEDILLA]
+ case '\u1E2B':
+ // ḫ [LATIN SMALL LETTER H WITH BREVE BELOW]
+ case '\u1E96':
+ // ẖ [LATIN SMALL LETTER H WITH LINE BELOW]
+ case '\u24D7':
+ // â“— [CIRCLED LATIN SMALL LETTER H]
+ case '\u2C68':
+ // ⱨ [LATIN SMALL LETTER H WITH DESCENDER]
+ case '\u2C76':
+ // ⱶ [LATIN SMALL LETTER HALF H]
+ case '\uFF48': // h [FULLWIDTH LATIN SMALL LETTER H]
+ output[outputPos++] = 'h';
+ break;
+
+ case '\u01F6': // Ƕ http://en.wikipedia.org/wiki/Hwair [LATIN CAPITAL LETTER HWAIR]
+ output[outputPos++] = 'H';
+ output[outputPos++] = 'V';
+ break;
+
+ case '\u24A3': // â’£ [PARENTHESIZED LATIN SMALL LETTER H]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'h';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0195': // Æ• [LATIN SMALL LETTER HV]
+ output[outputPos++] = 'h';
+ output[outputPos++] = 'v';
+ break;
+
+ case '\u00CC':
+ // Ì [LATIN CAPITAL LETTER I WITH GRAVE]
+ case '\u00CD':
+ // Í [LATIN CAPITAL LETTER I WITH ACUTE]
+ case '\u00CE':
+ // ÃŽ [LATIN CAPITAL LETTER I WITH CIRCUMFLEX]
+ case '\u00CF':
+ // Ï [LATIN CAPITAL LETTER I WITH DIAERESIS]
+ case '\u0128':
+ // Ĩ [LATIN CAPITAL LETTER I WITH TILDE]
+ case '\u012A':
+ // Ī [LATIN CAPITAL LETTER I WITH MACRON]
+ case '\u012C':
+ // Ĭ [LATIN CAPITAL LETTER I WITH BREVE]
+ case '\u012E':
+ // Ä® [LATIN CAPITAL LETTER I WITH OGONEK]
+ case '\u0130':
+ // Ä° [LATIN CAPITAL LETTER I WITH DOT ABOVE]
+ case '\u0196':
+ // Æ– [LATIN CAPITAL LETTER IOTA]
+ case '\u0197':
+ // Æ— [LATIN CAPITAL LETTER I WITH STROKE]
+ case '\u01CF':
+ // Ǐ [LATIN CAPITAL LETTER I WITH CARON]
+ case '\u0208':
+ // Ȉ [LATIN CAPITAL LETTER I WITH DOUBLE GRAVE]
+ case '\u020A':
+ // ÈŠ [LATIN CAPITAL LETTER I WITH INVERTED BREVE]
+ case '\u026A':
+ // ɪ [LATIN LETTER SMALL CAPITAL I]
+ case '\u1D7B':
+ // áµ» [LATIN SMALL CAPITAL LETTER I WITH STROKE]
+ case '\u1E2C':
+ // Ḭ [LATIN CAPITAL LETTER I WITH TILDE BELOW]
+ case '\u1E2E':
+ // Ḯ [LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE]
+ case '\u1EC8':
+ // Ỉ [LATIN CAPITAL LETTER I WITH HOOK ABOVE]
+ case '\u1ECA':
+ // Ị [LATIN CAPITAL LETTER I WITH DOT BELOW]
+ case '\u24BE':
+ // â’¾ [CIRCLED LATIN CAPITAL LETTER I]
+ case '\uA7FE':
+ // ꟾ [LATIN EPIGRAPHIC LETTER I LONGA]
+ case '\uFF29': // I [FULLWIDTH LATIN CAPITAL LETTER I]
+ output[outputPos++] = 'I';
+ break;
+
+ case '\u00EC':
+ // ì [LATIN SMALL LETTER I WITH GRAVE]
+ case '\u00ED':
+ // í [LATIN SMALL LETTER I WITH ACUTE]
+ case '\u00EE':
+ // î [LATIN SMALL LETTER I WITH CIRCUMFLEX]
+ case '\u00EF':
+ // ï [LATIN SMALL LETTER I WITH DIAERESIS]
+ case '\u0129':
+ // Ä© [LATIN SMALL LETTER I WITH TILDE]
+ case '\u012B':
+ // Ä« [LATIN SMALL LETTER I WITH MACRON]
+ case '\u012D':
+ // Ä­ [LATIN SMALL LETTER I WITH BREVE]
+ case '\u012F':
+ // į [LATIN SMALL LETTER I WITH OGONEK]
+ case '\u0131':
+ // ı [LATIN SMALL LETTER DOTLESS I]
+ case '\u01D0':
+ // ǐ [LATIN SMALL LETTER I WITH CARON]
+ case '\u0209':
+ // ȉ [LATIN SMALL LETTER I WITH DOUBLE GRAVE]
+ case '\u020B':
+ // È‹ [LATIN SMALL LETTER I WITH INVERTED BREVE]
+ case '\u0268':
+ // ɨ [LATIN SMALL LETTER I WITH STROKE]
+ case '\u1D09':
+ // á´‰ [LATIN SMALL LETTER TURNED I]
+ case '\u1D62':
+ // áµ¢ [LATIN SUBSCRIPT SMALL LETTER I]
+ case '\u1D7C':
+ // áµ¼ [LATIN SMALL LETTER IOTA WITH STROKE]
+ case '\u1D96':
+ // ᶖ [LATIN SMALL LETTER I WITH RETROFLEX HOOK]
+ case '\u1E2D':
+ // ḭ [LATIN SMALL LETTER I WITH TILDE BELOW]
+ case '\u1E2F':
+ // ḯ [LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE]
+ case '\u1EC9':
+ // ỉ [LATIN SMALL LETTER I WITH HOOK ABOVE]
+ case '\u1ECB':
+ // ị [LATIN SMALL LETTER I WITH DOT BELOW]
+ case '\u2071':
+ // ⁱ [SUPERSCRIPT LATIN SMALL LETTER I]
+ case '\u24D8':
+ // ⓘ [CIRCLED LATIN SMALL LETTER I]
+ case '\uFF49': // i [FULLWIDTH LATIN SMALL LETTER I]
+ output[outputPos++] = 'i';
+ break;
+
+ case '\u0132': // IJ [LATIN CAPITAL LIGATURE IJ]
+ output[outputPos++] = 'I';
+ output[outputPos++] = 'J';
+ break;
+
+ case '\u24A4': // â’¤ [PARENTHESIZED LATIN SMALL LETTER I]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'i';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0133': // ij [LATIN SMALL LIGATURE IJ]
+ output[outputPos++] = 'i';
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u0134':
+ // Ä´ [LATIN CAPITAL LETTER J WITH CIRCUMFLEX]
+ case '\u0248':
+ // Ɉ [LATIN CAPITAL LETTER J WITH STROKE]
+ case '\u1D0A':
+ // á´Š [LATIN LETTER SMALL CAPITAL J]
+ case '\u24BF':
+ // â’¿ [CIRCLED LATIN CAPITAL LETTER J]
+ case '\uFF2A': // J [FULLWIDTH LATIN CAPITAL LETTER J]
+ output[outputPos++] = 'J';
+ break;
+
+ case '\u0135':
+ // ĵ [LATIN SMALL LETTER J WITH CIRCUMFLEX]
+ case '\u01F0':
+ // Ç° [LATIN SMALL LETTER J WITH CARON]
+ case '\u0237':
+ // È· [LATIN SMALL LETTER DOTLESS J]
+ case '\u0249':
+ // ɉ [LATIN SMALL LETTER J WITH STROKE]
+ case '\u025F':
+ // ÉŸ [LATIN SMALL LETTER DOTLESS J WITH STROKE]
+ case '\u0284':
+ // Ê„ [LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK]
+ case '\u029D':
+ // ʝ [LATIN SMALL LETTER J WITH CROSSED-TAIL]
+ case '\u24D9':
+ // â“™ [CIRCLED LATIN SMALL LETTER J]
+ case '\u2C7C':
+ // â±¼ [LATIN SUBSCRIPT SMALL LETTER J]
+ case '\uFF4A': // j [FULLWIDTH LATIN SMALL LETTER J]
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u24A5': // â’¥ [PARENTHESIZED LATIN SMALL LETTER J]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'j';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0136':
+ // Ķ [LATIN CAPITAL LETTER K WITH CEDILLA]
+ case '\u0198':
+ // Ƙ [LATIN CAPITAL LETTER K WITH HOOK]
+ case '\u01E8':
+ // Ǩ [LATIN CAPITAL LETTER K WITH CARON]
+ case '\u1D0B':
+ // á´‹ [LATIN LETTER SMALL CAPITAL K]
+ case '\u1E30':
+ // Ḱ [LATIN CAPITAL LETTER K WITH ACUTE]
+ case '\u1E32':
+ // Ḳ [LATIN CAPITAL LETTER K WITH DOT BELOW]
+ case '\u1E34':
+ // Ḵ [LATIN CAPITAL LETTER K WITH LINE BELOW]
+ case '\u24C0':
+ // â“€ [CIRCLED LATIN CAPITAL LETTER K]
+ case '\u2C69':
+ // Ⱪ [LATIN CAPITAL LETTER K WITH DESCENDER]
+ case '\uA740':
+ // Ꝁ [LATIN CAPITAL LETTER K WITH STROKE]
+ case '\uA742':
+ // Ꝃ [LATIN CAPITAL LETTER K WITH DIAGONAL STROKE]
+ case '\uA744':
+ // Ꝅ [LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE]
+ case '\uFF2B': // K [FULLWIDTH LATIN CAPITAL LETTER K]
+ output[outputPos++] = 'K';
+ break;
+
+ case '\u0137':
+ // Ä· [LATIN SMALL LETTER K WITH CEDILLA]
+ case '\u0199':
+ // Æ™ [LATIN SMALL LETTER K WITH HOOK]
+ case '\u01E9':
+ // Ç© [LATIN SMALL LETTER K WITH CARON]
+ case '\u029E':
+ // Êž [LATIN SMALL LETTER TURNED K]
+ case '\u1D84':
+ // ᶄ [LATIN SMALL LETTER K WITH PALATAL HOOK]
+ case '\u1E31':
+ // ḱ [LATIN SMALL LETTER K WITH ACUTE]
+ case '\u1E33':
+ // ḳ [LATIN SMALL LETTER K WITH DOT BELOW]
+ case '\u1E35':
+ // ḵ [LATIN SMALL LETTER K WITH LINE BELOW]
+ case '\u24DA':
+ // â“š [CIRCLED LATIN SMALL LETTER K]
+ case '\u2C6A':
+ // ⱪ [LATIN SMALL LETTER K WITH DESCENDER]
+ case '\uA741':
+ // ꝁ [LATIN SMALL LETTER K WITH STROKE]
+ case '\uA743':
+ // ꝃ [LATIN SMALL LETTER K WITH DIAGONAL STROKE]
+ case '\uA745':
+ // ꝅ [LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE]
+ case '\uFF4B': // k [FULLWIDTH LATIN SMALL LETTER K]
+ output[outputPos++] = 'k';
+ break;
+
+ case '\u24A6': // â’¦ [PARENTHESIZED LATIN SMALL LETTER K]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'k';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0139':
+ // Ĺ [LATIN CAPITAL LETTER L WITH ACUTE]
+ case '\u013B':
+ // Ä» [LATIN CAPITAL LETTER L WITH CEDILLA]
+ case '\u013D':
+ // Ľ [LATIN CAPITAL LETTER L WITH CARON]
+ case '\u013F':
+ // Ä¿ [LATIN CAPITAL LETTER L WITH MIDDLE DOT]
+ case '\u0141':
+ // Ł [LATIN CAPITAL LETTER L WITH STROKE]
+ case '\u023D':
+ // Ƚ [LATIN CAPITAL LETTER L WITH BAR]
+ case '\u029F':
+ // ÊŸ [LATIN LETTER SMALL CAPITAL L]
+ case '\u1D0C':
+ // ᴌ [LATIN LETTER SMALL CAPITAL L WITH STROKE]
+ case '\u1E36':
+ // Ḷ [LATIN CAPITAL LETTER L WITH DOT BELOW]
+ case '\u1E38':
+ // Ḹ [LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON]
+ case '\u1E3A':
+ // Ḻ [LATIN CAPITAL LETTER L WITH LINE BELOW]
+ case '\u1E3C':
+ // Ḽ [LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW]
+ case '\u24C1':
+ // Ⓛ [CIRCLED LATIN CAPITAL LETTER L]
+ case '\u2C60':
+ // â±  [LATIN CAPITAL LETTER L WITH DOUBLE BAR]
+ case '\u2C62':
+ // â±¢ [LATIN CAPITAL LETTER L WITH MIDDLE TILDE]
+ case '\uA746':
+ // Ꝇ [LATIN CAPITAL LETTER BROKEN L]
+ case '\uA748':
+ // Ꝉ [LATIN CAPITAL LETTER L WITH HIGH STROKE]
+ case '\uA780':
+ // Ꞁ [LATIN CAPITAL LETTER TURNED L]
+ case '\uFF2C': // L [FULLWIDTH LATIN CAPITAL LETTER L]
+ output[outputPos++] = 'L';
+ break;
+
+ case '\u013A':
+ // ĺ [LATIN SMALL LETTER L WITH ACUTE]
+ case '\u013C':
+ // ļ [LATIN SMALL LETTER L WITH CEDILLA]
+ case '\u013E':
+ // ľ [LATIN SMALL LETTER L WITH CARON]
+ case '\u0140':
+ // ŀ [LATIN SMALL LETTER L WITH MIDDLE DOT]
+ case '\u0142':
+ // Å‚ [LATIN SMALL LETTER L WITH STROKE]
+ case '\u019A':
+ // Æš [LATIN SMALL LETTER L WITH BAR]
+ case '\u0234':
+ // È´ [LATIN SMALL LETTER L WITH CURL]
+ case '\u026B':
+ // É« [LATIN SMALL LETTER L WITH MIDDLE TILDE]
+ case '\u026C':
+ // ɬ [LATIN SMALL LETTER L WITH BELT]
+ case '\u026D':
+ // É­ [LATIN SMALL LETTER L WITH RETROFLEX HOOK]
+ case '\u1D85':
+ // ᶅ [LATIN SMALL LETTER L WITH PALATAL HOOK]
+ case '\u1E37':
+ // ḷ [LATIN SMALL LETTER L WITH DOT BELOW]
+ case '\u1E39':
+ // ḹ [LATIN SMALL LETTER L WITH DOT BELOW AND MACRON]
+ case '\u1E3B':
+ // ḻ [LATIN SMALL LETTER L WITH LINE BELOW]
+ case '\u1E3D':
+ // ḽ [LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW]
+ case '\u24DB':
+ // â“› [CIRCLED LATIN SMALL LETTER L]
+ case '\u2C61':
+ // ⱡ [LATIN SMALL LETTER L WITH DOUBLE BAR]
+ case '\uA747':
+ // ꝇ [LATIN SMALL LETTER BROKEN L]
+ case '\uA749':
+ // ꝉ [LATIN SMALL LETTER L WITH HIGH STROKE]
+ case '\uA781':
+ // ꞁ [LATIN SMALL LETTER TURNED L]
+ case '\uFF4C': // l [FULLWIDTH LATIN SMALL LETTER L]
+ output[outputPos++] = 'l';
+ break;
+
+ case '\u01C7': // LJ [LATIN CAPITAL LETTER LJ]
+ output[outputPos++] = 'L';
+ output[outputPos++] = 'J';
+ break;
+
+ case '\u1EFA': // Ỻ [LATIN CAPITAL LETTER MIDDLE-WELSH LL]
+ output[outputPos++] = 'L';
+ output[outputPos++] = 'L';
+ break;
+
+ case '\u01C8': // Lj [LATIN CAPITAL LETTER L WITH SMALL LETTER J]
+ output[outputPos++] = 'L';
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u24A7': // â’§ [PARENTHESIZED LATIN SMALL LETTER L]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'l';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u01C9': // lj [LATIN SMALL LETTER LJ]
+ output[outputPos++] = 'l';
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u1EFB': // á»» [LATIN SMALL LETTER MIDDLE-WELSH LL]
+ output[outputPos++] = 'l';
+ output[outputPos++] = 'l';
+ break;
+
+ case '\u02AA': // ʪ [LATIN SMALL LETTER LS DIGRAPH]
+ output[outputPos++] = 'l';
+ output[outputPos++] = 's';
+ break;
+
+ case '\u02AB': // Ê« [LATIN SMALL LETTER LZ DIGRAPH]
+ output[outputPos++] = 'l';
+ output[outputPos++] = 'z';
+ break;
+
+ case '\u019C':
+ // Ɯ [LATIN CAPITAL LETTER TURNED M]
+ case '\u1D0D':
+ // ᴍ [LATIN LETTER SMALL CAPITAL M]
+ case '\u1E3E':
+ // Ḿ [LATIN CAPITAL LETTER M WITH ACUTE]
+ case '\u1E40':
+ // á¹€ [LATIN CAPITAL LETTER M WITH DOT ABOVE]
+ case '\u1E42':
+ // Ṃ [LATIN CAPITAL LETTER M WITH DOT BELOW]
+ case '\u24C2':
+ // â“‚ [CIRCLED LATIN CAPITAL LETTER M]
+ case '\u2C6E':
+ // â±® [LATIN CAPITAL LETTER M WITH HOOK]
+ case '\uA7FD':
+ // ꟽ [LATIN EPIGRAPHIC LETTER INVERTED M]
+ case '\uA7FF':
+ // ꟿ [LATIN EPIGRAPHIC LETTER ARCHAIC M]
+ case '\uFF2D': // ï¼­ [FULLWIDTH LATIN CAPITAL LETTER M]
+ output[outputPos++] = 'M';
+ break;
+
+ case '\u026F':
+ // ɯ [LATIN SMALL LETTER TURNED M]
+ case '\u0270':
+ // É° [LATIN SMALL LETTER TURNED M WITH LONG LEG]
+ case '\u0271':
+ // ɱ [LATIN SMALL LETTER M WITH HOOK]
+ case '\u1D6F':
+ // ᵯ [LATIN SMALL LETTER M WITH MIDDLE TILDE]
+ case '\u1D86':
+ // ᶆ [LATIN SMALL LETTER M WITH PALATAL HOOK]
+ case '\u1E3F':
+ // ḿ [LATIN SMALL LETTER M WITH ACUTE]
+ case '\u1E41':
+ // ṁ [LATIN SMALL LETTER M WITH DOT ABOVE]
+ case '\u1E43':
+ // ṃ [LATIN SMALL LETTER M WITH DOT BELOW]
+ case '\u24DC':
+ // ⓜ [CIRCLED LATIN SMALL LETTER M]
+ case '\uFF4D': // ｍ [FULLWIDTH LATIN SMALL LETTER M]
+ output[outputPos++] = 'm';
+ break;
+
+ case '\u24A8': // â’¨ [PARENTHESIZED LATIN SMALL LETTER M]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'm';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00D1':
+ // Ñ [LATIN CAPITAL LETTER N WITH TILDE]
+ case '\u0143':
+ // Ń [LATIN CAPITAL LETTER N WITH ACUTE]
+ case '\u0145':
+ // Å… [LATIN CAPITAL LETTER N WITH CEDILLA]
+ case '\u0147':
+ // Ň [LATIN CAPITAL LETTER N WITH CARON]
+ case '\u014A':
+ // Ŋ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN CAPITAL LETTER ENG]
+ case '\u019D':
+ // Ɲ [LATIN CAPITAL LETTER N WITH LEFT HOOK]
+ case '\u01F8':
+ // Ǹ [LATIN CAPITAL LETTER N WITH GRAVE]
+ case '\u0220':
+ // È  [LATIN CAPITAL LETTER N WITH LONG RIGHT LEG]
+ case '\u0274':
+ // É´ [LATIN LETTER SMALL CAPITAL N]
+ case '\u1D0E':
+ // á´Ž [LATIN LETTER SMALL CAPITAL REVERSED N]
+ case '\u1E44':
+ // Ṅ [LATIN CAPITAL LETTER N WITH DOT ABOVE]
+ case '\u1E46':
+ // Ṇ [LATIN CAPITAL LETTER N WITH DOT BELOW]
+ case '\u1E48':
+ // Ṉ [LATIN CAPITAL LETTER N WITH LINE BELOW]
+ case '\u1E4A':
+ // Ṋ [LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW]
+ case '\u24C3':
+ // Ⓝ [CIRCLED LATIN CAPITAL LETTER N]
+ case '\uFF2E': // ï¼® [FULLWIDTH LATIN CAPITAL LETTER N]
+ output[outputPos++] = 'N';
+ break;
+
+ case '\u00F1':
+ // ñ [LATIN SMALL LETTER N WITH TILDE]
+ case '\u0144':
+ // Å„ [LATIN SMALL LETTER N WITH ACUTE]
+ case '\u0146':
+ // ņ [LATIN SMALL LETTER N WITH CEDILLA]
+ case '\u0148':
+ // ň [LATIN SMALL LETTER N WITH CARON]
+ case '\u0149':
+ // ʼn [LATIN SMALL LETTER N PRECEDED BY APOSTROPHE]
+ case '\u014B':
+ // Å‹ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN SMALL LETTER ENG]
+ case '\u019E':
+ // Æž [LATIN SMALL LETTER N WITH LONG RIGHT LEG]
+ case '\u01F9':
+ // ǹ [LATIN SMALL LETTER N WITH GRAVE]
+ case '\u0235':
+ // ȵ [LATIN SMALL LETTER N WITH CURL]
+ case '\u0272':
+ // ɲ [LATIN SMALL LETTER N WITH LEFT HOOK]
+ case '\u0273':
+ // ɳ [LATIN SMALL LETTER N WITH RETROFLEX HOOK]
+ case '\u1D70':
+ // áµ° [LATIN SMALL LETTER N WITH MIDDLE TILDE]
+ case '\u1D87':
+ // ᶇ [LATIN SMALL LETTER N WITH PALATAL HOOK]
+ case '\u1E45':
+ // á¹… [LATIN SMALL LETTER N WITH DOT ABOVE]
+ case '\u1E47':
+ // ṇ [LATIN SMALL LETTER N WITH DOT BELOW]
+ case '\u1E49':
+ // ṉ [LATIN SMALL LETTER N WITH LINE BELOW]
+ case '\u1E4B':
+ // ṋ [LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW]
+ case '\u207F':
+ // ⁿ [SUPERSCRIPT LATIN SMALL LETTER N]
+ case '\u24DD':
+ // ⓝ [CIRCLED LATIN SMALL LETTER N]
+ case '\uFF4E': // n [FULLWIDTH LATIN SMALL LETTER N]
+ output[outputPos++] = 'n';
+ break;
+
+ case '\u01CA': // ÇŠ [LATIN CAPITAL LETTER NJ]
+ output[outputPos++] = 'N';
+ output[outputPos++] = 'J';
+ break;
+
+ case '\u01CB': // Ç‹ [LATIN CAPITAL LETTER N WITH SMALL LETTER J]
+ output[outputPos++] = 'N';
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u24A9': // â’© [PARENTHESIZED LATIN SMALL LETTER N]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'n';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u01CC': // nj [LATIN SMALL LETTER NJ]
+ output[outputPos++] = 'n';
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u00D2':
+ // Ã’ [LATIN CAPITAL LETTER O WITH GRAVE]
+ case '\u00D3':
+ // Ó [LATIN CAPITAL LETTER O WITH ACUTE]
+ case '\u00D4':
+ // Ô [LATIN CAPITAL LETTER O WITH CIRCUMFLEX]
+ case '\u00D5':
+ // Õ [LATIN CAPITAL LETTER O WITH TILDE]
+ case '\u00D6':
+ // Ö [LATIN CAPITAL LETTER O WITH DIAERESIS]
+ case '\u00D8':
+ // Ø [LATIN CAPITAL LETTER O WITH STROKE]
+ case '\u014C':
+ // Ō [LATIN CAPITAL LETTER O WITH MACRON]
+ case '\u014E':
+ // ÅŽ [LATIN CAPITAL LETTER O WITH BREVE]
+ case '\u0150':
+ // Ő [LATIN CAPITAL LETTER O WITH DOUBLE ACUTE]
+ case '\u0186':
+ // Ɔ [LATIN CAPITAL LETTER OPEN O]
+ case '\u019F':
+ // ÆŸ [LATIN CAPITAL LETTER O WITH MIDDLE TILDE]
+ case '\u01A0':
+ // Æ  [LATIN CAPITAL LETTER O WITH HORN]
+ case '\u01D1':
+ // Ç‘ [LATIN CAPITAL LETTER O WITH CARON]
+ case '\u01EA':
+ // Ǫ [LATIN CAPITAL LETTER O WITH OGONEK]
+ case '\u01EC':
+ // Ǭ [LATIN CAPITAL LETTER O WITH OGONEK AND MACRON]
+ case '\u01FE':
+ // Ǿ [LATIN CAPITAL LETTER O WITH STROKE AND ACUTE]
+ case '\u020C':
+ // Ȍ [LATIN CAPITAL LETTER O WITH DOUBLE GRAVE]
+ case '\u020E':
+ // ÈŽ [LATIN CAPITAL LETTER O WITH INVERTED BREVE]
+ case '\u022A':
+ // Ȫ [LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON]
+ case '\u022C':
+ // Ȭ [LATIN CAPITAL LETTER O WITH TILDE AND MACRON]
+ case '\u022E':
+ // È® [LATIN CAPITAL LETTER O WITH DOT ABOVE]
+ case '\u0230':
+ // È° [LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON]
+ case '\u1D0F':
+ // ᴏ [LATIN LETTER SMALL CAPITAL O]
+ case '\u1D10':
+ // ᴐ [LATIN LETTER SMALL CAPITAL OPEN O]
+ case '\u1E4C':
+ // Ṍ [LATIN CAPITAL LETTER O WITH TILDE AND ACUTE]
+ case '\u1E4E':
+ // Ṏ [LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS]
+ case '\u1E50':
+ // Ṑ [LATIN CAPITAL LETTER O WITH MACRON AND GRAVE]
+ case '\u1E52':
+ // á¹’ [LATIN CAPITAL LETTER O WITH MACRON AND ACUTE]
+ case '\u1ECC':
+ // Ọ [LATIN CAPITAL LETTER O WITH DOT BELOW]
+ case '\u1ECE':
+ // Ỏ [LATIN CAPITAL LETTER O WITH HOOK ABOVE]
+ case '\u1ED0':
+ // Ố [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE]
+ case '\u1ED2':
+ // á»’ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE]
+ case '\u1ED4':
+ // Ổ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE]
+ case '\u1ED6':
+ // á»– [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE]
+ case '\u1ED8':
+ // Ộ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW]
+ case '\u1EDA':
+ // Ớ [LATIN CAPITAL LETTER O WITH HORN AND ACUTE]
+ case '\u1EDC':
+ // Ờ [LATIN CAPITAL LETTER O WITH HORN AND GRAVE]
+ case '\u1EDE':
+ // Ở [LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE]
+ case '\u1EE0':
+ // á»  [LATIN CAPITAL LETTER O WITH HORN AND TILDE]
+ case '\u1EE2':
+ // Ợ [LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW]
+ case '\u24C4':
+ // â“„ [CIRCLED LATIN CAPITAL LETTER O]
+ case '\uA74A':
+ // Ꝋ [LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY]
+ case '\uA74C':
+ // Ꝍ [LATIN CAPITAL LETTER O WITH LOOP]
+ case '\uFF2F': // O [FULLWIDTH LATIN CAPITAL LETTER O]
+ output[outputPos++] = 'O';
+ break;
+
+ case '\u00F2':
+ // ò [LATIN SMALL LETTER O WITH GRAVE]
+ case '\u00F3':
+ // ó [LATIN SMALL LETTER O WITH ACUTE]
+ case '\u00F4':
+ // ô [LATIN SMALL LETTER O WITH CIRCUMFLEX]
+ case '\u00F5':
+ // õ [LATIN SMALL LETTER O WITH TILDE]
+ case '\u00F6':
+ // ö [LATIN SMALL LETTER O WITH DIAERESIS]
+ case '\u00F8':
+ // ø [LATIN SMALL LETTER O WITH STROKE]
+ case '\u014D':
+ // ō [LATIN SMALL LETTER O WITH MACRON]
+ case '\u014F':
+ // ŏ [LATIN SMALL LETTER O WITH BREVE]
+ case '\u0151':
+ // Å‘ [LATIN SMALL LETTER O WITH DOUBLE ACUTE]
+ case '\u01A1':
+ // Æ¡ [LATIN SMALL LETTER O WITH HORN]
+ case '\u01D2':
+ // Ç’ [LATIN SMALL LETTER O WITH CARON]
+ case '\u01EB':
+ // Ç« [LATIN SMALL LETTER O WITH OGONEK]
+ case '\u01ED':
+ // Ç­ [LATIN SMALL LETTER O WITH OGONEK AND MACRON]
+ case '\u01FF':
+ // Ç¿ [LATIN SMALL LETTER O WITH STROKE AND ACUTE]
+ case '\u020D':
+ // ȍ [LATIN SMALL LETTER O WITH DOUBLE GRAVE]
+ case '\u020F':
+ // ȏ [LATIN SMALL LETTER O WITH INVERTED BREVE]
+ case '\u022B':
+ // È« [LATIN SMALL LETTER O WITH DIAERESIS AND MACRON]
+ case '\u022D':
+ // È­ [LATIN SMALL LETTER O WITH TILDE AND MACRON]
+ case '\u022F':
+ // ȯ [LATIN SMALL LETTER O WITH DOT ABOVE]
+ case '\u0231':
+ // ȱ [LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON]
+ case '\u0254':
+ // ɔ [LATIN SMALL LETTER OPEN O]
+ case '\u0275':
+ // ɵ [LATIN SMALL LETTER BARRED O]
+ case '\u1D16':
+ // á´– [LATIN SMALL LETTER TOP HALF O]
+ case '\u1D17':
+ // á´— [LATIN SMALL LETTER BOTTOM HALF O]
+ case '\u1D97':
+ // ᶗ [LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK]
+ case '\u1E4D':
+ // ṍ [LATIN SMALL LETTER O WITH TILDE AND ACUTE]
+ case '\u1E4F':
+ // ṏ [LATIN SMALL LETTER O WITH TILDE AND DIAERESIS]
+ case '\u1E51':
+ // ṑ [LATIN SMALL LETTER O WITH MACRON AND GRAVE]
+ case '\u1E53':
+ // ṓ [LATIN SMALL LETTER O WITH MACRON AND ACUTE]
+ case '\u1ECD':
+ // ọ [LATIN SMALL LETTER O WITH DOT BELOW]
+ case '\u1ECF':
+ // ỏ [LATIN SMALL LETTER O WITH HOOK ABOVE]
+ case '\u1ED1':
+ // ố [LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE]
+ case '\u1ED3':
+ // ồ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE]
+ case '\u1ED5':
+ // ổ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE]
+ case '\u1ED7':
+ // á»— [LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE]
+ case '\u1ED9':
+ // á»™ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW]
+ case '\u1EDB':
+ // á»› [LATIN SMALL LETTER O WITH HORN AND ACUTE]
+ case '\u1EDD':
+ // ờ [LATIN SMALL LETTER O WITH HORN AND GRAVE]
+ case '\u1EDF':
+ // ở [LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE]
+ case '\u1EE1':
+ // ỡ [LATIN SMALL LETTER O WITH HORN AND TILDE]
+ case '\u1EE3':
+ // ợ [LATIN SMALL LETTER O WITH HORN AND DOT BELOW]
+ case '\u2092':
+ // â‚’ [LATIN SUBSCRIPT SMALL LETTER O]
+ case '\u24DE':
+ // â“ž [CIRCLED LATIN SMALL LETTER O]
+ case '\u2C7A':
+ // ⱺ [LATIN SMALL LETTER O WITH LOW RING INSIDE]
+ case '\uA74B':
+ // ꝋ [LATIN SMALL LETTER O WITH LONG STROKE OVERLAY]
+ case '\uA74D':
+ // ꝍ [LATIN SMALL LETTER O WITH LOOP]
+ case '\uFF4F': // ｏ [FULLWIDTH LATIN SMALL LETTER O]
+ output[outputPos++] = 'o';
+ break;
+
+ case '\u0152':
+ // Å’ [LATIN CAPITAL LIGATURE OE]
+ case '\u0276': // ɶ [LATIN LETTER SMALL CAPITAL OE]
+ output[outputPos++] = 'O';
+ output[outputPos++] = 'E';
+ break;
+
+ case '\uA74E': // Ꝏ [LATIN CAPITAL LETTER OO]
+ output[outputPos++] = 'O';
+ output[outputPos++] = 'O';
+ break;
+
+ case '\u0222':
+ // Ȣ http://en.wikipedia.org/wiki/OU [LATIN CAPITAL LETTER OU]
+ case '\u1D15': // á´• [LATIN LETTER SMALL CAPITAL OU]
+ output[outputPos++] = 'O';
+ output[outputPos++] = 'U';
+ break;
+
+ case '\u24AA': // â’ª [PARENTHESIZED LATIN SMALL LETTER O]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'o';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0153':
+ // Å“ [LATIN SMALL LIGATURE OE]
+ case '\u1D14': // ᴔ [LATIN SMALL LETTER TURNED OE]
+ output[outputPos++] = 'o';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\uA74F': // ꝏ [LATIN SMALL LETTER OO]
+ output[outputPos++] = 'o';
+ output[outputPos++] = 'o';
+ break;
+
+ case '\u0223': // ȣ http://en.wikipedia.org/wiki/OU [LATIN SMALL LETTER OU]
+ output[outputPos++] = 'o';
+ output[outputPos++] = 'u';
+ break;
+
+ case '\u01A4':
+ // Ƥ [LATIN CAPITAL LETTER P WITH HOOK]
+ case '\u1D18':
+ // á´˜ [LATIN LETTER SMALL CAPITAL P]
+ case '\u1E54':
+ // Ṕ [LATIN CAPITAL LETTER P WITH ACUTE]
+ case '\u1E56':
+ // á¹– [LATIN CAPITAL LETTER P WITH DOT ABOVE]
+ case '\u24C5':
+ // â“… [CIRCLED LATIN CAPITAL LETTER P]
+ case '\u2C63':
+ // â±£ [LATIN CAPITAL LETTER P WITH STROKE]
+ case '\uA750':
+ // Ꝑ [LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER]
+ case '\uA752':
+ // Ꝓ [LATIN CAPITAL LETTER P WITH FLOURISH]
+ case '\uA754':
+ // Ꝕ [LATIN CAPITAL LETTER P WITH SQUIRREL TAIL]
+ case '\uFF30': // ï¼° [FULLWIDTH LATIN CAPITAL LETTER P]
+ output[outputPos++] = 'P';
+ break;
+
+ case '\u01A5':
+ // ƥ [LATIN SMALL LETTER P WITH HOOK]
+ case '\u1D71':
+ // áµ± [LATIN SMALL LETTER P WITH MIDDLE TILDE]
+ case '\u1D7D':
+ // áµ½ [LATIN SMALL LETTER P WITH STROKE]
+ case '\u1D88':
+ // ᶈ [LATIN SMALL LETTER P WITH PALATAL HOOK]
+ case '\u1E55':
+ // ṕ [LATIN SMALL LETTER P WITH ACUTE]
+ case '\u1E57':
+ // á¹— [LATIN SMALL LETTER P WITH DOT ABOVE]
+ case '\u24DF':
+ // â“Ÿ [CIRCLED LATIN SMALL LETTER P]
+ case '\uA751':
+ // ꝑ [LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER]
+ case '\uA753':
+ // ꝓ [LATIN SMALL LETTER P WITH FLOURISH]
+ case '\uA755':
+ // ꝕ [LATIN SMALL LETTER P WITH SQUIRREL TAIL]
+ case '\uA7FC':
+ // ꟼ [LATIN EPIGRAPHIC LETTER REVERSED P]
+ case '\uFF50': // ｐ [FULLWIDTH LATIN SMALL LETTER P]
+ output[outputPos++] = 'p';
+ break;
+
+ case '\u24AB': // â’« [PARENTHESIZED LATIN SMALL LETTER P]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'p';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u024A':
+ // ÉŠ [LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL]
+ case '\u24C6':
+ // Ⓠ [CIRCLED LATIN CAPITAL LETTER Q]
+ case '\uA756':
+ // Ꝗ [LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER]
+ case '\uA758':
+ // Ꝙ [LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE]
+ case '\uFF31': // ï¼± [FULLWIDTH LATIN CAPITAL LETTER Q]
+ output[outputPos++] = 'Q';
+ break;
+
+ case '\u0138':
+ // ĸ http://en.wikipedia.org/wiki/Kra_(letter) [LATIN SMALL LETTER KRA]
+ case '\u024B':
+ // É‹ [LATIN SMALL LETTER Q WITH HOOK TAIL]
+ case '\u02A0':
+ // Ê  [LATIN SMALL LETTER Q WITH HOOK]
+ case '\u24E0':
+ // â“  [CIRCLED LATIN SMALL LETTER Q]
+ case '\uA757':
+ // ꝗ [LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER]
+ case '\uA759':
+ // ꝙ [LATIN SMALL LETTER Q WITH DIAGONAL STROKE]
+ case '\uFF51': // q [FULLWIDTH LATIN SMALL LETTER Q]
+ output[outputPos++] = 'q';
+ break;
+
+ case '\u24AC': // â’¬ [PARENTHESIZED LATIN SMALL LETTER Q]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'q';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0239': // ȹ [LATIN SMALL LETTER QP DIGRAPH]
+ output[outputPos++] = 'q';
+ output[outputPos++] = 'p';
+ break;
+
+ case '\u0154':
+ // Ŕ [LATIN CAPITAL LETTER R WITH ACUTE]
+ case '\u0156':
+ // Å– [LATIN CAPITAL LETTER R WITH CEDILLA]
+ case '\u0158':
+ // Ř [LATIN CAPITAL LETTER R WITH CARON]
+ case '\u0210':
+ // È’ [LATIN CAPITAL LETTER R WITH DOUBLE GRAVE]
+ case '\u0212':
+ // È’ [LATIN CAPITAL LETTER R WITH INVERTED BREVE]
+ case '\u024C':
+ // Ɍ [LATIN CAPITAL LETTER R WITH STROKE]
+ case '\u0280':
+ // ʀ [LATIN LETTER SMALL CAPITAL R]
+ case '\u0281':
+ // ʁ [LATIN LETTER SMALL CAPITAL INVERTED R]
+ case '\u1D19':
+ // á´™ [LATIN LETTER SMALL CAPITAL REVERSED R]
+ case '\u1D1A':
+ // á´š [LATIN LETTER SMALL CAPITAL TURNED R]
+ case '\u1E58':
+ // Ṙ [LATIN CAPITAL LETTER R WITH DOT ABOVE]
+ case '\u1E5A':
+ // Ṛ [LATIN CAPITAL LETTER R WITH DOT BELOW]
+ case '\u1E5C':
+ // Ṝ [LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON]
+ case '\u1E5E':
+ // Ṟ [LATIN CAPITAL LETTER R WITH LINE BELOW]
+ case '\u24C7':
+ // Ⓡ [CIRCLED LATIN CAPITAL LETTER R]
+ case '\u2C64':
+ // Ɽ [LATIN CAPITAL LETTER R WITH TAIL]
+ case '\uA75A':
+ // Ꝛ [LATIN CAPITAL LETTER R ROTUNDA]
+ case '\uA782':
+ // êž‚ [LATIN CAPITAL LETTER INSULAR R]
+ case '\uFF32': // ï¼² [FULLWIDTH LATIN CAPITAL LETTER R]
+ output[outputPos++] = 'R';
+ break;
+
+ case '\u0155':
+ // Å• [LATIN SMALL LETTER R WITH ACUTE]
+ case '\u0157':
+ // Å— [LATIN SMALL LETTER R WITH CEDILLA]
+ case '\u0159':
+ // ř [LATIN SMALL LETTER R WITH CARON]
+ case '\u0211':
+ // È‘ [LATIN SMALL LETTER R WITH DOUBLE GRAVE]
+ case '\u0213':
+ // È“ [LATIN SMALL LETTER R WITH INVERTED BREVE]
+ case '\u024D':
+ // ɍ [LATIN SMALL LETTER R WITH STROKE]
+ case '\u027C':
+ // ɼ [LATIN SMALL LETTER R WITH LONG LEG]
+ case '\u027D':
+ // ɽ [LATIN SMALL LETTER R WITH TAIL]
+ case '\u027E':
+ // ɾ [LATIN SMALL LETTER R WITH FISHHOOK]
+ case '\u027F':
+ // É¿ [LATIN SMALL LETTER REVERSED R WITH FISHHOOK]
+ case '\u1D63':
+ // áµ£ [LATIN SUBSCRIPT SMALL LETTER R]
+ case '\u1D72':
+ // áµ² [LATIN SMALL LETTER R WITH MIDDLE TILDE]
+ case '\u1D73':
+ // áµ³ [LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE]
+ case '\u1D89':
+ // ᶉ [LATIN SMALL LETTER R WITH PALATAL HOOK]
+ case '\u1E59':
+ // á¹™ [LATIN SMALL LETTER R WITH DOT ABOVE]
+ case '\u1E5B':
+ // á¹› [LATIN SMALL LETTER R WITH DOT BELOW]
+ case '\u1E5D':
+ // ṝ [LATIN SMALL LETTER R WITH DOT BELOW AND MACRON]
+ case '\u1E5F':
+ // ṟ [LATIN SMALL LETTER R WITH LINE BELOW]
+ case '\u24E1':
+ // â“¡ [CIRCLED LATIN SMALL LETTER R]
+ case '\uA75B':
+ // ꝛ [LATIN SMALL LETTER R ROTUNDA]
+ case '\uA783':
+ // ꞃ [LATIN SMALL LETTER INSULAR R]
+ case '\uFF52': // ï½’ [FULLWIDTH LATIN SMALL LETTER R]
+ output[outputPos++] = 'r';
+ break;
+
+ case '\u24AD': // â’­ [PARENTHESIZED LATIN SMALL LETTER R]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'r';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u015A':
+ // Ś [LATIN CAPITAL LETTER S WITH ACUTE]
+ case '\u015C':
+ // Ŝ [LATIN CAPITAL LETTER S WITH CIRCUMFLEX]
+ case '\u015E':
+ // Åž [LATIN CAPITAL LETTER S WITH CEDILLA]
+ case '\u0160':
+ // Å  [LATIN CAPITAL LETTER S WITH CARON]
+ case '\u0218':
+ // Ș [LATIN CAPITAL LETTER S WITH COMMA BELOW]
+ case '\u1E60':
+ // á¹  [LATIN CAPITAL LETTER S WITH DOT ABOVE]
+ case '\u1E62':
+ // á¹¢ [LATIN CAPITAL LETTER S WITH DOT BELOW]
+ case '\u1E64':
+ // Ṥ [LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE]
+ case '\u1E66':
+ // Ṧ [LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE]
+ case '\u1E68':
+ // Ṩ [LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE]
+ case '\u24C8':
+ // Ⓢ [CIRCLED LATIN CAPITAL LETTER S]
+ case '\uA731':
+ // ꜱ [LATIN LETTER SMALL CAPITAL S]
+ case '\uA785':
+ // êž… [LATIN SMALL LETTER INSULAR S]
+ case '\uFF33': // ï¼³ [FULLWIDTH LATIN CAPITAL LETTER S]
+ output[outputPos++] = 'S';
+ break;
+
+ case '\u015B':
+ // Å› [LATIN SMALL LETTER S WITH ACUTE]
+ case '\u015D':
+ // ŝ [LATIN SMALL LETTER S WITH CIRCUMFLEX]
+ case '\u015F':
+ // ÅŸ [LATIN SMALL LETTER S WITH CEDILLA]
+ case '\u0161':
+ // Å¡ [LATIN SMALL LETTER S WITH CARON]
+ case '\u017F':
+ // Å¿ http://en.wikipedia.org/wiki/Long_S [LATIN SMALL LETTER LONG S]
+ case '\u0219':
+ // È™ [LATIN SMALL LETTER S WITH COMMA BELOW]
+ case '\u023F':
+ // È¿ [LATIN SMALL LETTER S WITH SWASH TAIL]
+ case '\u0282':
+ // Ê‚ [LATIN SMALL LETTER S WITH HOOK]
+ case '\u1D74':
+ // áµ´ [LATIN SMALL LETTER S WITH MIDDLE TILDE]
+ case '\u1D8A':
+ // ᶊ [LATIN SMALL LETTER S WITH PALATAL HOOK]
+ case '\u1E61':
+ // ṡ [LATIN SMALL LETTER S WITH DOT ABOVE]
+ case '\u1E63':
+ // á¹£ [LATIN SMALL LETTER S WITH DOT BELOW]
+ case '\u1E65':
+ // á¹¥ [LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE]
+ case '\u1E67':
+ // ṧ [LATIN SMALL LETTER S WITH CARON AND DOT ABOVE]
+ case '\u1E69':
+ // ṩ [LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE]
+ case '\u1E9C':
+ // ẜ [LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE]
+ case '\u1E9D':
+ // ẝ [LATIN SMALL LETTER LONG S WITH HIGH STROKE]
+ case '\u24E2':
+ // â“¢ [CIRCLED LATIN SMALL LETTER S]
+ case '\uA784':
+ // êž„ [LATIN CAPITAL LETTER INSULAR S]
+ case '\uFF53': // s [FULLWIDTH LATIN SMALL LETTER S]
+ output[outputPos++] = 's';
+ break;
+
+ case '\u1E9E': // ẞ [LATIN CAPITAL LETTER SHARP S]
+ output[outputPos++] = 'S';
+ output[outputPos++] = 'S';
+ break;
+
+ case '\u24AE': // â’® [PARENTHESIZED LATIN SMALL LETTER S]
+ output[outputPos++] = '(';
+ output[outputPos++] = 's';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00DF': // ß [LATIN SMALL LETTER SHARP S]
+ output[outputPos++] = 's';
+ output[outputPos++] = 's';
+ break;
+
+ case '\uFB06': // st [LATIN SMALL LIGATURE ST]
+ output[outputPos++] = 's';
+ output[outputPos++] = 't';
+ break;
+
+ case '\u0162':
+ // Ţ [LATIN CAPITAL LETTER T WITH CEDILLA]
+ case '\u0164':
+ // Ť [LATIN CAPITAL LETTER T WITH CARON]
+ case '\u0166':
+ // Ŧ [LATIN CAPITAL LETTER T WITH STROKE]
+ case '\u01AC':
+ // Ƭ [LATIN CAPITAL LETTER T WITH HOOK]
+ case '\u01AE':
+ // Æ® [LATIN CAPITAL LETTER T WITH RETROFLEX HOOK]
+ case '\u021A':
+ // Èš [LATIN CAPITAL LETTER T WITH COMMA BELOW]
+ case '\u023E':
+ // Ⱦ [LATIN CAPITAL LETTER T WITH DIAGONAL STROKE]
+ case '\u1D1B':
+ // á´› [LATIN LETTER SMALL CAPITAL T]
+ case '\u1E6A':
+ // Ṫ [LATIN CAPITAL LETTER T WITH DOT ABOVE]
+ case '\u1E6C':
+ // Ṭ [LATIN CAPITAL LETTER T WITH DOT BELOW]
+ case '\u1E6E':
+ // á¹® [LATIN CAPITAL LETTER T WITH LINE BELOW]
+ case '\u1E70':
+ // á¹° [LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW]
+ case '\u24C9':
+ // Ⓣ [CIRCLED LATIN CAPITAL LETTER T]
+ case '\uA786':
+ // Ꞇ [LATIN CAPITAL LETTER INSULAR T]
+ case '\uFF34': // ï¼´ [FULLWIDTH LATIN CAPITAL LETTER T]
+ output[outputPos++] = 'T';
+ break;
+
+ case '\u0163':
+ // ţ [LATIN SMALL LETTER T WITH CEDILLA]
+ case '\u0165':
+ // Ã…Â¥ [LATIN SMALL LETTER T WITH CARON]
+ case '\u0167':
+ // ŧ [LATIN SMALL LETTER T WITH STROKE]
+ case '\u01AB':
+ // Æ« [LATIN SMALL LETTER T WITH PALATAL HOOK]
+ case '\u01AD':
+ // Æ­ [LATIN SMALL LETTER T WITH HOOK]
+ case '\u021B':
+ // È› [LATIN SMALL LETTER T WITH COMMA BELOW]
+ case '\u0236':
+ // ȶ [LATIN SMALL LETTER T WITH CURL]
+ case '\u0287':
+ // ʇ [LATIN SMALL LETTER TURNED T]
+ case '\u0288':
+ // ʈ [LATIN SMALL LETTER T WITH RETROFLEX HOOK]
+ case '\u1D75':
+ // áµµ [LATIN SMALL LETTER T WITH MIDDLE TILDE]
+ case '\u1E6B':
+ // ṫ [LATIN SMALL LETTER T WITH DOT ABOVE]
+ case '\u1E6D':
+ // á¹­ [LATIN SMALL LETTER T WITH DOT BELOW]
+ case '\u1E6F':
+ // ṯ [LATIN SMALL LETTER T WITH LINE BELOW]
+ case '\u1E71':
+ // á¹± [LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW]
+ case '\u1E97':
+ // ẗ [LATIN SMALL LETTER T WITH DIAERESIS]
+ case '\u24E3':
+ // â“£ [CIRCLED LATIN SMALL LETTER T]
+ case '\u2C66':
+ // ⱦ [LATIN SMALL LETTER T WITH DIAGONAL STROKE]
+ case '\uFF54': // �? [FULLWIDTH LATIN SMALL LETTER T]
+ output[outputPos++] = 't';
+ break;
+
+ case '\u00DE':
+ // Þ [LATIN CAPITAL LETTER THORN]
+ case '\uA766': // � [LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER]
+ output[outputPos++] = 'T';
+ output[outputPos++] = 'H';
+ break;
+
+ case '\uA728': // Ꜩ [LATIN CAPITAL LETTER TZ]
+ output[outputPos++] = 'T';
+ output[outputPos++] = 'Z';
+ break;
+
+ case '\u24AF': // â’¯ [PARENTHESIZED LATIN SMALL LETTER T]
+ output[outputPos++] = '(';
+ output[outputPos++] = 't';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u02A8': // ʨ [LATIN SMALL LETTER TC DIGRAPH WITH CURL]
+ output[outputPos++] = 't';
+ output[outputPos++] = 'c';
+ break;
+
+ case '\u00FE':
+ // þ [LATIN SMALL LETTER THORN]
+ case '\u1D7A':
+ // ᵺ [LATIN SMALL LETTER TH WITH STRIKETHROUGH]
+ case '\uA767': // � [LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER]
+ output[outputPos++] = 't';
+ output[outputPos++] = 'h';
+ break;
+
+ case '\u02A6': // ʦ [LATIN SMALL LETTER TS DIGRAPH]
+ output[outputPos++] = 't';
+ output[outputPos++] = 's';
+ break;
+
+ case '\uA729': // ꜩ [LATIN SMALL LETTER TZ]
+ output[outputPos++] = 't';
+ output[outputPos++] = 'z';
+ break;
+
+ case '\u00D9':
+ // Ù [LATIN CAPITAL LETTER U WITH GRAVE]
+ case '\u00DA':
+ // Ú [LATIN CAPITAL LETTER U WITH ACUTE]
+ case '\u00DB':
+ // Û [LATIN CAPITAL LETTER U WITH CIRCUMFLEX]
+ case '\u00DC':
+ // Ü [LATIN CAPITAL LETTER U WITH DIAERESIS]
+ case '\u0168':
+ // Ũ [LATIN CAPITAL LETTER U WITH TILDE]
+ case '\u016A':
+ // Ū [LATIN CAPITAL LETTER U WITH MACRON]
+ case '\u016C':
+ // Ŭ [LATIN CAPITAL LETTER U WITH BREVE]
+ case '\u016E':
+ // Å® [LATIN CAPITAL LETTER U WITH RING ABOVE]
+ case '\u0170':
+ // Å° [LATIN CAPITAL LETTER U WITH DOUBLE ACUTE]
+ case '\u0172':
+ // Ų [LATIN CAPITAL LETTER U WITH OGONEK]
+ case '\u01AF':
+ // Ư [LATIN CAPITAL LETTER U WITH HORN]
+ case '\u01D3':
+ // Ç“ [LATIN CAPITAL LETTER U WITH CARON]
+ case '\u01D5':
+ // Ç• [LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON]
+ case '\u01D7':
+ // Ç— [LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE]
+ case '\u01D9':
+ // Ç™ [LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON]
+ case '\u01DB':
+ // Ç› [LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE]
+ case '\u0214':
+ // �? [LATIN CAPITAL LETTER U WITH DOUBLE GRAVE]
+ case '\u0216':
+ // È– [LATIN CAPITAL LETTER U WITH INVERTED BREVE]
+ case '\u0244':
+ // É„ [LATIN CAPITAL LETTER U BAR]
+ case '\u1D1C':
+ // ᴜ [LATIN LETTER SMALL CAPITAL U]
+ case '\u1D7E':
+ // áµ¾ [LATIN SMALL CAPITAL LETTER U WITH STROKE]
+ case '\u1E72':
+ // á¹² [LATIN CAPITAL LETTER U WITH DIAERESIS BELOW]
+ case '\u1E74':
+ // á¹´ [LATIN CAPITAL LETTER U WITH TILDE BELOW]
+ case '\u1E76':
+ // Ṷ [LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW]
+ case '\u1E78':
+ // Ṹ [LATIN CAPITAL LETTER U WITH TILDE AND ACUTE]
+ case '\u1E7A':
+ // Ṻ [LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS]
+ case '\u1EE4':
+ // Ụ [LATIN CAPITAL LETTER U WITH DOT BELOW]
+ case '\u1EE6':
+ // Ủ [LATIN CAPITAL LETTER U WITH HOOK ABOVE]
+ case '\u1EE8':
+ // Ứ [LATIN CAPITAL LETTER U WITH HORN AND ACUTE]
+ case '\u1EEA':
+ // Ừ [LATIN CAPITAL LETTER U WITH HORN AND GRAVE]
+ case '\u1EEC':
+ // Ử [LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE]
+ case '\u1EEE':
+ // á»® [LATIN CAPITAL LETTER U WITH HORN AND TILDE]
+ case '\u1EF0':
+ // á»° [LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW]
+ case '\u24CA':
+ // â“Š [CIRCLED LATIN CAPITAL LETTER U]
+ case '\uFF35': // ï¼µ [FULLWIDTH LATIN CAPITAL LETTER U]
+ output[outputPos++] = 'U';
+ break;
+
+ case '\u00F9':
+ // ù [LATIN SMALL LETTER U WITH GRAVE]
+ case '\u00FA':
+ // ú [LATIN SMALL LETTER U WITH ACUTE]
+ case '\u00FB':
+ // û [LATIN SMALL LETTER U WITH CIRCUMFLEX]
+ case '\u00FC':
+ // ü [LATIN SMALL LETTER U WITH DIAERESIS]
+ case '\u0169':
+ // Å© [LATIN SMALL LETTER U WITH TILDE]
+ case '\u016B':
+ // Å« [LATIN SMALL LETTER U WITH MACRON]
+ case '\u016D':
+ // Å­ [LATIN SMALL LETTER U WITH BREVE]
+ case '\u016F':
+ // ů [LATIN SMALL LETTER U WITH RING ABOVE]
+ case '\u0171':
+ // ű [LATIN SMALL LETTER U WITH DOUBLE ACUTE]
+ case '\u0173':
+ // ų [LATIN SMALL LETTER U WITH OGONEK]
+ case '\u01B0':
+ // Æ° [LATIN SMALL LETTER U WITH HORN]
+ case '\u01D4':
+ // �? [LATIN SMALL LETTER U WITH CARON]
+ case '\u01D6':
+ // Ç– [LATIN SMALL LETTER U WITH DIAERESIS AND MACRON]
+ case '\u01D8':
+ // ǘ [LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE]
+ case '\u01DA':
+ // Çš [LATIN SMALL LETTER U WITH DIAERESIS AND CARON]
+ case '\u01DC':
+ // ǜ [LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE]
+ case '\u0215':
+ // È• [LATIN SMALL LETTER U WITH DOUBLE GRAVE]
+ case '\u0217':
+ // È— [LATIN SMALL LETTER U WITH INVERTED BREVE]
+ case '\u0289':
+ // ʉ [LATIN SMALL LETTER U BAR]
+ case '\u1D64':
+ // ᵤ [LATIN SUBSCRIPT SMALL LETTER U]
+ case '\u1D99':
+ // ᶙ [LATIN SMALL LETTER U WITH RETROFLEX HOOK]
+ case '\u1E73':
+ // á¹³ [LATIN SMALL LETTER U WITH DIAERESIS BELOW]
+ case '\u1E75':
+ // á¹µ [LATIN SMALL LETTER U WITH TILDE BELOW]
+ case '\u1E77':
+ // á¹· [LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW]
+ case '\u1E79':
+ // á¹¹ [LATIN SMALL LETTER U WITH TILDE AND ACUTE]
+ case '\u1E7B':
+ // á¹» [LATIN SMALL LETTER U WITH MACRON AND DIAERESIS]
+ case '\u1EE5':
+ // ụ [LATIN SMALL LETTER U WITH DOT BELOW]
+ case '\u1EE7':
+ // ủ [LATIN SMALL LETTER U WITH HOOK ABOVE]
+ case '\u1EE9':
+ // ứ [LATIN SMALL LETTER U WITH HORN AND ACUTE]
+ case '\u1EEB':
+ // ừ [LATIN SMALL LETTER U WITH HORN AND GRAVE]
+ case '\u1EED':
+ // á»­ [LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE]
+ case '\u1EEF':
+ // ữ [LATIN SMALL LETTER U WITH HORN AND TILDE]
+ case '\u1EF1':
+ // á»± [LATIN SMALL LETTER U WITH HORN AND DOT BELOW]
+ case '\u24E4':
+ // ⓤ [CIRCLED LATIN SMALL LETTER U]
+ case '\uFF55': // u [FULLWIDTH LATIN SMALL LETTER U]
+ output[outputPos++] = 'u';
+ break;
+
+ case '\u24B0': // â’° [PARENTHESIZED LATIN SMALL LETTER U]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'u';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u1D6B': // ᵫ [LATIN SMALL LETTER UE]
+ output[outputPos++] = 'u';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u01B2':
+ // Ʋ [LATIN CAPITAL LETTER V WITH HOOK]
+ case '\u0245':
+ // É… [LATIN CAPITAL LETTER TURNED V]
+ case '\u1D20':
+ // á´  [LATIN LETTER SMALL CAPITAL V]
+ case '\u1E7C':
+ // á¹¼ [LATIN CAPITAL LETTER V WITH TILDE]
+ case '\u1E7E':
+ // á¹¾ [LATIN CAPITAL LETTER V WITH DOT BELOW]
+ case '\u1EFC':
+ // Ỽ [LATIN CAPITAL LETTER MIDDLE-WELSH V]
+ case '\u24CB':
+ // â“‹ [CIRCLED LATIN CAPITAL LETTER V]
+ case '\uA75E':
+ // � [LATIN CAPITAL LETTER V WITH DIAGONAL STROKE]
+ case '\uA768':
+ // � [LATIN CAPITAL LETTER VEND]
+ case '\uFF36': // V [FULLWIDTH LATIN CAPITAL LETTER V]
+ output[outputPos++] = 'V';
+ break;
+
+ case '\u028B':
+ // Ê‹ [LATIN SMALL LETTER V WITH HOOK]
+ case '\u028C':
+ // ʌ [LATIN SMALL LETTER TURNED V]
+ case '\u1D65':
+ // áµ¥ [LATIN SUBSCRIPT SMALL LETTER V]
+ case '\u1D8C':
+ // ᶌ [LATIN SMALL LETTER V WITH PALATAL HOOK]
+ case '\u1E7D':
+ // á¹½ [LATIN SMALL LETTER V WITH TILDE]
+ case '\u1E7F':
+ // ṿ [LATIN SMALL LETTER V WITH DOT BELOW]
+ case '\u24E5':
+ // â“¥ [CIRCLED LATIN SMALL LETTER V]
+ case '\u2C71':
+ // â±± [LATIN SMALL LETTER V WITH RIGHT HOOK]
+ case '\u2C74':
+ // â±´ [LATIN SMALL LETTER V WITH CURL]
+ case '\uA75F':
+ // � [LATIN SMALL LETTER V WITH DIAGONAL STROKE]
+ case '\uFF56': // ï½– [FULLWIDTH LATIN SMALL LETTER V]
+ output[outputPos++] = 'v';
+ break;
+
+ case '\uA760': // � [LATIN CAPITAL LETTER VY]
+ output[outputPos++] = 'V';
+ output[outputPos++] = 'Y';
+ break;
+
+ case '\u24B1': // â’± [PARENTHESIZED LATIN SMALL LETTER V]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'v';
+ output[outputPos++] = ')';
+ break;
+
+ case '\uA761': // � [LATIN SMALL LETTER VY]
+ output[outputPos++] = 'v';
+ output[outputPos++] = 'y';
+ break;
+
+ case '\u0174':
+ // Å´ [LATIN CAPITAL LETTER W WITH CIRCUMFLEX]
+ case '\u01F7':
+ // Ç· http://en.wikipedia.org/wiki/Wynn [LATIN CAPITAL LETTER WYNN]
+ case '\u1D21':
+ // á´¡ [LATIN LETTER SMALL CAPITAL W]
+ case '\u1E80':
+ // Ẁ [LATIN CAPITAL LETTER W WITH GRAVE]
+ case '\u1E82':
+ // Ẃ [LATIN CAPITAL LETTER W WITH ACUTE]
+ case '\u1E84':
+ // Ẅ [LATIN CAPITAL LETTER W WITH DIAERESIS]
+ case '\u1E86':
+ // Ẇ [LATIN CAPITAL LETTER W WITH DOT ABOVE]
+ case '\u1E88':
+ // Ẉ [LATIN CAPITAL LETTER W WITH DOT BELOW]
+ case '\u24CC':
+ // Ⓦ [CIRCLED LATIN CAPITAL LETTER W]
+ case '\u2C72':
+ // â±² [LATIN CAPITAL LETTER W WITH HOOK]
+ case '\uFF37': // ï¼· [FULLWIDTH LATIN CAPITAL LETTER W]
+ output[outputPos++] = 'W';
+ break;
+
+ case '\u0175':
+ // ŵ [LATIN SMALL LETTER W WITH CIRCUMFLEX]
+ case '\u01BF':
+ // Æ¿ http://en.wikipedia.org/wiki/Wynn [LATIN LETTER WYNN]
+ case '\u028D':
+ // � [LATIN SMALL LETTER TURNED W]
+ case '\u1E81':
+ // � [LATIN SMALL LETTER W WITH GRAVE]
+ case '\u1E83':
+ // ẃ [LATIN SMALL LETTER W WITH ACUTE]
+ case '\u1E85':
+ // ẅ [LATIN SMALL LETTER W WITH DIAERESIS]
+ case '\u1E87':
+ // ẇ [LATIN SMALL LETTER W WITH DOT ABOVE]
+ case '\u1E89':
+ // ẉ [LATIN SMALL LETTER W WITH DOT BELOW]
+ case '\u1E98':
+ // ẘ [LATIN SMALL LETTER W WITH RING ABOVE]
+ case '\u24E6':
+ // ⓦ [CIRCLED LATIN SMALL LETTER W]
+ case '\u2C73':
+ // â±³ [LATIN SMALL LETTER W WITH HOOK]
+ case '\uFF57': // ï½— [FULLWIDTH LATIN SMALL LETTER W]
+ output[outputPos++] = 'w';
+ break;
+
+ case '\u24B2': // â’² [PARENTHESIZED LATIN SMALL LETTER W]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'w';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u1E8A':
+ // Ẋ [LATIN CAPITAL LETTER X WITH DOT ABOVE]
+ case '\u1E8C':
+ // Ẍ [LATIN CAPITAL LETTER X WITH DIAERESIS]
+ case '\u24CD':
+ // � [CIRCLED LATIN CAPITAL LETTER X]
+ case '\uFF38': // X [FULLWIDTH LATIN CAPITAL LETTER X]
+ output[outputPos++] = 'X';
+ break;
+
+ case '\u1D8D':
+ // � [LATIN SMALL LETTER X WITH PALATAL HOOK]
+ case '\u1E8B':
+ // ẋ [LATIN SMALL LETTER X WITH DOT ABOVE]
+ case '\u1E8D':
+ // � [LATIN SMALL LETTER X WITH DIAERESIS]
+ case '\u2093':
+ // â‚“ [LATIN SUBSCRIPT SMALL LETTER X]
+ case '\u24E7':
+ // ⓧ [CIRCLED LATIN SMALL LETTER X]
+ case '\uFF58': // x [FULLWIDTH LATIN SMALL LETTER X]
+ output[outputPos++] = 'x';
+ break;
+
+ case '\u24B3': // â’³ [PARENTHESIZED LATIN SMALL LETTER X]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'x';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00DD':
+ // � [LATIN CAPITAL LETTER Y WITH ACUTE]
+ case '\u0176':
+ // Ŷ [LATIN CAPITAL LETTER Y WITH CIRCUMFLEX]
+ case '\u0178':
+ // Ÿ [LATIN CAPITAL LETTER Y WITH DIAERESIS]
+ case '\u01B3':
+ // Ƴ [LATIN CAPITAL LETTER Y WITH HOOK]
+ case '\u0232':
+ // Ȳ [LATIN CAPITAL LETTER Y WITH MACRON]
+ case '\u024E':
+ // ÉŽ [LATIN CAPITAL LETTER Y WITH STROKE]
+ case '\u028F':
+ // � [LATIN LETTER SMALL CAPITAL Y]
+ case '\u1E8E':
+ // Ẏ [LATIN CAPITAL LETTER Y WITH DOT ABOVE]
+ case '\u1EF2':
+ // Ỳ [LATIN CAPITAL LETTER Y WITH GRAVE]
+ case '\u1EF4':
+ // á»´ [LATIN CAPITAL LETTER Y WITH DOT BELOW]
+ case '\u1EF6':
+ // Ỷ [LATIN CAPITAL LETTER Y WITH HOOK ABOVE]
+ case '\u1EF8':
+ // Ỹ [LATIN CAPITAL LETTER Y WITH TILDE]
+ case '\u1EFE':
+ // Ỿ [LATIN CAPITAL LETTER Y WITH LOOP]
+ case '\u24CE':
+ // â“Ž [CIRCLED LATIN CAPITAL LETTER Y]
+ case '\uFF39': // ï¼¹ [FULLWIDTH LATIN CAPITAL LETTER Y]
+ output[outputPos++] = 'Y';
+ break;
+
+ case '\u00FD':
+ // ý [LATIN SMALL LETTER Y WITH ACUTE]
+ case '\u00FF':
+ // ÿ [LATIN SMALL LETTER Y WITH DIAERESIS]
+ case '\u0177':
+ // Å· [LATIN SMALL LETTER Y WITH CIRCUMFLEX]
+ case '\u01B4':
+ // Æ´ [LATIN SMALL LETTER Y WITH HOOK]
+ case '\u0233':
+ // ȳ [LATIN SMALL LETTER Y WITH MACRON]
+ case '\u024F':
+ // � [LATIN SMALL LETTER Y WITH STROKE]
+ case '\u028E':
+ // ÊŽ [LATIN SMALL LETTER TURNED Y]
+ case '\u1E8F':
+ // � [LATIN SMALL LETTER Y WITH DOT ABOVE]
+ case '\u1E99':
+ // ẙ [LATIN SMALL LETTER Y WITH RING ABOVE]
+ case '\u1EF3':
+ // ỳ [LATIN SMALL LETTER Y WITH GRAVE]
+ case '\u1EF5':
+ // ỵ [LATIN SMALL LETTER Y WITH DOT BELOW]
+ case '\u1EF7':
+ // á»· [LATIN SMALL LETTER Y WITH HOOK ABOVE]
+ case '\u1EF9':
+ // ỹ [LATIN SMALL LETTER Y WITH TILDE]
+ case '\u1EFF':
+ // ỿ [LATIN SMALL LETTER Y WITH LOOP]
+ case '\u24E8':
+ // ⓨ [CIRCLED LATIN SMALL LETTER Y]
+ case '\uFF59': // ï½™ [FULLWIDTH LATIN SMALL LETTER Y]
+ output[outputPos++] = 'y';
+ break;
+
+ case '\u24B4': // â’´ [PARENTHESIZED LATIN SMALL LETTER Y]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'y';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u0179':
+ // Ź [LATIN CAPITAL LETTER Z WITH ACUTE]
+ case '\u017B':
+ // Å» [LATIN CAPITAL LETTER Z WITH DOT ABOVE]
+ case '\u017D':
+ // Ž [LATIN CAPITAL LETTER Z WITH CARON]
+ case '\u01B5':
+ // Ƶ [LATIN CAPITAL LETTER Z WITH STROKE]
+ case '\u021C':
+ // Ȝ http://en.wikipedia.org/wiki/Yogh [LATIN CAPITAL LETTER YOGH]
+ case '\u0224':
+ // Ȥ [LATIN CAPITAL LETTER Z WITH HOOK]
+ case '\u1D22':
+ // á´¢ [LATIN LETTER SMALL CAPITAL Z]
+ case '\u1E90':
+ // � [LATIN CAPITAL LETTER Z WITH CIRCUMFLEX]
+ case '\u1E92':
+ // Ẓ [LATIN CAPITAL LETTER Z WITH DOT BELOW]
+ case '\u1E94':
+ // �? [LATIN CAPITAL LETTER Z WITH LINE BELOW]
+ case '\u24CF':
+ // � [CIRCLED LATIN CAPITAL LETTER Z]
+ case '\u2C6B':
+ // Ⱬ [LATIN CAPITAL LETTER Z WITH DESCENDER]
+ case '\uA762':
+ // � [LATIN CAPITAL LETTER VISIGOTHIC Z]
+ case '\uFF3A': // Z [FULLWIDTH LATIN CAPITAL LETTER Z]
+ output[outputPos++] = 'Z';
+ break;
+
+ case '\u017A':
+ // ź [LATIN SMALL LETTER Z WITH ACUTE]
+ case '\u017C':
+ // ż [LATIN SMALL LETTER Z WITH DOT ABOVE]
+ case '\u017E':
+ // ž [LATIN SMALL LETTER Z WITH CARON]
+ case '\u01B6':
+ // ƶ [LATIN SMALL LETTER Z WITH STROKE]
+ case '\u021D':
+ // � http://en.wikipedia.org/wiki/Yogh [LATIN SMALL LETTER YOGH]
+ case '\u0225':
+ // ȥ [LATIN SMALL LETTER Z WITH HOOK]
+ case '\u0240':
+ // ɀ [LATIN SMALL LETTER Z WITH SWASH TAIL]
+ case '\u0290':
+ // � [LATIN SMALL LETTER Z WITH RETROFLEX HOOK]
+ case '\u0291':
+ // Ê‘ [LATIN SMALL LETTER Z WITH CURL]
+ case '\u1D76':
+ // ᵶ [LATIN SMALL LETTER Z WITH MIDDLE TILDE]
+ case '\u1D8E':
+ // ᶎ [LATIN SMALL LETTER Z WITH PALATAL HOOK]
+ case '\u1E91':
+ // ẑ [LATIN SMALL LETTER Z WITH CIRCUMFLEX]
+ case '\u1E93':
+ // ẓ [LATIN SMALL LETTER Z WITH DOT BELOW]
+ case '\u1E95':
+ // ẕ [LATIN SMALL LETTER Z WITH LINE BELOW]
+ case '\u24E9':
+ // â“© [CIRCLED LATIN SMALL LETTER Z]
+ case '\u2C6C':
+ // ⱬ [LATIN SMALL LETTER Z WITH DESCENDER]
+ case '\uA763':
+ // � [LATIN SMALL LETTER VISIGOTHIC Z]
+ case '\uFF5A': // z [FULLWIDTH LATIN SMALL LETTER Z]
+ output[outputPos++] = 'z';
+ break;
+
+ case '\u24B5': // â’µ [PARENTHESIZED LATIN SMALL LETTER Z]
+ output[outputPos++] = '(';
+ output[outputPos++] = 'z';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2070':
+ // � [SUPERSCRIPT ZERO]
+ case '\u2080':
+ // â‚€ [SUBSCRIPT ZERO]
+ case '\u24EA':
+ // ⓪ [CIRCLED DIGIT ZERO]
+ case '\u24FF':
+ // â“¿ [NEGATIVE CIRCLED DIGIT ZERO]
+ case '\uFF10': // � [FULLWIDTH DIGIT ZERO]
+ output[outputPos++] = '0';
+ break;
+
+ case '\u00B9':
+ // ¹ [SUPERSCRIPT ONE]
+ case '\u2081':
+ // � [SUBSCRIPT ONE]
+ case '\u2460':
+ // â‘  [CIRCLED DIGIT ONE]
+ case '\u24F5':
+ // ⓵ [DOUBLE CIRCLED DIGIT ONE]
+ case '\u2776':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT ONE]
+ case '\u2780':
+ // ➀ [DINGBAT CIRCLED SANS-SERIF DIGIT ONE]
+ case '\u278A':
+ // ➊ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE]
+ case '\uFF11': // 1 [FULLWIDTH DIGIT ONE]
+ output[outputPos++] = '1';
+ break;
+
+ case '\u2488': // â’ˆ [DIGIT ONE FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2474': // â‘´ [PARENTHESIZED DIGIT ONE]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00B2':
+ // ² [SUPERSCRIPT TWO]
+ case '\u2082':
+ // â‚‚ [SUBSCRIPT TWO]
+ case '\u2461':
+ // â‘¡ [CIRCLED DIGIT TWO]
+ case '\u24F6':
+ // ⓶ [DOUBLE CIRCLED DIGIT TWO]
+ case '\u2777':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT TWO]
+ case '\u2781':
+ // � [DINGBAT CIRCLED SANS-SERIF DIGIT TWO]
+ case '\u278B':
+ // âž‹ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO]
+ case '\uFF12': // ï¼’ [FULLWIDTH DIGIT TWO]
+ output[outputPos++] = '2';
+ break;
+
+ case '\u2489': // â’‰ [DIGIT TWO FULL STOP]
+ output[outputPos++] = '2';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2475': // ⑵ [PARENTHESIZED DIGIT TWO]
+ output[outputPos++] = '(';
+ output[outputPos++] = '2';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00B3':
+ // ³ [SUPERSCRIPT THREE]
+ case '\u2083':
+ // ₃ [SUBSCRIPT THREE]
+ case '\u2462':
+ // â‘¢ [CIRCLED DIGIT THREE]
+ case '\u24F7':
+ // â“· [DOUBLE CIRCLED DIGIT THREE]
+ case '\u2778':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT THREE]
+ case '\u2782':
+ // âž‚ [DINGBAT CIRCLED SANS-SERIF DIGIT THREE]
+ case '\u278C':
+ // ➌ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE]
+ case '\uFF13': // 3 [FULLWIDTH DIGIT THREE]
+ output[outputPos++] = '3';
+ break;
+
+ case '\u248A': // â’Š [DIGIT THREE FULL STOP]
+ output[outputPos++] = '3';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2476': // ⑶ [PARENTHESIZED DIGIT THREE]
+ output[outputPos++] = '(';
+ output[outputPos++] = '3';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2074':
+ // � [SUPERSCRIPT FOUR]
+ case '\u2084':
+ // â‚„ [SUBSCRIPT FOUR]
+ case '\u2463':
+ // â‘£ [CIRCLED DIGIT FOUR]
+ case '\u24F8':
+ // ⓸ [DOUBLE CIRCLED DIGIT FOUR]
+ case '\u2779':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT FOUR]
+ case '\u2783':
+ // ➃ [DINGBAT CIRCLED SANS-SERIF DIGIT FOUR]
+ case '\u278D':
+ // � [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR]
+ case '\uFF14': // �? [FULLWIDTH DIGIT FOUR]
+ output[outputPos++] = '4';
+ break;
+
+ case '\u248B': // â’‹ [DIGIT FOUR FULL STOP]
+ output[outputPos++] = '4';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2477': // â‘· [PARENTHESIZED DIGIT FOUR]
+ output[outputPos++] = '(';
+ output[outputPos++] = '4';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2075':
+ // � [SUPERSCRIPT FIVE]
+ case '\u2085':
+ // â‚… [SUBSCRIPT FIVE]
+ case '\u2464':
+ // ⑤ [CIRCLED DIGIT FIVE]
+ case '\u24F9':
+ // ⓹ [DOUBLE CIRCLED DIGIT FIVE]
+ case '\u277A':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT FIVE]
+ case '\u2784':
+ // âž„ [DINGBAT CIRCLED SANS-SERIF DIGIT FIVE]
+ case '\u278E':
+ // ➎ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE]
+ case '\uFF15': // 5 [FULLWIDTH DIGIT FIVE]
+ output[outputPos++] = '5';
+ break;
+
+ case '\u248C': // ⒌ [DIGIT FIVE FULL STOP]
+ output[outputPos++] = '5';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2478': // ⑸ [PARENTHESIZED DIGIT FIVE]
+ output[outputPos++] = '(';
+ output[outputPos++] = '5';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2076':
+ // � [SUPERSCRIPT SIX]
+ case '\u2086':
+ // ₆ [SUBSCRIPT SIX]
+ case '\u2465':
+ // â‘¥ [CIRCLED DIGIT SIX]
+ case '\u24FA':
+ // ⓺ [DOUBLE CIRCLED DIGIT SIX]
+ case '\u277B':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT SIX]
+ case '\u2785':
+ // âž… [DINGBAT CIRCLED SANS-SERIF DIGIT SIX]
+ case '\u278F':
+ // � [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX]
+ case '\uFF16': // ï¼– [FULLWIDTH DIGIT SIX]
+ output[outputPos++] = '6';
+ break;
+
+ case '\u248D': // â’� [DIGIT SIX FULL STOP]
+ output[outputPos++] = '6';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2479': // ⑹ [PARENTHESIZED DIGIT SIX]
+ output[outputPos++] = '(';
+ output[outputPos++] = '6';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2077':
+ // � [SUPERSCRIPT SEVEN]
+ case '\u2087':
+ // ₇ [SUBSCRIPT SEVEN]
+ case '\u2466':
+ // ⑦ [CIRCLED DIGIT SEVEN]
+ case '\u24FB':
+ // â“» [DOUBLE CIRCLED DIGIT SEVEN]
+ case '\u277C':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT SEVEN]
+ case '\u2786':
+ // ➆ [DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN]
+ case '\u2790':
+ // � [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN]
+ case '\uFF17': // ï¼— [FULLWIDTH DIGIT SEVEN]
+ output[outputPos++] = '7';
+ break;
+
+ case '\u248E': // â’Ž [DIGIT SEVEN FULL STOP]
+ output[outputPos++] = '7';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u247A': // ⑺ [PARENTHESIZED DIGIT SEVEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '7';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2078':
+ // � [SUPERSCRIPT EIGHT]
+ case '\u2088':
+ // ₈ [SUBSCRIPT EIGHT]
+ case '\u2467':
+ // ⑧ [CIRCLED DIGIT EIGHT]
+ case '\u24FC':
+ // ⓼ [DOUBLE CIRCLED DIGIT EIGHT]
+ case '\u277D':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT EIGHT]
+ case '\u2787':
+ // ➇ [DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT]
+ case '\u2791':
+ // âž‘ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT]
+ case '\uFF18': // 8 [FULLWIDTH DIGIT EIGHT]
+ output[outputPos++] = '8';
+ break;
+
+ case '\u248F': // â’� [DIGIT EIGHT FULL STOP]
+ output[outputPos++] = '8';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u247B': // â‘» [PARENTHESIZED DIGIT EIGHT]
+ output[outputPos++] = '(';
+ output[outputPos++] = '8';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2079':
+ // � [SUPERSCRIPT NINE]
+ case '\u2089':
+ // ₉ [SUBSCRIPT NINE]
+ case '\u2468':
+ // ⑨ [CIRCLED DIGIT NINE]
+ case '\u24FD':
+ // ⓽ [DOUBLE CIRCLED DIGIT NINE]
+ case '\u277E':
+ // � [DINGBAT NEGATIVE CIRCLED DIGIT NINE]
+ case '\u2788':
+ // ➈ [DINGBAT CIRCLED SANS-SERIF DIGIT NINE]
+ case '\u2792':
+ // âž’ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE]
+ case '\uFF19': // ï¼™ [FULLWIDTH DIGIT NINE]
+ output[outputPos++] = '9';
+ break;
+
+ case '\u2490': // â’� [DIGIT NINE FULL STOP]
+ output[outputPos++] = '9';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u247C': // ⑼ [PARENTHESIZED DIGIT NINE]
+ output[outputPos++] = '(';
+ output[outputPos++] = '9';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2469':
+ // â‘© [CIRCLED NUMBER TEN]
+ case '\u24FE':
+ // ⓾ [DOUBLE CIRCLED NUMBER TEN]
+ case '\u277F':
+ // � [DINGBAT NEGATIVE CIRCLED NUMBER TEN]
+ case '\u2789':
+ // ➉ [DINGBAT CIRCLED SANS-SERIF NUMBER TEN]
+ case '\u2793': // âž“ [DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '0';
+ break;
+
+ case '\u2491': // â’‘ [NUMBER TEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '0';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u247D': // ⑽ [PARENTHESIZED NUMBER TEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '0';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u246A':
+ // ⑪ [CIRCLED NUMBER ELEVEN]
+ case '\u24EB': // â“« [NEGATIVE CIRCLED NUMBER ELEVEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '1';
+ break;
+
+ case '\u2492': // â’’ [NUMBER ELEVEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '1';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u247E': // ⑾ [PARENTHESIZED NUMBER ELEVEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '1';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u246B':
+ // â‘« [CIRCLED NUMBER TWELVE]
+ case '\u24EC': // ⓬ [NEGATIVE CIRCLED NUMBER TWELVE]
+ output[outputPos++] = '1';
+ output[outputPos++] = '2';
+ break;
+
+ case '\u2493': // â’“ [NUMBER TWELVE FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '2';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u247F': // â‘¿ [PARENTHESIZED NUMBER TWELVE]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '2';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u246C':
+ // ⑬ [CIRCLED NUMBER THIRTEEN]
+ case '\u24ED': // â“­ [NEGATIVE CIRCLED NUMBER THIRTEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '3';
+ break;
+
+ case '\u2494': // â’�? [NUMBER THIRTEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '3';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2480': // â’€ [PARENTHESIZED NUMBER THIRTEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '3';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u246D':
+ // â‘­ [CIRCLED NUMBER FOURTEEN]
+ case '\u24EE': // â“® [NEGATIVE CIRCLED NUMBER FOURTEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '4';
+ break;
+
+ case '\u2495': // â’• [NUMBER FOURTEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '4';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2481': // â’� [PARENTHESIZED NUMBER FOURTEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '4';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u246E':
+ // â‘® [CIRCLED NUMBER FIFTEEN]
+ case '\u24EF': // ⓯ [NEGATIVE CIRCLED NUMBER FIFTEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '5';
+ break;
+
+ case '\u2496': // â’– [NUMBER FIFTEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '5';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2482': // â’‚ [PARENTHESIZED NUMBER FIFTEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '5';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u246F':
+ // ⑯ [CIRCLED NUMBER SIXTEEN]
+ case '\u24F0': // â“° [NEGATIVE CIRCLED NUMBER SIXTEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '6';
+ break;
+
+ case '\u2497': // â’— [NUMBER SIXTEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '6';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2483': // â’ƒ [PARENTHESIZED NUMBER SIXTEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '6';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2470':
+ // â‘° [CIRCLED NUMBER SEVENTEEN]
+ case '\u24F1': // ⓱ [NEGATIVE CIRCLED NUMBER SEVENTEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '7';
+ break;
+
+ case '\u2498': // â’˜ [NUMBER SEVENTEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '7';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2484': // â’„ [PARENTHESIZED NUMBER SEVENTEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '7';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2471':
+ // ⑱ [CIRCLED NUMBER EIGHTEEN]
+ case '\u24F2': // ⓲ [NEGATIVE CIRCLED NUMBER EIGHTEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '8';
+ break;
+
+ case '\u2499': // â’™ [NUMBER EIGHTEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '8';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2485': // â’… [PARENTHESIZED NUMBER EIGHTEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '8';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2472':
+ // ⑲ [CIRCLED NUMBER NINETEEN]
+ case '\u24F3': // ⓳ [NEGATIVE CIRCLED NUMBER NINETEEN]
+ output[outputPos++] = '1';
+ output[outputPos++] = '9';
+ break;
+
+ case '\u249A': // â’š [NUMBER NINETEEN FULL STOP]
+ output[outputPos++] = '1';
+ output[outputPos++] = '9';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2486': // â’† [PARENTHESIZED NUMBER NINETEEN]
+ output[outputPos++] = '(';
+ output[outputPos++] = '1';
+ output[outputPos++] = '9';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2473':
+ // ⑳ [CIRCLED NUMBER TWENTY]
+ case '\u24F4': // â“´ [NEGATIVE CIRCLED NUMBER TWENTY]
+ output[outputPos++] = '2';
+ output[outputPos++] = '0';
+ break;
+
+ case '\u249B': // â’› [NUMBER TWENTY FULL STOP]
+ output[outputPos++] = '2';
+ output[outputPos++] = '0';
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2487': // â’‡ [PARENTHESIZED NUMBER TWENTY]
+ output[outputPos++] = '(';
+ output[outputPos++] = '2';
+ output[outputPos++] = '0';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u00AB':
+ // « [LEFT-POINTING DOUBLE ANGLE QUOTATION MARK]
+ case '\u00BB':
+ // » [RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK]
+ case '\u201C':
+ // “ [LEFT DOUBLE QUOTATION MARK]
+ case '\u201D':
+ // � [RIGHT DOUBLE QUOTATION MARK]
+ case '\u201E':
+ // „ [DOUBLE LOW-9 QUOTATION MARK]
+ case '\u2033':
+ // ″ [DOUBLE PRIME]
+ case '\u2036':
+ // ‶ [REVERSED DOUBLE PRIME]
+ case '\u275D':
+ // � [HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT]
+ case '\u275E':
+ // � [HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT]
+ case '\u276E':
+ // � [HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT]
+ case '\u276F':
+ // � [HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT]
+ case '\uFF02': // " [FULLWIDTH QUOTATION MARK]
+ output[outputPos++] = '"';
+ break;
+
+ case '\u2018':
+ // ‘ [LEFT SINGLE QUOTATION MARK]
+ case '\u2019':
+ // ’ [RIGHT SINGLE QUOTATION MARK]
+ case '\u201A':
+ // ‚ [SINGLE LOW-9 QUOTATION MARK]
+ case '\u201B':
+ // ‛ [SINGLE HIGH-REVERSED-9 QUOTATION MARK]
+ case '\u2032':
+ // ′ [PRIME]
+ case '\u2035':
+ // ‵ [REVERSED PRIME]
+ case '\u2039':
+ // ‹ [SINGLE LEFT-POINTING ANGLE QUOTATION MARK]
+ case '\u203A':
+ // › [SINGLE RIGHT-POINTING ANGLE QUOTATION MARK]
+ case '\u275B':
+ // � [HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT]
+ case '\u275C':
+ // � [HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT]
+ case '\uFF07': // ' [FULLWIDTH APOSTROPHE]
+ output[outputPos++] = '\'';
+ break;
+
+ case '\u2010':
+ // � [HYPHEN]
+ case '\u2011':
+ // ‑ [NON-BREAKING HYPHEN]
+ case '\u2012':
+ // ‒ [FIGURE DASH]
+ case '\u2013':
+ // – [EN DASH]
+ case '\u2014':
+ // �? [EM DASH]
+ case '\u207B':
+ // � [SUPERSCRIPT MINUS]
+ case '\u208B':
+ // â‚‹ [SUBSCRIPT MINUS]
+ case '\uFF0D': // � [FULLWIDTH HYPHEN-MINUS]
+ output[outputPos++] = '-';
+ break;
+
+ case '\u2045':
+ // � [LEFT SQUARE BRACKET WITH QUILL]
+ case '\u2772':
+ // � [LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT]
+ case '\uFF3B': // ï¼» [FULLWIDTH LEFT SQUARE BRACKET]
+ output[outputPos++] = '[';
+ break;
+
+ case '\u2046':
+ // � [RIGHT SQUARE BRACKET WITH QUILL]
+ case '\u2773':
+ // � [LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT]
+ case '\uFF3D': // ï¼½ [FULLWIDTH RIGHT SQUARE BRACKET]
+ output[outputPos++] = ']';
+ break;
+
+ case '\u207D':
+ // � [SUPERSCRIPT LEFT PARENTHESIS]
+ case '\u208D':
+ // � [SUBSCRIPT LEFT PARENTHESIS]
+ case '\u2768':
+ // � [MEDIUM LEFT PARENTHESIS ORNAMENT]
+ case '\u276A':
+ // � [MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT]
+ case '\uFF08': // ( [FULLWIDTH LEFT PARENTHESIS]
+ output[outputPos++] = '(';
+ break;
+
+ case '\u2E28': // ⸨ [LEFT DOUBLE PARENTHESIS]
+ output[outputPos++] = '(';
+ output[outputPos++] = '(';
+ break;
+
+ case '\u207E':
+ // � [SUPERSCRIPT RIGHT PARENTHESIS]
+ case '\u208E':
+ // â‚Ž [SUBSCRIPT RIGHT PARENTHESIS]
+ case '\u2769':
+ // � [MEDIUM RIGHT PARENTHESIS ORNAMENT]
+ case '\u276B':
+ // � [MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT]
+ case '\uFF09': // ) [FULLWIDTH RIGHT PARENTHESIS]
+ output[outputPos++] = ')';
+ break;
+
+ case '\u2E29': // ⸩ [RIGHT DOUBLE PARENTHESIS]
+ output[outputPos++] = ')';
+ output[outputPos++] = ')';
+ break;
+
+ case '\u276C':
+ // � [MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT]
+ case '\u2770':
+ // � [HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT]
+ case '\uFF1C': // < [FULLWIDTH LESS-THAN SIGN]
+ output[outputPos++] = '<';
+ break;
+
+ case '\u276D':
+ // � [MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT]
+ case '\u2771':
+ // � [HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT]
+ case '\uFF1E': // > [FULLWIDTH GREATER-THAN SIGN]
+ output[outputPos++] = '>';
+ break;
+
+ case '\u2774':
+ // � [MEDIUM LEFT CURLY BRACKET ORNAMENT]
+ case '\uFF5B': // ï½› [FULLWIDTH LEFT CURLY BRACKET]
+ output[outputPos++] = '{';
+ break;
+
+ case '\u2775':
+ // � [MEDIUM RIGHT CURLY BRACKET ORNAMENT]
+ case '\uFF5D': // � [FULLWIDTH RIGHT CURLY BRACKET]
+ output[outputPos++] = '}';
+ break;
+
+ case '\u207A':
+ // � [SUPERSCRIPT PLUS SIGN]
+ case '\u208A':
+ // â‚Š [SUBSCRIPT PLUS SIGN]
+ case '\uFF0B': // + [FULLWIDTH PLUS SIGN]
+ output[outputPos++] = '+';
+ break;
+
+ case '\u207C':
+ // � [SUPERSCRIPT EQUALS SIGN]
+ case '\u208C':
+ // ₌ [SUBSCRIPT EQUALS SIGN]
+ case '\uFF1D': // � [FULLWIDTH EQUALS SIGN]
+ output[outputPos++] = '=';
+ break;
+
+ case '\uFF01': // � [FULLWIDTH EXCLAMATION MARK]
+ output[outputPos++] = '!';
+ break;
+
+ case '\u203C': // ‼ [DOUBLE EXCLAMATION MARK]
+ output[outputPos++] = '!';
+ output[outputPos++] = '!';
+ break;
+
+ case '\u2049': // � [EXCLAMATION QUESTION MARK]
+ output[outputPos++] = '!';
+ output[outputPos++] = '?';
+ break;
+
+ case '\uFF03': // # [FULLWIDTH NUMBER SIGN]
+ output[outputPos++] = '#';
+ break;
+
+ case '\uFF04': // $ [FULLWIDTH DOLLAR SIGN]
+ output[outputPos++] = '$';
+ break;
+
+ case '\u2052':
+ // � [COMMERCIAL MINUS SIGN]
+ case '\uFF05': // ï¼… [FULLWIDTH PERCENT SIGN]
+ output[outputPos++] = '%';
+ break;
+
+ case '\uFF06': // & [FULLWIDTH AMPERSAND]
+ output[outputPos++] = '&';
+ break;
+
+ case '\u204E':
+ // � [LOW ASTERISK]
+ case '\uFF0A': // * [FULLWIDTH ASTERISK]
+ output[outputPos++] = '*';
+ break;
+
+ case '\uFF0C': // , [FULLWIDTH COMMA]
+ output[outputPos++] = ',';
+ break;
+
+ case '\uFF0E': // . [FULLWIDTH FULL STOP]
+ output[outputPos++] = '.';
+ break;
+
+ case '\u2044':
+ // � [FRACTION SLASH]
+ case '\uFF0F': // � [FULLWIDTH SOLIDUS]
+ output[outputPos++] = '/';
+ break;
+
+ case '\uFF1A': // : [FULLWIDTH COLON]
+ output[outputPos++] = ':';
+ break;
+
+ case '\u204F':
+ // � [REVERSED SEMICOLON]
+ case '\uFF1B': // ï¼› [FULLWIDTH SEMICOLON]
+ output[outputPos++] = ';';
+ break;
+
+ case '\uFF1F': // ? [FULLWIDTH QUESTION MARK]
+ output[outputPos++] = '?';
+ break;
+
+ case '\u2047': // � [DOUBLE QUESTION MARK]
+ output[outputPos++] = '?';
+ output[outputPos++] = '?';
+ break;
+
+ case '\u2048': // � [QUESTION EXCLAMATION MARK]
+ output[outputPos++] = '?';
+ output[outputPos++] = '!';
+ break;
+
+ case '\uFF20': // ï¼  [FULLWIDTH COMMERCIAL AT]
+ output[outputPos++] = '@';
+ break;
+
+ case '\uFF3C': // ï¼¼ [FULLWIDTH REVERSE SOLIDUS]
+ output[outputPos++] = '\\';
+ break;
+
+ case '\u2038':
+ // ‸ [CARET]
+ case '\uFF3E': // ï¼¾ [FULLWIDTH CIRCUMFLEX ACCENT]
+ output[outputPos++] = '^';
+ break;
+
+ case '\uFF3F': // _ [FULLWIDTH LOW LINE]
+ output[outputPos++] = '_';
+ break;
+
+ case '\u2053':
+ // � [SWUNG DASH]
+ case '\uFF5E': // ~ [FULLWIDTH TILDE]
+ output[outputPos++] = '~';
+ break;
+
+ default:
+ output[outputPos++] = c;
+ break;
+
+ }
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Analyzer.cs b/src/core/Analysis/Analyzer.cs
new file mode 100644
index 0000000..cea0ee3
--- /dev/null
+++ b/src/core/Analysis/Analyzer.cs
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Documents;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis
+{
+ /// <summary>An Analyzer builds TokenStreams, which analyze text. It thus represents a
+ /// policy for extracting index terms from text.
+ /// <p/>
+ /// Typical implementations first build a Tokenizer, which breaks the stream of
+ /// characters from the Reader into raw Tokens. One or more TokenFilters may
+ /// then be applied to the output of the Tokenizer.
+ /// </summary>
+ public abstract class Analyzer : IDisposable
+ {
+ /// <summary>Creates a TokenStream which tokenizes all the text in the provided
+ /// Reader. Must be able to handle null field name for
+ /// backward compatibility.
+ /// </summary>
+ public abstract TokenStream TokenStream(String fieldName, System.IO.TextReader reader);
+
+ /// <summary>Creates a TokenStream that is allowed to be re-used
+ /// from the previous time that the same thread called
+ /// this method. Callers that do not need to use more
+ /// than one TokenStream at the same time from this
+ /// analyzer should use this method for better
+ /// performance.
+ /// </summary>
+ public virtual TokenStream ReusableTokenStream(String fieldName, System.IO.TextReader reader)
+ {
+ return TokenStream(fieldName, reader);
+ }
+
+ private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
+ private bool isDisposed;
+
+ /// <summary>Used by Analyzers that implement reusableTokenStream
+ /// to retrieve previously saved TokenStreams for re-use
+ /// by the same thread.
+ /// </summary>
+ protected internal virtual object PreviousTokenStream
+ {
+ get
+ {
+ if (tokenStreams == null)
+ {
+ throw new AlreadyClosedException("this Analyzer is closed");
+ }
+ return tokenStreams.Get();
+ }
+ set
+ {
+ if (tokenStreams == null)
+ {
+ throw new AlreadyClosedException("this Analyzer is closed");
+ }
+ tokenStreams.Set(value);
+ }
+ }
+
+ [Obsolete()]
+ protected internal bool overridesTokenStreamMethod = false;
+
+ /// <deprecated> This is only present to preserve
+ /// back-compat of classes that subclass a core analyzer
+ /// and override tokenStream but not reusableTokenStream
+ /// </deprecated>
+ /// <summary>
+ /// Java uses Class&lt;? extends Analyzer&gt; to constrain <typeparamref name="TClass"/> to
+ /// only Types that inherit from Analyzer. C# does not have a generic type class,
+ /// ie Type&lt;t&gt;. The method signature stays the same, and an exception may
+ /// still be thrown, if the method doesn't exist.
+ /// </summary>
+ [Obsolete("This is only present to preserve back-compat of classes that subclass a core analyzer and override tokenStream but not reusableTokenStream ")]
+ protected internal virtual void SetOverridesTokenStreamMethod<TClass>()
+ where TClass : Analyzer
+ {
+ try
+ {
+ System.Reflection.MethodInfo m = this.GetType().GetMethod("TokenStream", new[] { typeof(string), typeof(System.IO.TextReader) });
+ overridesTokenStreamMethod = m.DeclaringType != typeof(TClass);
+ }
+ catch (MethodAccessException)
+ {
+ // can't happen, as baseClass is subclass of Analyzer
+ overridesTokenStreamMethod = false;
+ }
+ }
+
+
+ /// <summary> Invoked before indexing a Fieldable instance if
+ /// terms have already been added to that field. This allows custom
+ /// analyzers to place an automatic position increment gap between
+ /// Fieldable instances using the same field name. The default value
+ /// position increment gap is 0. With a 0 position increment gap and
+ /// the typical default token position increment of 1, all terms in a field,
+ /// including across Fieldable instances, are in successive positions, allowing
+ /// exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
+ ///
+ /// </summary>
+ /// <param name="fieldName">Fieldable name being indexed.
+ /// </param>
+ /// <returns> position increment gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" />
+ /// </returns>
+ public virtual int GetPositionIncrementGap(String fieldName)
+ {
+ return 0;
+ }
+
+ /// <summary> Just like <see cref="GetPositionIncrementGap" />, except for
+ /// Token offsets instead. By default this returns 1 for
+ /// tokenized fields, as if the fields were joined
+ /// with an extra space character, and 0 for un-tokenized
+ /// fields. This method is only called if the field
+ /// produced at least one token for indexing.
+ ///
+ /// </summary>
+ /// <param name="field">the field just indexed
+ /// </param>
+ /// <returns> offset gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" />
+ /// </returns>
+ public virtual int GetOffsetGap(IFieldable field)
+ {
+ return field.IsTokenized ? 1 : 0;
+ }
+
+ /// <summary>Frees persistent resources used by this Analyzer </summary>
+ public void Close()
+ {
+ Dispose();
+ }
+
+ public virtual void Dispose()
+ {
+ Dispose(true);
+ }
+
+ protected virtual void Dispose(bool disposing)
+ {
+ if (isDisposed) return;
+
+ if (disposing)
+ {
+ if (tokenStreams != null)
+ {
+ tokenStreams.Close();
+ tokenStreams = null;
+ }
+ }
+ isDisposed = true;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/BaseCharFilter.cs b/src/core/Analysis/BaseCharFilter.cs
new file mode 100644
index 0000000..b84fce0
--- /dev/null
+++ b/src/core/Analysis/BaseCharFilter.cs
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>
+ /// * Base utility class for implementing a <see cref="CharFilter" />.
+ /// * You subclass this, and then record mappings by calling
+ /// * <see cref="AddOffCorrectMap" />, and then invoke the correct
+ /// * method to correct an offset.
+ /// </summary>
+ public abstract class BaseCharFilter : CharFilter
+ {
+
+ private int[] offsets;
+ private int[] diffs;
+ private int size = 0;
+
+ protected BaseCharFilter(CharStream @in) : base(@in)
+ {
+ }
+
+ /* Retrieve the corrected offset. */
+ //@Override
+ protected internal override int Correct(int currentOff)
+ {
+ if (offsets == null || currentOff < offsets[0])
+ {
+ return currentOff;
+ }
+
+ int hi = size - 1;
+ if (currentOff >= offsets[hi])
+ return currentOff + diffs[hi];
+
+ int lo = 0;
+ int mid = -1;
+
+ while (hi >= lo)
+ {
+ mid = Number.URShift(lo + hi, 1);
+ if (currentOff < offsets[mid])
+ hi = mid - 1;
+ else if (currentOff > offsets[mid])
+ lo = mid + 1;
+ else
+ return currentOff + diffs[mid];
+ }
+
+ if (currentOff < offsets[mid])
+ return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
+ return currentOff + diffs[mid];
+ }
+
+ protected int LastCumulativeDiff
+ {
+ get
+ {
+ return offsets == null ? 0 : diffs[size - 1];
+ }
+ }
+
+ [Obsolete("Use LastCumulativeDiff property instead")]
+ protected int GetLastCumulativeDiff()
+ {
+ return LastCumulativeDiff;
+ }
+
+ protected void AddOffCorrectMap(int off, int cumulativeDiff)
+ {
+ if (offsets == null)
+ {
+ offsets = new int[64];
+ diffs = new int[64];
+ }
+ else if (size == offsets.Length)
+ {
+ offsets = ArrayUtil.Grow(offsets);
+ diffs = ArrayUtil.Grow(diffs);
+ }
+
+ offsets[size] = off;
+ diffs[size++] = cumulativeDiff;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/CachingTokenFilter.cs b/src/core/Analysis/CachingTokenFilter.cs
new file mode 100644
index 0000000..c5f7694
--- /dev/null
+++ b/src/core/Analysis/CachingTokenFilter.cs
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> This class can be used if the token attributes of a TokenStream
+ /// are intended to be consumed more than once. It caches
+ /// all token attribute states locally in a List.
+ ///
+ /// <p/>CachingTokenFilter implements the optional method
+ /// <see cref="TokenStream.Reset()" />, which repositions the
+ /// stream to the first Token.
+ /// </summary>
+ public sealed class CachingTokenFilter : TokenFilter
+ {
+ private System.Collections.Generic.LinkedList<State> cache = null;
+ private System.Collections.Generic.IEnumerator<State> iterator = null;
+ private State finalState;
+
+ public CachingTokenFilter(TokenStream input):base(input)
+ {
+ }
+
+ public override bool IncrementToken()
+ {
+ if (cache == null)
+ {
+ // fill cache lazily
+ cache = new System.Collections.Generic.LinkedList<State>();
+ FillCache();
+ iterator = cache.GetEnumerator();
+ }
+
+ if (!iterator.MoveNext())
+ {
+ // the cache is exhausted, return false
+ return false;
+ }
+ // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
+ RestoreState(iterator.Current);
+ return true;
+ }
+
+ public override void End()
+ {
+ if (finalState != null)
+ {
+ RestoreState(finalState);
+ }
+ }
+
+ public override void Reset()
+ {
+ if (cache != null)
+ {
+ iterator = cache.GetEnumerator();
+ }
+ }
+
+ private void FillCache()
+ {
+ while (input.IncrementToken())
+ {
+ cache.AddLast(CaptureState());
+ }
+ // capture final state
+ input.End();
+ finalState = CaptureState();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/CharArraySet.cs b/src/core/Analysis/CharArraySet.cs
new file mode 100644
index 0000000..e7df0ba
--- /dev/null
+++ b/src/core/Analysis/CharArraySet.cs
@@ -0,0 +1,517 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Linq;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis
+{
+ /// <summary> A simple class that stores Strings as char[]'s in a
+ /// hash table. Note that this is not a general purpose
+ /// class. For example, it cannot remove items from the
+ /// set, nor does it resize its hash table to be smaller,
+ /// etc. It is designed to be quick to test if a char[]
+ /// is in the set without the necessity of converting it
+ /// to a String first.
+ /// <p/>
+ /// <em>Please note:</em> This class implements <see cref="System.Collections.Generic.ISet{T}"/> but
+ /// does not behave like it should in all cases. The generic type is
+ /// <see cref="System.Collections.Generic.ICollection{T}"/>, because you can add any object to it,
+ /// that has a string representation. The add methods will use
+ /// <see cref="object.ToString()"/> and store the result using a <see cref="char"/>
+ /// buffer. The same behaviour have the <see cref="Contains(object)"/> methods.
+ /// The <see cref="GetEnumerator"/> method returns an <see cref="string"/> IEnumerable.
+ /// For type safety also {@link #stringIterator()} is provided.
+ /// </summary>
+ // TODO: java uses wildcards, .net doesn't have this, easiest way is to
+ // make the entire class generic. Ultimately, though, since this
+ // works with strings, I can't think of a reason not to just declare
+ // this as an ISet<string>.
+ public class CharArraySet : ISet<string>
+ {
+ bool _ReadOnly = false;
+ const int INIT_SIZE = 8;
+ char[][] _Entries;
+ int _Count;
+ bool _IgnoreCase;
+ public static CharArraySet EMPTY_SET = UnmodifiableSet(new CharArraySet(0, false));
+
+ private void Init(int startSize, bool ignoreCase)
+ {
+ this._IgnoreCase = ignoreCase;
+ int size = INIT_SIZE;
+ while (startSize + (startSize >> 2) > size)
+ size <<= 1;
+ _Entries = new char[size][];
+ }
+
+ /// <summary>Create set with enough capacity to hold startSize
+ /// terms
+ /// </summary>
+ public CharArraySet(int startSize, bool ignoreCase)
+ {
+ Init(startSize, ignoreCase);
+ }
+
+ public CharArraySet(IEnumerable<string> c, bool ignoreCase)
+ {
+ Init(c.Count(), ignoreCase);
+ AddItems(c);
+ }
+
+ /// <summary>Create set from a Collection of char[] or String </summary>
+ public CharArraySet(IEnumerable<object> c, bool ignoreCase)
+ {
+ Init(c.Count(), ignoreCase);
+ AddItems(c);
+ }
+
+ private void AddItems<T>(IEnumerable<T> items)
+ {
+ foreach(var item in items)
+ {
+ Add(item.ToString());
+ }
+ }
+
+ /// <summary>Create set from entries </summary>
+ private CharArraySet(char[][] entries, bool ignoreCase, int count)
+ {
+ this._Entries = entries;
+ this._IgnoreCase = ignoreCase;
+ this._Count = count;
+ }
+
+ /// <summary>true if the <c>len</c> chars of <c>text</c> starting at <c>off</c>
+ /// are in the set
+ /// </summary>
+ public virtual bool Contains(char[] text, int off, int len)
+ {
+ return _Entries[GetSlot(text, off, len)] != null;
+ }
+
+ public virtual bool Contains(string text)
+ {
+ return _Entries[GetSlot(text)] != null;
+ }
+
+
+ private int GetSlot(char[] text, int off, int len)
+ {
+ int code = GetHashCode(text, off, len);
+ int pos = code & (_Entries.Length - 1);
+ char[] text2 = _Entries[pos];
+ if (text2 != null && !Equals(text, off, len, text2))
+ {
+ int inc = ((code >> 8) + code) | 1;
+ do
+ {
+ code += inc;
+ pos = code & (_Entries.Length - 1);
+ text2 = _Entries[pos];
+ }
+ while (text2 != null && !Equals(text, off, len, text2));
+ }
+ return pos;
+ }
+
+ /// <summary>Returns true if the String is in the set </summary>
+ private int GetSlot(string text)
+ {
+ int code = GetHashCode(text);
+ int pos = code & (_Entries.Length - 1);
+ char[] text2 = _Entries[pos];
+ if (text2 != null && !Equals(text, text2))
+ {
+ int inc = ((code >> 8) + code) | 1;
+ do
+ {
+ code += inc;
+ pos = code & (_Entries.Length - 1);
+ text2 = _Entries[pos];
+ }
+ while (text2 != null && !Equals(text, text2));
+ }
+ return pos;
+ }
+
+ public bool Add(string text)
+ {
+ if (_ReadOnly) throw new NotSupportedException();
+ return Add(text.ToCharArray());
+ }
+
+ /// <summary>Add this char[] directly to the set.
+ /// If ignoreCase is true for this Set, the text array will be directly modified.
+ /// The user should never modify this text array after calling this method.
+ /// </summary>
+ public bool Add(char[] text)
+ {
+ if (_ReadOnly) throw new NotSupportedException();
+
+ if (_IgnoreCase)
+ for (int i = 0; i < text.Length; i++)
+ text[i] = Char.ToLower(text[i]);
+ int slot = GetSlot(text, 0, text.Length);
+ if (_Entries[slot] != null)
+ return false;
+ _Entries[slot] = text;
+ _Count++;
+
+ if (_Count + (_Count >> 2) > _Entries.Length)
+ {
+ Rehash();
+ }
+
+ return true;
+ }
+
+ private bool Equals(char[] text1, int off, int len, char[] text2)
+ {
+ if (len != text2.Length)
+ return false;
+ if (_IgnoreCase)
+ {
+ for (int i = 0; i < len; i++)
+ {
+ if (char.ToLower(text1[off + i]) != text2[i])
+ return false;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < len; i++)
+ {
+ if (text1[off + i] != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private bool Equals(string text1, char[] text2)
+ {
+ int len = text1.Length;
+ if (len != text2.Length)
+ return false;
+ if (_IgnoreCase)
+ {
+ for (int i = 0; i < len; i++)
+ {
+ if (char.ToLower(text1[i]) != text2[i])
+ return false;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < len; i++)
+ {
+ if (text1[i] != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void Rehash()
+ {
+ int newSize = 2 * _Entries.Length;
+ char[][] oldEntries = _Entries;
+ _Entries = new char[newSize][];
+
+ for (int i = 0; i < oldEntries.Length; i++)
+ {
+ char[] text = oldEntries[i];
+ if (text != null)
+ {
+ // todo: could be faster... no need to compare strings on collision
+ _Entries[GetSlot(text, 0, text.Length)] = text;
+ }
+ }
+ }
+
+ private int GetHashCode(char[] text, int offset, int len)
+ {
+ int code = 0;
+ int stop = offset + len;
+ if (_IgnoreCase)
+ {
+ for (int i = offset; i < stop; i++)
+ {
+ code = code * 31 + char.ToLower(text[i]);
+ }
+ }
+ else
+ {
+ for (int i = offset; i < stop; i++)
+ {
+ code = code * 31 + text[i];
+ }
+ }
+ return code;
+ }
+
+ private int GetHashCode(string text)
+ {
+ int code = 0;
+ int len = text.Length;
+ if (_IgnoreCase)
+ {
+ for (int i = 0; i < len; i++)
+ {
+ code = code * 31 + char.ToLower(text[i]);
+ }
+ }
+ else
+ {
+ for (int i = 0; i < len; i++)
+ {
+ code = code * 31 + text[i];
+ }
+ }
+ return code;
+ }
+
+ public int Count
+ {
+ get { return _Count; }
+ }
+
+ public bool IsEmpty
+ {
+ get { return _Count == 0; }
+ }
+
+ public bool Contains(object item)
+ {
+ var text = item as char[];
+ return text != null ? Contains(text, 0, text.Length) : Contains(item.ToString());
+ }
+
+ public bool Add(object item)
+ {
+ return Add(item.ToString());
+ }
+
+ void ICollection<string>.Add(string item)
+ {
+ this.Add(item);
+ }
+
+ /// <summary>
+ /// Returns an unmodifiable <see cref="CharArraySet"/>. This allows to provide
+ /// unmodifiable views of internal sets for "read-only" use
+ /// </summary>
+ /// <param name="set">A Set for which the unmodifiable set it returns.</param>
+ /// <returns>A new unmodifiable <see cref="CharArraySet"/></returns>
+ /// <throws>ArgumentNullException if the given set is <c>null</c></throws>
+ public static CharArraySet UnmodifiableSet(CharArraySet set)
+ {
+ if(set == null)
+ throw new ArgumentNullException("Given set is null");
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set._ReadOnly)
+ return set;
+
+ var newSet = new CharArraySet(set._Entries, set._IgnoreCase, set.Count) {IsReadOnly = true};
+ return newSet;
+ }
+
+ /// <summary>
+ /// returns a copy of the given set as a <see cref="CharArraySet"/>. If the given set
+ /// is a <see cref="CharArraySet"/> the ignoreCase property will be preserved.
+ /// </summary>
+ /// <param name="set">A set to copy</param>
+ /// <returns>a copy of the given set as a <see cref="CharArraySet"/>. If the given set
+ /// is a <see cref="CharArraySet"/> the ignoreCase property will be preserved.</returns>
+ public static CharArraySet Copy<T>(ISet<T> set)
+ {
+ if (set == null)
+ throw new ArgumentNullException("set", "Given set is null!");
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ bool ignoreCase = set is CharArraySet && ((CharArraySet)set)._IgnoreCase;
+ var arrSet = new CharArraySet(set.Count, ignoreCase);
+ arrSet.AddItems(set);
+ return arrSet;
+ }
+
+ public void Clear()
+ {
+ throw new NotSupportedException("Remove not supported!");
+ }
+
+ public bool IsReadOnly
+ {
+ get { return _ReadOnly; }
+ private set { _ReadOnly = value; }
+ }
+
+ /// <summary>Adds all of the elements in the specified collection to this collection </summary>
+ public void UnionWith(IEnumerable<string> other)
+ {
+ if (_ReadOnly) throw new NotSupportedException();
+
+ foreach (string s in other)
+ {
+ Add(s.ToCharArray());
+ }
+ }
+
+ /// <summary>Wrapper that calls UnionWith</summary>
+ public void AddAll(IEnumerable<string> coll)
+ {
+ UnionWith(coll);
+ }
+
+ #region Unneeded methods
+ public void RemoveAll(ICollection<string> c)
+ {
+ throw new NotSupportedException();
+ }
+
+ public void RetainAll(ICollection<string> c)
+ {
+ throw new NotSupportedException();
+ }
+
+ void ICollection<string>.CopyTo(string[] array, int arrayIndex)
+ {
+ throw new NotSupportedException();
+ }
+
+ void ISet<string>.IntersectWith(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ void ISet<string>.ExceptWith(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ void ISet<string>.SymmetricExceptWith(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ISet<string>.IsSubsetOf(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ISet<string>.IsSupersetOf(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ISet<string>.IsProperSupersetOf(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ISet<string>.IsProperSubsetOf(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ISet<string>.Overlaps(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ISet<string>.SetEquals(IEnumerable<string> other)
+ {
+ throw new NotSupportedException();
+ }
+
+ bool ICollection<string>.Remove(string item)
+ {
+ throw new NotSupportedException();
+ }
+ #endregion
+
+ /// <summary>
+ /// The IEnumerator&lt;String&gt; for this set. Strings are constructed on the fly,
+ /// so use <c>nextCharArray</c> for more efficient access
+ /// </summary>
+ public class CharArraySetEnumerator : IEnumerator<string>
+ {
+ readonly CharArraySet _Creator;
+ int pos = -1;
+ char[] cur;
+
+ protected internal CharArraySetEnumerator(CharArraySet creator)
+ {
+ _Creator = creator;
+ }
+
+ public bool MoveNext()
+ {
+ cur = null;
+ pos++;
+ while (pos < _Creator._Entries.Length && (cur = _Creator._Entries[pos]) == null)
+ pos++;
+ return cur != null;
+ }
+
+ /// <summary>do not modify the returned char[] </summary>
+ public char[] NextCharArray()
+ {
+ return cur;
+ }
+
+ public string Current
+ {
+ get { return new string(NextCharArray()); }
+ }
+
+ public void Dispose()
+ {
+ }
+
+ object IEnumerator.Current
+ {
+ get { return new string(NextCharArray()); }
+ }
+
+ public void Reset()
+ {
+ throw new NotImplementedException();
+ }
+ }
+
+ public IEnumerator<string> StringEnumerator()
+ {
+ return new CharArraySetEnumerator(this);
+ }
+
+ public IEnumerator<string> GetEnumerator()
+ {
+ return new CharArraySetEnumerator(this);
+ }
+
+ IEnumerator IEnumerable.GetEnumerator()
+ {
+ return GetEnumerator();
+ }
+ }
+
+} \ No newline at end of file
diff --git a/src/core/Analysis/CharFilter.cs b/src/core/Analysis/CharFilter.cs
new file mode 100644
index 0000000..039f841
--- /dev/null
+++ b/src/core/Analysis/CharFilter.cs
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Subclasses of CharFilter can be chained to filter CharStream.
+ /// They can be used as <see cref="System.IO.TextReader" /> with additional offset
+ /// correction. <see cref="Tokenizer" />s will automatically use <see cref="CorrectOffset" />
+ /// if a CharFilter/CharStream subclass is used.
+ ///
+ /// </summary>
+ /// <version> $Id$
+ ///
+ /// </version>
+ public abstract class CharFilter : CharStream
+ {
+ private long currentPosition = -1;
+ private bool isDisposed;
+ protected internal CharStream input;
+
+ protected internal CharFilter(CharStream in_Renamed) : base(in_Renamed)
+ {
+ input = in_Renamed;
+ }
+
+ /// <summary>Subclass may want to override to correct the current offset.</summary>
+ /// <param name="currentOff">current offset</param>
+ /// <returns>corrected offset</returns>
+ protected internal virtual int Correct(int currentOff)
+ {
+ return currentOff;
+ }
+
+ /// <summary> Chains the corrected offset through the input
+ /// CharFilter.
+ /// </summary>
+ public override int CorrectOffset(int currentOff)
+ {
+ return input.CorrectOffset(Correct(currentOff));
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (isDisposed) return;
+
+ if (disposing)
+ {
+ if (input != null)
+ {
+ input.Close();
+ }
+ }
+
+ input = null;
+ isDisposed = true;
+ base.Dispose(disposing);
+ }
+
+ public override int Read(System.Char[] cbuf, int off, int len)
+ {
+ return input.Read(cbuf, off, len);
+ }
+
+ public bool MarkSupported()
+ {
+ return input.BaseStream.CanSeek;
+ }
+
+ public void Mark(int readAheadLimit)
+ {
+ currentPosition = input.BaseStream.Position;
+ input.BaseStream.Position = readAheadLimit;
+ }
+
+ public void Reset()
+ {
+ input.BaseStream.Position = currentPosition;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/CharReader.cs b/src/core/Analysis/CharReader.cs
new file mode 100644
index 0000000..2120bd4
--- /dev/null
+++ b/src/core/Analysis/CharReader.cs
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> CharReader is a Reader wrapper. It reads chars from
+ /// Reader and outputs <see cref="CharStream" />, defining an
+ /// identity function <see cref="CorrectOffset" /> method that
+ /// simply returns the provided offset.
+ /// </summary>
+ public sealed class CharReader:CharStream
+ {
+ private long currentPosition = -1;
+
+ private bool isDisposed;
+
+ internal System.IO.StreamReader input;
+
+ public static CharStream Get(System.IO.TextReader input)
+ {
+ var charStream = input as CharStream;
+ if (charStream != null)
+ return charStream;
+
+ // {{Aroush-2.9}} isn't there a better (faster) way to do this?
+ var theString = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(input.ReadToEnd()));
+ return new CharReader(new System.IO.StreamReader(theString));
+ //return input is CharStream?(CharStream) input:new CharReader(input);
+ }
+
+ private CharReader(System.IO.StreamReader in_Renamed) : base(in_Renamed)
+ {
+ input = in_Renamed;
+ }
+
+ public override int CorrectOffset(int currentOff)
+ {
+ return currentOff;
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (isDisposed) return;
+
+ if (disposing)
+ {
+ if (input != null)
+ {
+ input.Close();
+ }
+ }
+
+ input = null;
+ isDisposed = true;
+ base.Dispose(disposing);
+ }
+
+ public override int Read(System.Char[] cbuf, int off, int len)
+ {
+ return input.Read(cbuf, off, len);
+ }
+
+ public bool MarkSupported()
+ {
+ return input.BaseStream.CanSeek;
+ }
+
+ public void Mark(int readAheadLimit)
+ {
+ currentPosition = input.BaseStream.Position;
+ input.BaseStream.Position = readAheadLimit;
+ }
+
+ public void Reset()
+ {
+ input.BaseStream.Position = currentPosition;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/CharStream.cs b/src/core/Analysis/CharStream.cs
new file mode 100644
index 0000000..0b36fe2
--- /dev/null
+++ b/src/core/Analysis/CharStream.cs
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> CharStream adds <see cref="CorrectOffset" />
+ /// functionality over <see cref="System.IO.TextReader" />. All Tokenizers accept a
+ /// CharStream instead of <see cref="System.IO.TextReader" /> as input, which enables
+ /// arbitrary character based filtering before tokenization.
+ /// The <see cref="CorrectOffset" /> method fixed offsets to account for
+ /// removal or insertion of characters, so that the offsets
+ /// reported in the tokens match the character offsets of the
+ /// original Reader.
+ /// </summary>
+ public abstract class CharStream : System.IO.StreamReader
+ {
+ protected CharStream(System.IO.StreamReader reader) : base(reader.BaseStream)
+ {
+ }
+
+ /// <summary> Called by CharFilter(s) and Tokenizer to correct token offset.
+ ///
+ /// </summary>
+ /// <param name="currentOff">offset as seen in the output
+ /// </param>
+ /// <returns> corrected offset based on the input
+ /// </returns>
+ public abstract int CorrectOffset(int currentOff);
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/CharTokenizer.cs b/src/core/Analysis/CharTokenizer.cs
new file mode 100644
index 0000000..22423ec
--- /dev/null
+++ b/src/core/Analysis/CharTokenizer.cs
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>An abstract base class for simple, character-oriented tokenizers.</summary>
+ public abstract class CharTokenizer:Tokenizer
+ {
+ protected CharTokenizer(System.IO.TextReader input):base(input)
+ {
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ protected CharTokenizer(AttributeSource source, System.IO.TextReader input):base(source, input)
+ {
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input):base(factory, input)
+ {
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ private int offset = 0, bufferIndex = 0, dataLen = 0;
+ private const int MAX_WORD_LEN = 255;
+ private const int IO_BUFFER_SIZE = 4096;
+ private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ private readonly ITermAttribute termAtt;
+ private readonly IOffsetAttribute offsetAtt;
+
+ /// <summary>Returns true iff a character should be included in a token. This
+ /// tokenizer generates as tokens adjacent sequences of characters which
+ /// satisfy this predicate. Characters for which this is false are used to
+ /// define token boundaries and are not included in tokens.
+ /// </summary>
+ protected internal abstract bool IsTokenChar(char c);
+
+ /// <summary>Called on each token character to normalize it before it is added to the
+ /// token. The default implementation does nothing. Subclasses may use this
+ /// to, e.g., lowercase tokens.
+ /// </summary>
+ protected internal virtual char Normalize(char c)
+ {
+ return c;
+ }
+
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = termAtt.TermBuffer();
+ while (true)
+ {
+
+ if (bufferIndex >= dataLen)
+ {
+ offset += dataLen;
+ dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+ if (dataLen <= 0)
+ {
+ dataLen = 0; // so next offset += dataLen won't decrement offset
+ if (length > 0)
+ break;
+ return false;
+ }
+ bufferIndex = 0;
+ }
+
+ char c = ioBuffer[bufferIndex++];
+
+ if (IsTokenChar(c))
+ {
+ // if it's a token char
+
+ if (length == 0)
+ // start of token
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.Length)
+ buffer = termAtt.ResizeTermBuffer(1 + length);
+
+ buffer[length++] = Normalize(c); // buffer it, normalized
+
+ if (length == MAX_WORD_LEN)
+ // buffer overflow!
+ break;
+ }
+ else if (length > 0)
+ // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.SetTermLength(length);
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+ return true;
+ }
+
+ public override void End()
+ {
+ // set final offset
+ int finalOffset = CorrectOffset(offset);
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset(System.IO.TextReader input)
+ {
+ base.Reset(input);
+ bufferIndex = 0;
+ offset = 0;
+ dataLen = 0;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/ISOLatin1AccentFilter.cs b/src/core/Analysis/ISOLatin1AccentFilter.cs
new file mode 100644
index 0000000..5fd839e
--- /dev/null
+++ b/src/core/Analysis/ISOLatin1AccentFilter.cs
@@ -0,0 +1,344 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> A filter that replaces accented characters in the ISO Latin 1 character set
+ /// (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
+ /// <p/>
+ /// For instance, '&#192;' will be replaced by 'a'.
+ /// <p/>
+ ///
+ /// </summary>
+ /// <deprecated> If you build a new index, use <see cref="ASCIIFoldingFilter"/>
+ /// which covers a superset of Latin 1.
+ /// This class is included for use with existing indexes and will be removed
+ /// in a future release (possible Lucene 4.0)
+ /// </deprecated>
+ [Obsolete("If you build a new index, use ASCIIFoldingFilter which covers a superset of Latin 1. This class is included for use with existing indexes and will be removed in a future release (possible Lucene 4.0).")]
+ public class ISOLatin1AccentFilter : TokenFilter
+ {
+ public ISOLatin1AccentFilter(TokenStream input):base(input)
+ {
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ private char[] output = new char[256];
+ private int outputPos;
+ private readonly ITermAttribute termAtt;
+
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ char[] buffer = termAtt.TermBuffer();
+ int length = termAtt.TermLength();
+ // If no characters actually require rewriting then we
+ // just return token as-is:
+ for (int i = 0; i < length; i++)
+ {
+ char c = buffer[i];
+ if (c >= '\u00c0' && c <= '\uFB06')
+ {
+ RemoveAccents(buffer, length);
+ termAtt.SetTermBuffer(output, 0, outputPos);
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+
+ /// <summary> To replace accented characters in a String by unaccented equivalents.</summary>
+ public void RemoveAccents(char[] input, int length)
+ {
+
+ // Worst-case length required:
+ int maxSizeNeeded = 2 * length;
+
+ int size = output.Length;
+ while (size < maxSizeNeeded)
+ size *= 2;
+
+ if (size != output.Length)
+ output = new char[size];
+
+ outputPos = 0;
+
+ int pos = 0;
+
+ for (int i = 0; i < length; i++, pos++)
+ {
+ char c = input[pos];
+
+ // Quick test: if it's not in range then just keep
+ // current character
+ if (c < '\u00c0' || c > '\uFB06')
+ output[outputPos++] = c;
+ else
+ {
+ switch (c)
+ {
+
+ case '\u00C0':
+ // À
+ case '\u00C1':
+ // Á
+ case '\u00C2':
+ // Â
+ case '\u00C3':
+ // Ã
+ case '\u00C4':
+ // Ä
+ case '\u00C5': // Å
+ output[outputPos++] = 'A';
+ break;
+
+ case '\u00C6': // Æ
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00C7': // Ç
+ output[outputPos++] = 'C';
+ break;
+
+ case '\u00C8':
+ // È
+ case '\u00C9':
+ // É
+ case '\u00CA':
+ // Ê
+ case '\u00CB': // Ë
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00CC':
+ // Ì
+ case '\u00CD':
+ // Í
+ case '\u00CE':
+ // Î
+ case '\u00CF': // Ï
+ output[outputPos++] = 'I';
+ break;
+
+ case '\u0132': // IJ
+ output[outputPos++] = 'I';
+ output[outputPos++] = 'J';
+ break;
+
+ case '\u00D0': // Ð
+ output[outputPos++] = 'D';
+ break;
+
+ case '\u00D1': // Ñ
+ output[outputPos++] = 'N';
+ break;
+
+ case '\u00D2':
+ // Ò
+ case '\u00D3':
+ // Ó
+ case '\u00D4':
+ // Ô
+ case '\u00D5':
+ // Õ
+ case '\u00D6':
+ // Ö
+ case '\u00D8': // Ø
+ output[outputPos++] = 'O';
+ break;
+
+ case '\u0152': // Œ
+ output[outputPos++] = 'O';
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00DE': // Þ
+ output[outputPos++] = 'T';
+ output[outputPos++] = 'H';
+ break;
+
+ case '\u00D9':
+ // Ù
+ case '\u00DA':
+ // Ú
+ case '\u00DB':
+ // Û
+ case '\u00DC': // Ü
+ output[outputPos++] = 'U';
+ break;
+
+ case '\u00DD':
+ // Ý
+ case '\u0178': // Ÿ
+ output[outputPos++] = 'Y';
+ break;
+
+ case '\u00E0':
+ // à
+ case '\u00E1':
+ // á
+ case '\u00E2':
+ // â
+ case '\u00E3':
+ // ã
+ case '\u00E4':
+ // ä
+ case '\u00E5': // å
+ output[outputPos++] = 'a';
+ break;
+
+ case '\u00E6': // æ
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u00E7': // ç
+ output[outputPos++] = 'c';
+ break;
+
+ case '\u00E8':
+ // è
+ case '\u00E9':
+ // é
+ case '\u00EA':
+ // ê
+ case '\u00EB': // ë
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u00EC':
+ // ì
+ case '\u00ED':
+ // í
+ case '\u00EE':
+ // î
+ case '\u00EF': // ï
+ output[outputPos++] = 'i';
+ break;
+
+ case '\u0133': // ij
+ output[outputPos++] = 'i';
+ output[outputPos++] = 'j';
+ break;
+
+ case '\u00F0': // ð
+ output[outputPos++] = 'd';
+ break;
+
+ case '\u00F1': // ñ
+ output[outputPos++] = 'n';
+ break;
+
+ case '\u00F2':
+ // ò
+ case '\u00F3':
+ // ó
+ case '\u00F4':
+ // ô
+ case '\u00F5':
+ // õ
+ case '\u00F6':
+ // ö
+ case '\u00F8': // ø
+ output[outputPos++] = 'o';
+ break;
+
+ case '\u0153': // œ
+ output[outputPos++] = 'o';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u00DF': // ß
+ output[outputPos++] = 's';
+ output[outputPos++] = 's';
+ break;
+
+ case '\u00FE': // þ
+ output[outputPos++] = 't';
+ output[outputPos++] = 'h';
+ break;
+
+ case '\u00F9':
+ // ù
+ case '\u00FA':
+ // ú
+ case '\u00FB':
+ // û
+ case '\u00FC': // ü
+ output[outputPos++] = 'u';
+ break;
+
+ case '\u00FD':
+ // ý
+ case '\u00FF': // ÿ
+ output[outputPos++] = 'y';
+ break;
+
+ case '\uFB00': // ff
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'f';
+ break;
+
+ case '\uFB01': // ﬁ
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'i';
+ break;
+
+ case '\uFB02': // fl
+ output[outputPos++] = 'f';
+ output[outputPos++] = 'l';
+ break;
+ // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
+ // case '\uFB03': // ffi
+ // output[outputPos++] = 'f';
+ // output[outputPos++] = 'f';
+ // output[outputPos++] = 'i';
+ // break;
+ // case '\uFB04': // ffl
+ // output[outputPos++] = 'f';
+ // output[outputPos++] = 'f';
+ // output[outputPos++] = 'l';
+ // break;
+
+ case '\uFB05': // ﬅ (long s + t ligature)
+ output[outputPos++] = 'f';
+ output[outputPos++] = 't';
+ break;
+
+ case '\uFB06': // st
+ output[outputPos++] = 's';
+ output[outputPos++] = 't';
+ break;
+
+ default:
+ output[outputPos++] = c;
+ break;
+
+ }
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/KeywordAnalyzer.cs b/src/core/Analysis/KeywordAnalyzer.cs
new file mode 100644
index 0000000..116babb
--- /dev/null
+++ b/src/core/Analysis/KeywordAnalyzer.cs
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> "Tokenizes" the entire stream as a single token. This is useful
+ /// for data like zip codes, ids, and some product names.
+ /// </summary>
+ public class KeywordAnalyzer:Analyzer
+ {
+ public KeywordAnalyzer()
+ {
+ SetOverridesTokenStreamMethod<KeywordAnalyzer>();
+ }
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new KeywordTokenizer(reader);
+ }
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ if (overridesTokenStreamMethod)
+ {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return TokenStream(fieldName, reader);
+ }
+ var tokenizer = (Tokenizer) PreviousTokenStream;
+ if (tokenizer == null)
+ {
+ tokenizer = new KeywordTokenizer(reader);
+ PreviousTokenStream = tokenizer;
+ }
+ else
+ tokenizer.Reset(reader);
+ return tokenizer;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/KeywordTokenizer.cs b/src/core/Analysis/KeywordTokenizer.cs
new file mode 100644
index 0000000..f97ff95
--- /dev/null
+++ b/src/core/Analysis/KeywordTokenizer.cs
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Emits the entire input as a single token.</summary>
+ public sealed class KeywordTokenizer:Tokenizer
+ {
+
+ private const int DEFAULT_BUFFER_SIZE = 256;
+
+ private bool done;
+ private int finalOffset;
+ private ITermAttribute termAtt;
+ private IOffsetAttribute offsetAtt;
+
+ public KeywordTokenizer(System.IO.TextReader input):this(input, DEFAULT_BUFFER_SIZE)
+ {
+ }
+
+ public KeywordTokenizer(System.IO.TextReader input, int bufferSize):base(input)
+ {
+ Init(bufferSize);
+ }
+
+ public KeywordTokenizer(AttributeSource source, System.IO.TextReader input, int bufferSize):base(source, input)
+ {
+ Init(bufferSize);
+ }
+
+ public KeywordTokenizer(AttributeFactory factory, System.IO.TextReader input, int bufferSize):base(factory, input)
+ {
+ Init(bufferSize);
+ }
+
+ private void Init(int bufferSize)
+ {
+ this.done = false;
+ termAtt = AddAttribute<ITermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ termAtt.ResizeTermBuffer(bufferSize);
+ }
+
+ public override bool IncrementToken()
+ {
+ if (!done)
+ {
+ ClearAttributes();
+ done = true;
+ int upto = 0;
+ char[] buffer = termAtt.TermBuffer();
+ while (true)
+ {
+ int length = input.Read(buffer, upto, buffer.Length - upto);
+ if (length == 0)
+ break;
+ upto += length;
+ if (upto == buffer.Length)
+ buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
+ }
+ termAtt.SetTermLength(upto);
+ finalOffset = CorrectOffset(upto);
+ offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
+ return true;
+ }
+ return false;
+ }
+
+ public override void End()
+ {
+ // set final offset
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset(System.IO.TextReader input)
+ {
+ base.Reset(input);
+ this.done = false;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/LengthFilter.cs b/src/core/Analysis/LengthFilter.cs
new file mode 100644
index 0000000..c4f60ad
--- /dev/null
+++ b/src/core/Analysis/LengthFilter.cs
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>Removes words that are too long or too short from the stream.</summary>
+ public sealed class LengthFilter:TokenFilter
+ {
+
+ internal int min;
+ internal int max;
+
+ private readonly ITermAttribute termAtt;
+
+ /// <summary> Build a filter that removes words that are too long or too
+ /// short from the text.
+ /// </summary>
+ public LengthFilter(TokenStream in_Renamed, int min, int max)
+ : base(in_Renamed)
+ {
+ this.min = min;
+ this.max = max;
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ /// <summary> Returns the next input Token whose term() is the right len</summary>
+ public override bool IncrementToken()
+ {
+ // return the first non-stop word found
+ while (input.IncrementToken())
+ {
+ var len = termAtt.TermLength();
+ if (len >= min && len <= max)
+ {
+ return true;
+ }
+ // note: else we ignore it but should we index each part of it?
+ }
+ // reached EOS -- return false
+ return false;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/LetterTokenizer.cs b/src/core/Analysis/LetterTokenizer.cs
new file mode 100644
index 0000000..77629a8
--- /dev/null
+++ b/src/core/Analysis/LetterTokenizer.cs
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>A LetterTokenizer is a tokenizer that divides text at non-letters. That's
+ /// to say, it defines tokens as maximal strings of adjacent letters, as defined
+ /// by java.lang.Character.isLetter() predicate.
+ /// Note: this does a decent job for most European languages, but does a terrible
+ /// job for some Asian languages, where words are not separated by spaces.
+ /// </summary>
+
+ public class LetterTokenizer:CharTokenizer
+ {
+ /// <summary>Construct a new LetterTokenizer. </summary>
+ public LetterTokenizer(System.IO.TextReader @in):base(@in)
+ {
+ }
+
+ /// <summary>Construct a new LetterTokenizer using a given <see cref="AttributeSource" />. </summary>
+ public LetterTokenizer(AttributeSource source, System.IO.TextReader @in)
+ : base(source, @in)
+ {
+ }
+
+ /// <summary>Construct a new LetterTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. </summary>
+ public LetterTokenizer(AttributeFactory factory, System.IO.TextReader @in)
+ : base(factory, @in)
+ {
+ }
+
+ /// <summary>Collects only characters which satisfy
+ /// <see cref="char.IsLetter(char)" />.
+ /// </summary>
+ protected internal override bool IsTokenChar(char c)
+ {
+ return System.Char.IsLetter(c);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/LowerCaseFilter.cs b/src/core/Analysis/LowerCaseFilter.cs
new file mode 100644
index 0000000..cad0197
--- /dev/null
+++ b/src/core/Analysis/LowerCaseFilter.cs
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>Normalizes token text to lower case.</summary>
+ public sealed class LowerCaseFilter:TokenFilter
+ {
+ public LowerCaseFilter(TokenStream @in)
+ : base(@in)
+ {
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ private readonly ITermAttribute termAtt;
+
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+
+ char[] buffer = termAtt.TermBuffer();
+ int length = termAtt.TermLength();
+ for (int i = 0; i < length; i++)
+ buffer[i] = System.Char.ToLower(buffer[i]);
+
+ return true;
+ }
+ return false;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/LowerCaseTokenizer.cs b/src/core/Analysis/LowerCaseTokenizer.cs
new file mode 100644
index 0000000..4cea217
--- /dev/null
+++ b/src/core/Analysis/LowerCaseTokenizer.cs
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> LowerCaseTokenizer performs the function of LetterTokenizer
+ /// and LowerCaseFilter together. It divides text at non-letters and converts
+ /// them to lower case. While it is functionally equivalent to the combination
+ /// of LetterTokenizer and LowerCaseFilter, there is a performance advantage
+ /// to doing the two tasks at once, hence this (redundant) implementation.
+ /// <p/>
+ /// Note: this does a decent job for most European languages, but does a terrible
+ /// job for some Asian languages, where words are not separated by spaces.
+ /// </summary>
+ public sealed class LowerCaseTokenizer:LetterTokenizer
+ {
+ /// <summary>Construct a new LowerCaseTokenizer. </summary>
+ public LowerCaseTokenizer(System.IO.TextReader @in)
+ : base(@in)
+ {
+ }
+
+ /// <summary>Construct a new LowerCaseTokenizer using a given <see cref="AttributeSource" />. </summary>
+ public LowerCaseTokenizer(AttributeSource source, System.IO.TextReader @in)
+ : base(source, @in)
+ {
+ }
+
+ /// <summary>Construct a new LowerCaseTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. </summary>
+ public LowerCaseTokenizer(AttributeFactory factory, System.IO.TextReader @in)
+ : base(factory, @in)
+ {
+ }
+
+ /// <summary>Converts char to lower case
+ /// <see cref="char.ToLower(char)" />.
+ /// </summary>
+ protected internal override char Normalize(char c)
+ {
+ return System.Char.ToLower(c);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/MappingCharFilter.cs b/src/core/Analysis/MappingCharFilter.cs
new file mode 100644
index 0000000..9705719
--- /dev/null
+++ b/src/core/Analysis/MappingCharFilter.cs
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Simplistic <see cref="CharFilter" /> that applies the mappings
+ /// contained in a <see cref="NormalizeCharMap" /> to the character
+ /// stream, and correcting the resulting changes to the
+ /// offsets.
+ /// </summary>
+ public class MappingCharFilter : BaseCharFilter
+ {
+ private readonly NormalizeCharMap normMap;
+ private LinkedList<char> buffer;
+ private System.String replacement;
+ private int charPointer;
+ private int nextCharCounter;
+
+ /// Default constructor that takes a <see cref="CharStream" />.
+ public MappingCharFilter(NormalizeCharMap normMap, CharStream @in)
+ : base(@in)
+ {
+ this.normMap = normMap;
+ }
+
+ /// Easy-use constructor that takes a <see cref="System.IO.TextReader" />.
+ public MappingCharFilter(NormalizeCharMap normMap, System.IO.TextReader @in)
+ : base(CharReader.Get(@in))
+ {
+ this.normMap = normMap;
+ }
+
+ public override int Read()
+ {
+ while (true)
+ {
+ if (replacement != null && charPointer < replacement.Length)
+ {
+ return replacement[charPointer++];
+ }
+
+ int firstChar = NextChar();
+ if (firstChar == - 1)
+ return - 1;
+ NormalizeCharMap nm = normMap.submap != null
+ ? normMap.submap[(char) firstChar]
+ : null;
+ if (nm == null)
+ return firstChar;
+ NormalizeCharMap result = Match(nm);
+ if (result == null)
+ return firstChar;
+ replacement = result.normStr;
+ charPointer = 0;
+ if (result.diff != 0)
+ {
+ int prevCumulativeDiff = LastCumulativeDiff;
+ if (result.diff < 0)
+ {
+ for (int i = 0; i < - result.diff; i++)
+ AddOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
+ }
+ else
+ {
+ AddOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
+ }
+ }
+ }
+ }
+
+ private int NextChar()
+ {
+ nextCharCounter++;
+ if (buffer != null && buffer.Count != 0)
+ {
+ char tempObject = buffer.First.Value;
+ buffer.RemoveFirst();
+ return (tempObject);
+ }
+ return input.Read();
+ }
+
+ private void PushChar(int c)
+ {
+ nextCharCounter--;
+ if (buffer == null)
+ {
+ buffer = new LinkedList<char>();
+ }
+ buffer.AddFirst((char)c);
+ }
+
+ private void PushLastChar(int c)
+ {
+ if (buffer == null)
+ {
+ buffer = new LinkedList<char>();
+ }
+ buffer.AddLast((char)c);
+ }
+
+ private NormalizeCharMap Match(NormalizeCharMap map)
+ {
+ NormalizeCharMap result = null;
+ if (map.submap != null)
+ {
+ int chr = NextChar();
+ if (chr != - 1)
+ {
+ NormalizeCharMap subMap = map.submap[(char)chr];
+ if (subMap != null)
+ {
+ result = Match(subMap);
+ }
+ if (result == null)
+ {
+ PushChar(chr);
+ }
+ }
+ }
+ if (result == null && map.normStr != null)
+ {
+ result = map;
+ }
+ return result;
+ }
+
+ public override int Read(System.Char[] cbuf, int off, int len)
+ {
+ var tmp = new char[len];
+ int l = input.Read(tmp, 0, len);
+ if (l != 0)
+ {
+ for (int i = 0; i < l; i++)
+ PushLastChar(tmp[i]);
+ }
+ l = 0;
+ for (int i = off; i < off + len; i++)
+ {
+ int c = Read();
+ if (c == - 1)
+ break;
+ cbuf[i] = (char) c;
+ l++;
+ }
+ return l == 0?- 1:l;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/NormalizeCharMap.cs b/src/core/Analysis/NormalizeCharMap.cs
new file mode 100644
index 0000000..7fd520c
--- /dev/null
+++ b/src/core/Analysis/NormalizeCharMap.cs
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Support;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Holds a map of String input to String output, to be used
+ /// with <see cref="MappingCharFilter" />.
+ /// </summary>
+ public class NormalizeCharMap
+ {
+ internal System.Collections.Generic.IDictionary<char, NormalizeCharMap> submap;
+ internal System.String normStr;
+ internal int diff;
+
+ /// <summary>Records a replacement to be applied to the inputs
+ /// stream. Whenever <c>singleMatch</c> occurs in
+ /// the input, it will be replaced with
+ /// <c>replacement</c>.
+ ///
+ /// </summary>
+ /// <param name="singleMatch">input String to be replaced
+ /// </param>
+ /// <param name="replacement">output String
+ /// </param>
+ public virtual void Add(System.String singleMatch, System.String replacement)
+ {
+ NormalizeCharMap currMap = this;
+ for (var i = 0; i < singleMatch.Length; i++)
+ {
+ char c = singleMatch[i];
+ if (currMap.submap == null)
+ {
+ currMap.submap = new HashMap<char, NormalizeCharMap>(1);
+ }
+ var map = currMap.submap[c];
+ if (map == null)
+ {
+ map = new NormalizeCharMap();
+ currMap.submap[c] = map;
+ }
+ currMap = map;
+ }
+ if (currMap.normStr != null)
+ {
+ throw new System.SystemException("MappingCharFilter: there is already a mapping for " + singleMatch);
+ }
+ currMap.normStr = replacement;
+ currMap.diff = singleMatch.Length - replacement.Length;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/NumericTokenStream.cs b/src/core/Analysis/NumericTokenStream.cs
new file mode 100644
index 0000000..90b6e72
--- /dev/null
+++ b/src/core/Analysis/NumericTokenStream.cs
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Search;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+using NumericUtils = Lucene.Net.Util.NumericUtils;
+using NumericField = Lucene.Net.Documents.NumericField;
+// javadocs
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> <b>Expert:</b> This class provides a <see cref="TokenStream" />
+ /// for indexing numeric values that can be used by <see cref="NumericRangeQuery{T}" />
+ /// or <see cref="NumericRangeFilter{T}" />.
+ ///
+ /// <p/>Note that for simple usage, <see cref="NumericField" /> is
+ /// recommended. <see cref="NumericField" /> disables norms and
+ /// term freqs, as they are not usually needed during
+ /// searching. If you need to change these settings, you
+ /// should use this class.
+ ///
+ /// <p/>See <see cref="NumericField" /> for capabilities of fields
+ /// indexed numerically.<p/>
+ ///
+ /// <p/>Here's an example usage, for an <c>int</c> field:
+ ///
+ /// <code>
+ /// Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value));
+ /// field.setOmitNorms(true);
+ /// field.setOmitTermFreqAndPositions(true);
+ /// document.add(field);
+ /// </code>
+ ///
+ /// <p/>For optimal performance, re-use the TokenStream and Field instance
+ /// for more than one document:
+ ///
+ /// <code>
+ /// NumericTokenStream stream = new NumericTokenStream(precisionStep);
+ /// Field field = new Field(name, stream);
+ /// field.setOmitNorms(true);
+ /// field.setOmitTermFreqAndPositions(true);
+ /// Document document = new Document();
+ /// document.add(field);
+ ///
+ /// for(all documents) {
+ /// stream.setIntValue(value)
+ /// writer.addDocument(document);
+ /// }
+ /// </code>
+ ///
+ /// <p/>This stream is not intended to be used in analyzers;
+ /// it's more for iterating the different precisions during
+ /// indexing a specific numeric value.<p/>
+ ///
+ /// <p/><b>NOTE</b>: as token streams are only consumed once
+ /// the document is added to the index, if you index more
+ /// than one numeric field, use a separate <c>NumericTokenStream</c>
+ /// instance for each.<p/>
+ ///
+ /// <p/>See <see cref="NumericRangeQuery{T}" /> for more details on the
+ /// <a href="../search/NumericRangeQuery.html#precisionStepDesc"><c>precisionStep</c></a>
+ /// parameter as well as how numeric fields work under the hood.<p/>
+ ///
+ /// <p/><font color="red"><b>NOTE:</b> This API is experimental and
+ /// might change in incompatible ways in the next release.</font>
+ /// Since 2.9
+ /// </summary>
+ public sealed class NumericTokenStream : TokenStream
+ {
+ private void InitBlock()
+ {
+ termAtt = AddAttribute<ITermAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ }
+
+ /// <summary>The full precision token gets this token type assigned. </summary>
+ public const System.String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric";
+
+ /// <summary>The lower precision tokens gets this token type assigned. </summary>
+ public const System.String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric";
+
+ /// <summary> Creates a token stream for numeric values using the default <c>precisionStep</c>
+ /// <see cref="NumericUtils.PRECISION_STEP_DEFAULT" /> (4). The stream is not yet initialized,
+ /// before using set a value using the various set<em>???</em>Value() methods.
+ /// </summary>
+ public NumericTokenStream():this(NumericUtils.PRECISION_STEP_DEFAULT)
+ {
+ }
+
+ /// <summary> Creates a token stream for numeric values with the specified
+ /// <c>precisionStep</c>. The stream is not yet initialized,
+ /// before using set a value using the various set<em>???</em>Value() methods.
+ /// </summary>
+ public NumericTokenStream(int precisionStep):base()
+ {
+ InitBlock();
+ this.precisionStep = precisionStep;
+ if (precisionStep < 1)
+ throw new System.ArgumentException("precisionStep must be >=1");
+ }
+
+ /// <summary> Expert: Creates a token stream for numeric values with the specified
+ /// <c>precisionStep</c> using the given <see cref="AttributeSource" />.
+ /// The stream is not yet initialized,
+ /// before using set a value using the various set<em>???</em>Value() methods.
+ /// </summary>
+ public NumericTokenStream(AttributeSource source, int precisionStep):base(source)
+ {
+ InitBlock();
+ this.precisionStep = precisionStep;
+ if (precisionStep < 1)
+ throw new System.ArgumentException("precisionStep must be >=1");
+ }
+
+ /// <summary> Expert: Creates a token stream for numeric values with the specified
+ /// <c>precisionStep</c> using the given
+ /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />.
+ /// The stream is not yet initialized,
+ /// before using set a value using the various set<em>???</em>Value() methods.
+ /// </summary>
+ public NumericTokenStream(AttributeFactory factory, int precisionStep):base(factory)
+ {
+ InitBlock();
+ this.precisionStep = precisionStep;
+ if (precisionStep < 1)
+ throw new System.ArgumentException("precisionStep must be >=1");
+ }
+
+ /// <summary> Initializes the token stream with the supplied <c>long</c> value.</summary>
+ /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens.
+ /// </param>
+ /// <returns> this instance, because of this you can use it the following way:
+ /// <c>new Field(name, new NumericTokenStream(precisionStep).SetLongValue(value))</c>
+ /// </returns>
+ public NumericTokenStream SetLongValue(long value_Renamed)
+ {
+ this.value_Renamed = value_Renamed;
+ valSize = 64;
+ shift = 0;
+ return this;
+ }
+
+ /// <summary> Initializes the token stream with the supplied <c>int</c> value.</summary>
+ /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens.
+ /// </param>
+ /// <returns> this instance, because of this you can use it the following way:
+ /// <c>new Field(name, new NumericTokenStream(precisionStep).SetIntValue(value))</c>
+ /// </returns>
+ public NumericTokenStream SetIntValue(int value_Renamed)
+ {
+ this.value_Renamed = (long) value_Renamed;
+ valSize = 32;
+ shift = 0;
+ return this;
+ }
+
+ /// <summary> Initializes the token stream with the supplied <c>double</c> value.</summary>
+ /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens.
+ /// </param>
+ /// <returns> this instance, because of this you can use it the following way:
+ /// <c>new Field(name, new NumericTokenStream(precisionStep).SetDoubleValue(value))</c>
+ /// </returns>
+ public NumericTokenStream SetDoubleValue(double value_Renamed)
+ {
+ this.value_Renamed = NumericUtils.DoubleToSortableLong(value_Renamed);
+ valSize = 64;
+ shift = 0;
+ return this;
+ }
+
+ /// <summary> Initializes the token stream with the supplied <c>float</c> value.</summary>
+ /// <param name="value_Renamed">the value, for which this TokenStream should enumerate tokens.
+ /// </param>
+ /// <returns> this instance, because of this you can use it the following way:
+ /// <c>new Field(name, new NumericTokenStream(precisionStep).SetFloatValue(value))</c>
+ /// </returns>
+ public NumericTokenStream SetFloatValue(float value_Renamed)
+ {
+ this.value_Renamed = (long) NumericUtils.FloatToSortableInt(value_Renamed);
+ valSize = 32;
+ shift = 0;
+ return this;
+ }
+
+ // @Override
+ public override void Reset()
+ {
+ if (valSize == 0)
+ throw new System.SystemException("call set???Value() before usage");
+ shift = 0;
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ // Do nothing.
+ }
+
+ // @Override
+ public override bool IncrementToken()
+ {
+ if (valSize == 0)
+ throw new System.SystemException("call set???Value() before usage");
+ if (shift >= valSize)
+ return false;
+
+ ClearAttributes();
+ char[] buffer;
+ switch (valSize)
+ {
+
+ case 64:
+ buffer = termAtt.ResizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
+ termAtt.SetTermLength(NumericUtils.LongToPrefixCoded(value_Renamed, shift, buffer));
+ break;
+
+
+ case 32:
+ buffer = termAtt.ResizeTermBuffer(NumericUtils.BUF_SIZE_INT);
+ termAtt.SetTermLength(NumericUtils.IntToPrefixCoded((int) value_Renamed, shift, buffer));
+ break;
+
+
+ default:
+ // should not happen
+ throw new System.ArgumentException("valSize must be 32 or 64");
+
+ }
+
+ typeAtt.Type = (shift == 0)?TOKEN_TYPE_FULL_PREC:TOKEN_TYPE_LOWER_PREC;
+ posIncrAtt.PositionIncrement = (shift == 0)?1:0;
+ shift += precisionStep;
+ return true;
+ }
+
+ // @Override
+ public override System.String ToString()
+ {
+ System.Text.StringBuilder sb = new System.Text.StringBuilder("(numeric,valSize=").Append(valSize);
+ sb.Append(",precisionStep=").Append(precisionStep).Append(')');
+ return sb.ToString();
+ }
+
+ // members
+ private ITermAttribute termAtt;
+ private ITypeAttribute typeAtt;
+ private IPositionIncrementAttribute posIncrAtt;
+
+ private int shift = 0, valSize = 0; // valSize==0 means not initialized
+ private readonly int precisionStep;
+
+ private long value_Renamed = 0L;
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/PerFieldAnalyzerWrapper.cs b/src/core/Analysis/PerFieldAnalyzerWrapper.cs
new file mode 100644
index 0000000..b1c43aa
--- /dev/null
+++ b/src/core/Analysis/PerFieldAnalyzerWrapper.cs
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections.Generic;
+using Lucene.Net.Support;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> This analyzer is used to facilitate scenarios where different
+ /// fields require different analysis techniques. Use <see cref="AddAnalyzer" />
+ /// to add a non-default analyzer on a field name basis.
+ ///
+ /// <p/>Example usage:
+ ///
+ /// <code>
+ /// PerFieldAnalyzerWrapper aWrapper =
+ /// new PerFieldAnalyzerWrapper(new StandardAnalyzer());
+ /// aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
+ /// aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
+ /// </code>
+ ///
+ /// <p/>In this example, StandardAnalyzer will be used for all fields except "firstname"
+ /// and "lastname", for which KeywordAnalyzer will be used.
+ ///
+ /// <p/>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
+ /// and query parsing.
+ /// </summary>
+ public class PerFieldAnalyzerWrapper:Analyzer
+ {
+ private readonly Analyzer defaultAnalyzer;
+ private readonly IDictionary<string, Analyzer> analyzerMap = new HashMap<string, Analyzer>();
+
+
+ /// <summary> Constructs with default analyzer.
+ ///
+ /// </summary>
+ /// <param name="defaultAnalyzer">Any fields not specifically
+ /// defined to use a different analyzer will use the one provided here.
+ /// </param>
+ public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer)
+ : this(defaultAnalyzer, null)
+ {
+ }
+
+ /// <summary> Constructs with default analyzer and a map of analyzers to use for
+ /// specific fields.
+ ///
+ /// </summary>
+ /// <param name="defaultAnalyzer">Any fields not specifically
+ /// defined to use a different analyzer will use the one provided here.
+ /// </param>
+ /// <param name="fieldAnalyzers">a Map (String field name to the Analyzer) to be
+ /// used for those fields
+ /// </param>
+ public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer, IEnumerable<KeyValuePair<string, Analyzer>> fieldAnalyzers)
+ {
+ this.defaultAnalyzer = defaultAnalyzer;
+ if (fieldAnalyzers != null)
+ {
+ foreach(var entry in fieldAnalyzers)
+ analyzerMap[entry.Key] = entry.Value;
+ }
+ SetOverridesTokenStreamMethod<PerFieldAnalyzerWrapper>();
+ }
+
+
+ /// <summary> Defines an analyzer to use for the specified field.
+ ///
+ /// </summary>
+ /// <param name="fieldName">field name requiring a non-default analyzer
+ /// </param>
+ /// <param name="analyzer">non-default analyzer to use for field
+ /// </param>
+ public virtual void AddAnalyzer(System.String fieldName, Analyzer analyzer)
+ {
+ analyzerMap[fieldName] = analyzer;
+ }
+
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ var analyzer = analyzerMap[fieldName] ?? defaultAnalyzer;
+
+ return analyzer.TokenStream(fieldName, reader);
+ }
+
+ public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
+ {
+ if (overridesTokenStreamMethod)
+ {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return TokenStream(fieldName, reader);
+ }
+ var analyzer = analyzerMap[fieldName] ?? defaultAnalyzer;
+
+ return analyzer.ReusableTokenStream(fieldName, reader);
+ }
+
+ /// <summary>Return the positionIncrementGap from the analyzer assigned to fieldName </summary>
+ public override int GetPositionIncrementGap(string fieldName)
+ {
+ var analyzer = analyzerMap[fieldName] ?? defaultAnalyzer;
+ return analyzer.GetPositionIncrementGap(fieldName);
+ }
+
+ /// <summary> Return the offsetGap from the analyzer assigned to field </summary>
+ public override int GetOffsetGap(Documents.IFieldable field)
+ {
+ Analyzer analyzer = analyzerMap[field.Name] ?? defaultAnalyzer;
+ return analyzer.GetOffsetGap(field);
+ }
+
+ public override System.String ToString()
+ {
+ // {{Aroush-2.9}} will 'analyzerMap.ToString()' work in the same way as Java's java.util.HashMap.toString()?
+ return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")";
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/PorterStemFilter.cs b/src/core/Analysis/PorterStemFilter.cs
new file mode 100644
index 0000000..b7f1dbf
--- /dev/null
+++ b/src/core/Analysis/PorterStemFilter.cs
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>Transforms the token stream as per the Porter stemming algorithm.
+ /// Note: the input to the stemming filter must already be in lower case,
+ /// so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
+ /// down the Tokenizer chain in order for this to work properly!
+ /// <p/>
+ /// To use this filter with other analyzers, you'll want to write an
+ /// Analyzer class that sets up the TokenStream chain as you want it.
+ /// To use this with LowerCaseTokenizer, for example, you'd write an
+ /// analyzer like this:
+ /// <p/>
+ /// <code>
+ /// class MyAnalyzer extends Analyzer {
+ /// public final TokenStream tokenStream(String fieldName, Reader reader) {
+ /// return new PorterStemFilter(new LowerCaseTokenizer(reader));
+ /// }
+ /// }
+ /// </code>
+ /// </summary>
+ public sealed class PorterStemFilter:TokenFilter
+ {
+ private readonly PorterStemmer stemmer;
+ private readonly ITermAttribute termAtt;
+
+ public PorterStemFilter(TokenStream in_Renamed):base(in_Renamed)
+ {
+ stemmer = new PorterStemmer();
+ termAtt = AddAttribute<ITermAttribute>();
+ }
+
+ public override bool IncrementToken()
+ {
+ if (!input.IncrementToken())
+ return false;
+
+ if (stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength()))
+ termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength);
+ return true;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/PorterStemmer.cs b/src/core/Analysis/PorterStemmer.cs
new file mode 100644
index 0000000..f47c5a7
--- /dev/null
+++ b/src/core/Analysis/PorterStemmer.cs
@@ -0,0 +1,746 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+
+Porter stemmer in Java. The original paper is in
+
+Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+no. 3, pp 130-137,
+
+See also http://www.tartarus.org/~martin/PorterStemmer/index.html
+
+Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
+Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
+is then out outside the bounds of b.
+
+Similarly,
+
+Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
+'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
+b[j] is then outside the bounds of b.
+
+Release 3.
+
+[ This version is derived from Release 3, modified by Brian Goetz to
+optimize for fewer object creations. ]
+*/
+using System;
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>
+ /// Stemmer, implementing the Porter Stemming Algorithm
+ ///
+ /// The Stemmer class transforms a word into its root form. The input
+ /// word can be provided a character at time (by calling add()), or at once
+ /// by calling one of the various stem(something) methods.
+ /// </summary>
+
+ class PorterStemmer
+ {
+ private char[] b;
+ private int i, j, k, k0;
+ private bool dirty = false;
+ private const int INC = 50; /* unit of size whereby b is increased */
+ private const int EXTRA = 1;
+
+ public PorterStemmer()
+ {
+ b = new char[INC];
+ i = 0;
+ }
+
+ /// <summary> reset() resets the stemmer so it can stem another word. If you invoke
+ /// the stemmer by calling add(char) and then stem(), you must call reset()
+ /// before starting another word.
+ /// </summary>
+ public virtual void Reset()
+ {
+ i = 0; dirty = false;
+ }
+
+ /// <summary> Add a character to the word being stemmed. When you are finished
+ /// adding characters, you can call stem(void) to process the word.
+ /// </summary>
+ public virtual void Add(char ch)
+ {
+ if (b.Length <= i + EXTRA)
+ {
+ var new_b = new char[b.Length + INC];
+ Array.Copy(b, 0, new_b, 0, b.Length);
+ b = new_b;
+ }
+ b[i++] = ch;
+ }
+
+ /// <summary> After a word has been stemmed, it can be retrieved by toString(),
+ /// or a reference to the internal buffer can be retrieved by getResultBuffer
+ /// and getResultLength (which is generally more efficient.)
+ /// </summary>
+ public override System.String ToString()
+ {
+ return new System.String(b, 0, i);
+ }
+
+ /// <summary> Returns the length of the word resulting from the stemming process.</summary>
+ public virtual int ResultLength
+ {
+ get { return i; }
+ }
+
+ /// <summary> Returns a reference to a character buffer containing the results of
+ /// the stemming process. You also need to consult getResultLength()
+ /// to determine the length of the result.
+ /// </summary>
+ public virtual char[] ResultBuffer
+ {
+ get { return b; }
+ }
+
+ /* cons(i) is true <=> b[i] is a consonant. */
+
+ private bool Cons(int i)
+ {
+ switch (b[i])
+ {
+
+ case 'a':
+ case 'e':
+ case 'i':
+ case 'o':
+ case 'u':
+ return false;
+
+ case 'y':
+ return (i == k0)?true:!Cons(i - 1);
+
+ default:
+ return true;
+
+ }
+ }
+
+ /* m() measures the number of consonant sequences between k0 and j. if c is
+ a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+ presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+ */
+
+ private int M()
+ {
+ int n = 0;
+ int i = k0;
+ while (true)
+ {
+ if (i > j)
+ return n;
+ if (!Cons(i))
+ break;
+ i++;
+ }
+ i++;
+ while (true)
+ {
+ while (true)
+ {
+ if (i > j)
+ return n;
+ if (Cons(i))
+ break;
+ i++;
+ }
+ i++;
+ n++;
+ while (true)
+ {
+ if (i > j)
+ return n;
+ if (!Cons(i))
+ break;
+ i++;
+ }
+ i++;
+ }
+ }
+
+ /* vowelinstem() is true <=> k0,...j contains a vowel */
+
+ private bool Vowelinstem()
+ {
+ int i;
+ for (i = k0; i <= j; i++)
+ if (!Cons(i))
+ return true;
+ return false;
+ }
+
+ /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+
+ private bool Doublec(int j)
+ {
+ if (j < k0 + 1)
+ return false;
+ if (b[j] != b[j - 1])
+ return false;
+ return Cons(j);
+ }
+
+ /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+ restore an e at the end of a short word. e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+
+ */
+
+ private bool Cvc(int i)
+ {
+ if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
+ return false;
+ else
+ {
+ int ch = b[i];
+ if (ch == 'w' || ch == 'x' || ch == 'y')
+ return false;
+ }
+ return true;
+ }
+
+ private bool Ends(System.String s)
+ {
+ int l = s.Length;
+ int o = k - l + 1;
+ if (o < k0)
+ return false;
+ for (int i = 0; i < l; i++)
+ if (b[o + i] != s[i])
+ return false;
+ j = k - l;
+ return true;
+ }
+
+ /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+ k. */
+
+ internal virtual void Setto(System.String s)
+ {
+ int l = s.Length;
+ int o = j + 1;
+ for (int i = 0; i < l; i++)
+ b[o + i] = s[i];
+ k = j + l;
+ dirty = true;
+ }
+
+ /* r(s) is used further down. */
+
+ internal virtual void R(System.String s)
+ {
+ if (M() > 0)
+ Setto(s);
+ }
+
+ /* step1() gets rid of plurals and -ed or -ing. e.g.
+
+ caresses -> caress
+ ponies -> poni
+ ties -> ti
+ caress -> caress
+ cats -> cat
+
+ feed -> feed
+ agreed -> agree
+ disabled -> disable
+
+ matting -> mat
+ mating -> mate
+ meeting -> meet
+ milling -> mill
+ messing -> mess
+
+ meetings -> meet
+
+ */
+
+ private void Step1()
+ {
+ if (b[k] == 's')
+ {
+ if (Ends("sses"))
+ k -= 2;
+ else if (Ends("ies"))
+ Setto("i");
+ else if (b[k - 1] != 's')
+ k--;
+ }
+ if (Ends("eed"))
+ {
+ if (M() > 0)
+ k--;
+ }
+ else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
+ {
+ k = j;
+ if (Ends("at"))
+ Setto("ate");
+ else if (Ends("bl"))
+ Setto("ble");
+ else if (Ends("iz"))
+ Setto("ize");
+ else if (Doublec(k))
+ {
+ int ch = b[k--];
+ if (ch == 'l' || ch == 's' || ch == 'z')
+ k++;
+ }
+ else if (M() == 1 && Cvc(k))
+ Setto("e");
+ }
+ }
+
+ /* step2() turns terminal y to i when there is another vowel in the stem. */
+
+ private void Step2()
+ {
+ if (Ends("y") && Vowelinstem())
+ {
+ b[k] = 'i';
+ dirty = true;
+ }
+ }
+
+ /* step3() maps double suffices to single ones. so -ization ( = -ize plus
+ -ation) maps to -ize etc. note that the string before the suffix must give
+ m() > 0. */
+
+ private void Step3()
+ {
+ if (k == k0)
+ return ; /* For Bug 1 */
+ switch (b[k - 1])
+ {
+
+ case 'a':
+ if (Ends("ational"))
+ {
+ R("ate"); break;
+ }
+ if (Ends("tional"))
+ {
+ R("tion"); break;
+ }
+ break;
+
+ case 'c':
+ if (Ends("enci"))
+ {
+ R("ence"); break;
+ }
+ if (Ends("anci"))
+ {
+ R("ance"); break;
+ }
+ break;
+
+ case 'e':
+ if (Ends("izer"))
+ {
+ R("ize"); break;
+ }
+ break;
+
+ case 'l':
+ if (Ends("bli"))
+ {
+ R("ble"); break;
+ }
+ if (Ends("alli"))
+ {
+ R("al"); break;
+ }
+ if (Ends("entli"))
+ {
+ R("ent"); break;
+ }
+ if (Ends("eli"))
+ {
+ R("e"); break;
+ }
+ if (Ends("ousli"))
+ {
+ R("ous"); break;
+ }
+ break;
+
+ case 'o':
+ if (Ends("ization"))
+ {
+ R("ize"); break;
+ }
+ if (Ends("ation"))
+ {
+ R("ate"); break;
+ }
+ if (Ends("ator"))
+ {
+ R("ate"); break;
+ }
+ break;
+
+ case 's':
+ if (Ends("alism"))
+ {
+ R("al"); break;
+ }
+ if (Ends("iveness"))
+ {
+ R("ive"); break;
+ }
+ if (Ends("fulness"))
+ {
+ R("ful"); break;
+ }
+ if (Ends("ousness"))
+ {
+ R("ous"); break;
+ }
+ break;
+
+ case 't':
+ if (Ends("aliti"))
+ {
+ R("al"); break;
+ }
+ if (Ends("iviti"))
+ {
+ R("ive"); break;
+ }
+ if (Ends("biliti"))
+ {
+ R("ble"); break;
+ }
+ break;
+
+ case 'g':
+ if (Ends("logi"))
+ {
+ R("log"); break;
+ }
+ break;
+ }
+ }
+
+ /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
+
+ private void Step4()
+ {
+ switch (b[k])
+ {
+
+ case 'e':
+ if (Ends("icate"))
+ {
+ R("ic"); break;
+ }
+ if (Ends("ative"))
+ {
+ R(""); break;
+ }
+ if (Ends("alize"))
+ {
+ R("al"); break;
+ }
+ break;
+
+ case 'i':
+ if (Ends("iciti"))
+ {
+ R("ic"); break;
+ }
+ break;
+
+ case 'l':
+ if (Ends("ical"))
+ {
+ R("ic"); break;
+ }
+ if (Ends("ful"))
+ {
+ R(""); break;
+ }
+ break;
+
+ case 's':
+ if (Ends("ness"))
+ {
+ R(""); break;
+ }
+ break;
+ }
+ }
+
+ /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
+
+ private void Step5()
+ {
+ if (k == k0)
+ return ; /* for Bug 1 */
+ switch (b[k - 1])
+ {
+
+ case 'a':
+ if (Ends("al"))
+ break;
+ return ;
+
+ case 'c':
+ if (Ends("ance"))
+ break;
+ if (Ends("ence"))
+ break;
+ return ;
+
+ case 'e':
+ if (Ends("er"))
+ break; return ;
+
+ case 'i':
+ if (Ends("ic"))
+ break; return ;
+
+ case 'l':
+ if (Ends("able"))
+ break;
+ if (Ends("ible"))
+ break; return ;
+
+ case 'n':
+ if (Ends("ant"))
+ break;
+ if (Ends("ement"))
+ break;
+ if (Ends("ment"))
+ break;
+ /* element etc. not stripped before the m */
+ if (Ends("ent"))
+ break;
+ return ;
+
+ case 'o':
+ if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
+ break;
+ /* j >= 0 fixes Bug 2 */
+ if (Ends("ou"))
+ break;
+ return ;
+ /* takes care of -ous */
+
+ case 's':
+ if (Ends("ism"))
+ break;
+ return ;
+
+ case 't':
+ if (Ends("ate"))
+ break;
+ if (Ends("iti"))
+ break;
+ return ;
+
+ case 'u':
+ if (Ends("ous"))
+ break;
+ return ;
+
+ case 'v':
+ if (Ends("ive"))
+ break;
+ return ;
+
+ case 'z':
+ if (Ends("ize"))
+ break;
+ return ;
+
+ default:
+ return ;
+
+ }
+ if (M() > 1)
+ k = j;
+ }
+
+ /* step6() removes a final -e if m() > 1. */
+
+ private void Step6()
+ {
+ j = k;
+ if (b[k] == 'e')
+ {
+ int a = M();
+ if (a > 1 || a == 1 && !Cvc(k - 1))
+ k--;
+ }
+ if (b[k] == 'l' && Doublec(k) && M() > 1)
+ k--;
+ }
+
+
+ /// <summary> Stem a word provided as a String. Returns the result as a String.</summary>
+ public virtual System.String Stem(System.String s)
+ {
+ if (Stem(s.ToCharArray(), s.Length))
+ {
+ return ToString();
+ }
+ else
+ return s;
+ }
+
+ /// <summary>Stem a word contained in a char[]. Returns true if the stemming process
+ /// resulted in a word different from the input. You can retrieve the
+ /// result with getResultLength()/getResultBuffer() or toString().
+ /// </summary>
+ public virtual bool Stem(char[] word)
+ {
+ return Stem(word, word.Length);
+ }
+
+ /// <summary>Stem a word contained in a portion of a char[] array. Returns
+ /// true if the stemming process resulted in a word different from
+ /// the input. You can retrieve the result with
+ /// getResultLength()/getResultBuffer() or toString().
+ /// </summary>
+ public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
+ {
+ Reset();
+ if (b.Length < wordLen)
+ {
+ var new_b = new char[wordLen + EXTRA];
+ b = new_b;
+ }
+ Array.Copy(wordBuffer, offset, b, 0, wordLen);
+ i = wordLen;
+ return Stem(0);
+ }
+
+ /// <summary>Stem a word contained in a leading portion of a char[] array.
+ /// Returns true if the stemming process resulted in a word different
+ /// from the input. You can retrieve the result with
+ /// getResultLength()/getResultBuffer() or toString().
+ /// </summary>
+ public virtual bool Stem(char[] word, int wordLen)
+ {
+ return Stem(word, 0, wordLen);
+ }
+
+ /// <summary>Stem the word placed into the Stemmer buffer through calls to add().
+ /// Returns true if the stemming process resulted in a word different
+ /// from the input. You can retrieve the result with
+ /// getResultLength()/getResultBuffer() or toString().
+ /// </summary>
+ public virtual bool Stem()
+ {
+ return Stem(0);
+ }
+
+ public virtual bool Stem(int i0)
+ {
+ k = i - 1;
+ k0 = i0;
+ if (k > k0 + 1)
+ {
+ Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
+ }
+ // Also, a word is considered dirty if we lopped off letters
+ // Thanks to Ifigenia Vairelles for pointing this out.
+ if (i != k + 1)
+ dirty = true;
+ i = k + 1;
+ return dirty;
+ }
+
+ /// <summary>Test program for demonstrating the Stemmer. It reads a file and
+ /// stems each word, writing the result to standard out.
+ /// Usage: Stemmer file-name
+ /// </summary>
+ [STAThread]
+ public static void Main(System.String[] args)
+ {
+ var s = new PorterStemmer();
+
+ for (int i = 0; i < args.Length; i++)
+ {
+ try
+ {
+ System.IO.Stream in_Renamed = new System.IO.FileStream(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read);
+ var buffer = new byte[1024];
+
+ int bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
+ int offset = 0;
+ s.Reset();
+
+ while (true)
+ {
+ int ch;
+ if (offset < bufferLen)
+ ch = buffer[offset++];
+ else
+ {
+ bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
+ offset = 0;
+ if (bufferLen < 0)
+ ch = - 1;
+ else
+ ch = buffer[offset++];
+ }
+
+ if (Char.IsLetter((char) ch))
+ {
+ s.Add(Char.ToLower((char) ch));
+ }
+ else
+ {
+ s.Stem();
+ Console.Out.Write(s.ToString());
+ s.Reset();
+ if (ch < 0)
+ break;
+ else
+ {
+ System.Console.Out.Write((char) ch);
+ }
+ }
+ }
+
+ in_Renamed.Close();
+ }
+ catch (System.IO.IOException)
+ {
+ Console.Out.WriteLine("error reading " + args[i]);
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/SimpleAnalyzer.cs b/src/core/Analysis/SimpleAnalyzer.cs
new file mode 100644
index 0000000..b84f470
--- /dev/null
+++ b/src/core/Analysis/SimpleAnalyzer.cs
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>An <see cref="Analyzer" /> that filters <see cref="LetterTokenizer" />
+ /// with <see cref="LowerCaseFilter" />
+ /// </summary>
+
+ public sealed class SimpleAnalyzer : Analyzer
+ {
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new LowerCaseTokenizer(reader);
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ var tokenizer = (Tokenizer) PreviousTokenStream;
+ if (tokenizer == null)
+ {
+ tokenizer = new LowerCaseTokenizer(reader);
+ PreviousTokenStream = tokenizer;
+ }
+ else
+ tokenizer.Reset(reader);
+ return tokenizer;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Standard/StandardAnalyzer.cs b/src/core/Analysis/Standard/StandardAnalyzer.cs
new file mode 100644
index 0000000..347d026
--- /dev/null
+++ b/src/core/Analysis/Standard/StandardAnalyzer.cs
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+
+ /// <summary> Filters <see cref="StandardTokenizer" /> with <see cref="StandardFilter" />,
+ /// <see cref="LowerCaseFilter" /> and <see cref="StopFilter" />, using a list of English stop
+ /// words.
+ ///
+ /// <a name="version"/>
+ /// <p/>
+ /// You must specify the required <see cref="Version" /> compatibility when creating
+ /// StandardAnalyzer:
+ /// <list type="bullet">
+ /// <item>As of 2.9, StopFilter preserves position increments</item>
+ /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
+	/// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
+ /// </list>
+ /// </summary>
+ public class StandardAnalyzer : Analyzer
+ {
+ private ISet<string> stopSet;
+
+ /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
+ /// See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">https://issues.apache.org/jira/browse/LUCENE-1068</a>
+ /// </summary>
+ private bool replaceInvalidAcronym, enableStopPositionIncrements;
+
+ /// <summary>An unmodifiable set containing some common English words that are usually not
+ /// useful for searching.
+ /// </summary>
+ public static readonly ISet<string> STOP_WORDS_SET;
+ private Version matchVersion;
+
+ /// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET" />).
+ /// </summary>
+		/// <param name="matchVersion">Lucene version to match. See <see cref="Version">above</see></param>
+ public StandardAnalyzer(Version matchVersion)
+ : this(matchVersion, STOP_WORDS_SET)
+ { }
+
+ /// <summary>Builds an analyzer with the given stop words.</summary>
+		/// <param name="matchVersion">Lucene version to match. See <see cref="Version">above</see>
+ ///
+ /// </param>
+ /// <param name="stopWords">stop words
+ /// </param>
+ public StandardAnalyzer(Version matchVersion, ISet<string> stopWords)
+ {
+ stopSet = stopWords;
+ SetOverridesTokenStreamMethod<StandardAnalyzer>();
+ enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
+ this.matchVersion = matchVersion;
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given file.</summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)">
+ /// </seealso>
+		/// <param name="matchVersion">Lucene version to match. See <see cref="Version">above</see>
+ ///
+ /// </param>
+ /// <param name="stopwords">File to read stop words from
+ /// </param>
+ public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
+ : this (matchVersion, WordlistLoader.GetWordSet(stopwords))
+ {
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)">
+ /// </seealso>
+		/// <param name="matchVersion">Lucene version to match. See <see cref="Version">above</see>
+ ///
+ /// </param>
+ /// <param name="stopwords">Reader to read stop words from
+ /// </param>
+ public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
+ : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
+ { }
+
+ /// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />
+ ///, a <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />.
+ /// </summary>
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
+ tokenStream.MaxTokenLength = maxTokenLength;
+ TokenStream result = new StandardFilter(tokenStream);
+ result = new LowerCaseFilter(result);
+ result = new StopFilter(enableStopPositionIncrements, result, stopSet);
+ return result;
+ }
+
+ private sealed class SavedStreams
+ {
+ internal StandardTokenizer tokenStream;
+ internal TokenStream filteredTokenStream;
+ }
+
+ /// <summary>Default maximum allowed token length </summary>
+ public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary> Set maximum allowed token length. If a token is seen
+ /// that exceeds this length then it is discarded. This
+ /// setting only takes effect the next time tokenStream or
+ /// reusableTokenStream is called.
+ /// </summary>
+ public virtual int MaxTokenLength
+ {
+ get { return maxTokenLength; }
+ set { maxTokenLength = value; }
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ if (overridesTokenStreamMethod)
+ {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return TokenStream(fieldName, reader);
+ }
+ SavedStreams streams = (SavedStreams) PreviousTokenStream;
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ PreviousTokenStream = streams;
+ streams.tokenStream = new StandardTokenizer(matchVersion, reader);
+ streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
+ streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+ streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
+ streams.filteredTokenStream, stopSet);
+ }
+ else
+ {
+ streams.tokenStream.Reset(reader);
+ }
+ streams.tokenStream.MaxTokenLength = maxTokenLength;
+
+ streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);
+
+ return streams.filteredTokenStream;
+ }
+ static StandardAnalyzer()
+ {
+ STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Standard/StandardFilter.cs b/src/core/Analysis/Standard/StandardFilter.cs
new file mode 100644
index 0000000..fd13261
--- /dev/null
+++ b/src/core/Analysis/Standard/StandardFilter.cs
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Token = Lucene.Net.Analysis.Token;
+using TokenFilter = Lucene.Net.Analysis.TokenFilter;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+
+namespace Lucene.Net.Analysis.Standard
+{
+
+ /// <summary>Normalizes tokens extracted with <see cref="StandardTokenizer" />. </summary>
+
+ public sealed class StandardFilter:TokenFilter
+ {
+
+
+ /// <summary>Construct filtering <i>in</i>. </summary>
+ public StandardFilter(TokenStream in_Renamed):base(in_Renamed)
+ {
+ termAtt = AddAttribute<ITermAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+ }
+
+ private static readonly System.String APOSTROPHE_TYPE;
+ private static readonly System.String ACRONYM_TYPE;
+
+		// this filter uses the type attribute
+ private ITypeAttribute typeAtt;
+ private ITermAttribute termAtt;
+
+ /// <summary>Returns the next token in the stream, or null at EOS.
+ /// <p/>Removes <tt>'s</tt> from the end of words.
+ /// <p/>Removes dots from acronyms.
+ /// </summary>
+ public override bool IncrementToken()
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+
+ char[] buffer = termAtt.TermBuffer();
+ int bufferLength = termAtt.TermLength();
+ System.String type = typeAtt.Type;
+
+ if ((System.Object) type == (System.Object) APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
+ {
+ // Strip last 2 characters off
+ termAtt.SetTermLength(bufferLength - 2);
+ }
+ else if ((System.Object) type == (System.Object) ACRONYM_TYPE)
+ {
+ // remove dots
+ int upto = 0;
+ for (int i = 0; i < bufferLength; i++)
+ {
+ char c = buffer[i];
+ if (c != '.')
+ buffer[upto++] = c;
+ }
+ termAtt.SetTermLength(upto);
+ }
+
+ return true;
+ }
+ static StandardFilter()
+ {
+ APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
+ ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Standard/StandardTokenizer.cs b/src/core/Analysis/Standard/StandardTokenizer.cs
new file mode 100644
index 0000000..dca409d
--- /dev/null
+++ b/src/core/Analysis/Standard/StandardTokenizer.cs
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using CharReader = Lucene.Net.Analysis.CharReader;
+using Token = Lucene.Net.Analysis.Token;
+using Tokenizer = Lucene.Net.Analysis.Tokenizer;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+
+ /// <summary>A grammar-based tokenizer constructed with JFlex
+ ///
+ /// <p/> This should be a good tokenizer for most European-language documents:
+ ///
+ /// <list type="bullet">
+ /// <item>Splits words at punctuation characters, removing punctuation. However, a
+ /// dot that's not followed by whitespace is considered part of a token.</item>
+ /// <item>Splits words at hyphens, unless there's a number in the token, in which case
+ /// the whole token is interpreted as a product number and is not split.</item>
+ /// <item>Recognizes email addresses and internet hostnames as one token.</item>
+ /// </list>
+ ///
+ /// <p/>Many applications have specific tokenizer needs. If this tokenizer does
+ /// not suit your application, please consider copying this source code
+ /// directory to your project and maintaining your own grammar-based tokenizer.
+ ///
+ /// <a name="version"/>
+ /// <p/>
+ /// You must specify the required <see cref="Version" /> compatibility when creating
+ /// StandardAnalyzer:
+ /// <list type="bullet">
+ /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
+	/// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
+ /// </list>
+ /// </summary>
+
+ public sealed class StandardTokenizer:Tokenizer
+ {
+ private void InitBlock()
+ {
+ maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+ }
+ /// <summary>A private instance of the JFlex-constructed scanner </summary>
+ private StandardTokenizerImpl scanner;
+
+ public const int ALPHANUM = 0;
+ public const int APOSTROPHE = 1;
+ public const int ACRONYM = 2;
+ public const int COMPANY = 3;
+ public const int EMAIL = 4;
+ public const int HOST = 5;
+ public const int NUM = 6;
+ public const int CJ = 7;
+
+ /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
+ /// as ACRONYMs.
+ /// </deprecated>
+ [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
+ public const int ACRONYM_DEP = 8;
+
+ /// <summary>String token types that correspond to token type int constants </summary>
+ public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
+
+ private bool replaceInvalidAcronym;
+
+ private int maxTokenLength;
+
+ /// <summary>Set the max allowed token length. Any token longer
+ /// than this is skipped.
+ /// </summary>
+ public int MaxTokenLength
+ {
+ get { return maxTokenLength; }
+ set { this.maxTokenLength = value; }
+ }
+
+ /// <summary> Creates a new instance of the
+ /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" />. Attaches
+ /// the <c>input</c> to the newly created JFlex scanner.
+ ///
+ /// </summary>
+ /// <param name="matchVersion"></param>
+ /// <param name="input">The input reader
+ ///
+ /// See http://issues.apache.org/jira/browse/LUCENE-1068
+ /// </param>
+ public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
+ {
+ InitBlock();
+ this.scanner = new StandardTokenizerImpl(input);
+ Init(input, matchVersion);
+ }
+
+ /// <summary> Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.</summary>
+ public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
+ {
+ InitBlock();
+ this.scanner = new StandardTokenizerImpl(input);
+ Init(input, matchVersion);
+ }
+
+ /// <summary> Creates a new StandardTokenizer with a given
+ /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />
+ /// </summary>
+ public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
+ {
+ InitBlock();
+ this.scanner = new StandardTokenizerImpl(input);
+ Init(input, matchVersion);
+ }
+
+ private void Init(System.IO.TextReader input, Version matchVersion)
+ {
+ if (matchVersion.OnOrAfter(Version.LUCENE_24))
+ {
+ replaceInvalidAcronym = true;
+ }
+ else
+ {
+ replaceInvalidAcronym = false;
+ }
+ this.input = input;
+ termAtt = AddAttribute<ITermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+ }
+
+ // this tokenizer generates three attributes:
+ // offset, positionIncrement and type
+ private ITermAttribute termAtt;
+ private IOffsetAttribute offsetAtt;
+ private IPositionIncrementAttribute posIncrAtt;
+ private ITypeAttribute typeAtt;
+
+ ///<summary>
+ /// (non-Javadoc)
+ /// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
+ ///</summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ int posIncr = 1;
+
+ while (true)
+ {
+ int tokenType = scanner.GetNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF)
+ {
+ return false;
+ }
+
+ if (scanner.Yylength() <= maxTokenLength)
+ {
+ posIncrAtt.PositionIncrement = posIncr;
+ scanner.GetText(termAtt);
+ int start = scanner.Yychar();
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
+ // This 'if' should be removed in the next release. For now, it converts
+ // invalid acronyms to HOST. When removed, only the 'else' part should
+ // remain.
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
+ {
+ if (replaceInvalidAcronym)
+ {
+ typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
+ termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
+ }
+ else
+ {
+ typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+ }
+ }
+ else
+ {
+ typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
+ }
+ return true;
+ }
+ // When we skip a too-long term, we still increment the
+ // position increment
+ else
+ posIncr++;
+ }
+ }
+
+ public override void End()
+ {
+ // set final offset
+ int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset(System.IO.TextReader reader)
+ {
+ base.Reset(reader);
+ scanner.Reset(reader);
+ }
+
+ /// <summary>
+ /// Remove in 3.X and make true the only valid value
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ /// </summary>
+ /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
+ /// </param>
+ [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
+ public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Standard/StandardTokenizerImpl.cs b/src/core/Analysis/Standard/StandardTokenizerImpl.cs
new file mode 100644
index 0000000..cb4bf5f
--- /dev/null
+++ b/src/core/Analysis/Standard/StandardTokenizerImpl.cs
@@ -0,0 +1,707 @@
+/* The following code was generated by JFlex 1.4.1 on 9/4/08 6:49 PM */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate
+ the tokenizer, only use Java 1.4 !!!
+ This grammar currently uses constructs (eg :digit:, :letter:) whose
+ meaning can vary according to the JRE used to run jflex. See
+ https://issues.apache.org/jira/browse/LUCENE-1126 for details.
+ For current backwards compatibility it is needed to support
+ only Java 1.4 - this will change in Lucene 3.1.
+*/
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Token = Lucene.Net.Analysis.Token;
+
+namespace Lucene.Net.Analysis.Standard
+{
+
+
+ /// <summary> This class is a scanner generated by
+ /// <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ /// on 9/4/08 6:49 PM from the specification file
+ /// <tt>/tango/mike/src/lucene.standarddigit/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
+ /// </summary>
+ class StandardTokenizerImpl
+ {
+
+ /// <summary>This character denotes the end of file </summary>
+ public const int YYEOF = - 1;
+
+ /// <summary>initial size of the lookahead buffer </summary>
+ private const int ZZ_BUFFERSIZE = 16384;
+
+ /// <summary>lexical states </summary>
+ public const int YYINITIAL = 0;
+
+ /// <summary> Translates characters to character classes</summary>
+ private const System.String ZZ_CMAP_PACKED = "\x0009\x0000\x0001\x0000\x0001\x000D\x0001\x0000\x0001\x0000\x0001\x000C\x0012\x0000\x0001\x0000\x0005\x0000\x0001\x0005" + "\x0001\x0003\x0004\x0000\x0001\x0009\x0001\x0007\x0001\x0004\x0001\x0009\x000A\x0002\x0006\x0000\x0001\x0006\x001A\x000A" + "\x0004\x0000\x0001\x0008\x0001\x0000\x001A\x000A\x002F\x0000\x0001\x000A\x000A\x0000\x0001\x000A\x0004\x0000\x0001\x000A" + "\x0005\x0000\x0017\x000A\x0001\x0000\x001F\x000A\x0001\x0000\u0128\x000A\x0002\x0000\x0012\x000A\x001C\x0000\x005E\x000A" + "\x0002\x0000\x0009\x000A\x0002\x0000\x0007\x000A\x000E\x0000\x0002\x000A\x000E\x0000\x0005\x000A\x0009\x0000\x0001\x000A" + "\x008B\x0000\x0001\x000A\x000B\x0000\x0001\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0014\x000A" + "\x0001\x0000\x002C\x000A\x0001\x0000\x0008\x000A\x0002\x0000\x001A\x000A\x000C\x0000\x0082\x000A\x000A\x0000\x0039\x000A" + "\x0002\x0000\x0002\x000A\x0002\x0000\x0002\x000A\x0003\x0000\x0026\x000A\x0002\x0000\x0002\x000A\x0037\x0000\x0026\x000A" + "\x0002\x0000\x0001\x000A\x0007\x0000\x0027\x000A\x0048\x0000\x001B\x000A\x0005\x0000\x0003\x000A\x002E\x0000\x001A\x000A" + "\x0005\x0000\x000B\x000A\x0015\x0000\x000A\x0002\x0007\x0000\x0063\x000A\x0001\x0000\x0001\x000A\x000F\x0000\x0002\x000A" + "\x0009\x0000\x000A\x0002\x0003\x000A\x0013\x0000\x0001\x000A\x0001\x0000\x001B\x000A\x0053\x0000\x0026\x000A\u015f\x0000" + "\x0035\x000A\x0003\x0000\x0001\x000A\x0012\x0000\x0001\x000A\x0007\x0000\x000A\x000A\x0004\x0000\x000A\x0002\x0015\x0000" + "\x0008\x000A\x0002\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0001\x000A\x0003\x0000" + "\x0004\x000A\x0022\x0000\x0002\x000A\x0001\x0000\x0003\x000A\x0004\x0000\x000A\x0002\x0002\x000A\x0013\x0000\x0006\x000A" + "\x0004\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0002\x000A\x0001\x0000\x0002\x000A" +
+ "\x0001\x0000\x0002\x000A\x001F\x0000\x0004\x000A\x0001\x0000\x0001\x000A\x0007\x0000\x000A\x0002\x0002\x0000\x0003\x000A" + "\x0010\x0000\x0007\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0016\x000A\x0001\x0000\x0007\x000A" + "\x0001\x0000\x0002\x000A\x0001\x0000\x0005\x000A\x0003\x0000\x0001\x000A\x0012\x0000\x0001\x000A\x000F\x0000\x0001\x000A" + "\x0005\x0000\x000A\x0002\x0015\x0000\x0008\x000A\x0002\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A" + "\x0001\x0000\x0002\x000A\x0002\x0000\x0004\x000A\x0003\x0000\x0001\x000A\x001E\x0000\x0002\x000A\x0001\x0000\x0003\x000A" + "\x0004\x0000\x000A\x0002\x0015\x0000\x0006\x000A\x0003\x0000\x0003\x000A\x0001\x0000\x0004\x000A\x0003\x0000\x0002\x000A" + "\x0001\x0000\x0001\x000A\x0001\x0000\x0002\x000A\x0003\x0000\x0002\x000A\x0003\x0000\x0003\x000A\x0003\x0000\x0008\x000A" + "\x0001\x0000\x0003\x000A\x002D\x0000\x0009\x0002\x0015\x0000\x0008\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A" + "\x0001\x0000\x000A\x000A\x0001\x0000\x0005\x000A\x0026\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0008\x000A" + "\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A\x0001\x0000\x000A\x000A\x0001\x0000\x0005\x000A\x0024\x0000\x0001\x000A" + "\x0001\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0008\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A" + "\x0001\x0000\x0010\x000A\x0026\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0012\x000A\x0003\x0000\x0018\x000A" + "\x0001\x0000\x0009\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0007\x000A\x0039\x0000\x0001\x0001\x0030\x000A\x0001\x0001" + "\x0002\x000A\x000C\x0001\x0007\x000A\x0009\x0001\x000A\x0002\x0027\x0000\x0002\x000A\x0001\x0000\x0001\x000A\x0002\x0000" + "\x0002\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0001\x000A\x0006\x0000\x0004\x000A\x0001\x0000\x0007\x000A\x0001\x0000" + "\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0002\x000A\x0001\x0000\x0004\x000A\x0001\x0000" 
+
+ "\x0002\x000A\x0009\x0000\x0001\x000A\x0002\x0000\x0005\x000A\x0001\x0000\x0001\x000A\x0009\x0000\x000A\x0002\x0002\x0000" + "\x0002\x000A\x0022\x0000\x0001\x000A\x001F\x0000\x000A\x0002\x0016\x0000\x0008\x000A\x0001\x0000\x0022\x000A\x001D\x0000" + "\x0004\x000A\x0074\x0000\x0022\x000A\x0001\x0000\x0005\x000A\x0001\x0000\x0002\x000A\x0015\x0000\x000A\x0002\x0006\x0000" + "\x0006\x000A\x004A\x0000\x0026\x000A\x000A\x0000\x0027\x000A\x0009\x0000\x005A\x000A\x0005\x0000\x0044\x000A\x0005\x0000" + "\x0052\x000A\x0006\x0000\x0007\x000A\x0001\x0000\x003F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000" + "\x0007\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0027\x000A\x0001\x0000\x0001\x000A\x0001\x0000" + "\x0004\x000A\x0002\x0000\x001F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0017\x000A\x0001\x0000" + "\x001F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000\x0027\x000A\x0001\x0000" + "\x0013\x000A\x000E\x0000\x0009\x0002\x002E\x0000\x0055\x000A\x000C\x0000\u026c\x000A\x0002\x0000\x0008\x000A\x000A\x0000" + "\x001A\x000A\x0005\x0000\x004B\x000A\x0095\x0000\x0034\x000A\x002C\x0000\x000A\x0002\x0026\x0000\x000A\x0002\x0006\x0000" + "\x0058\x000A\x0008\x0000\x0029\x000A\u0557\x0000\x009C\x000A\x0004\x0000\x005A\x000A\x0006\x0000\x0016\x000A\x0002\x0000" + "\x0006\x000A\x0002\x0000\x0026\x000A\x0002\x0000\x0006\x000A\x0002\x0000\x0008\x000A\x0001\x0000\x0001\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x001F\x000A\x0002\x0000\x0035\x000A\x0001\x0000\x0007\x000A\x0001\x0000" + "\x0001\x000A\x0003\x0000\x0003\x000A\x0001\x0000\x0007\x000A\x0003\x0000\x0004\x000A\x0002\x0000\x0006\x000A\x0004\x0000" + "\x000D\x000A\x0005\x0000\x0003\x000A\x0001\x0000\x0007\x000A\x0082\x0000\x0001\x000A\x0082\x0000\x0001\x000A\x0004\x0000" 
+
+ "\x0001\x000A\x0002\x0000\x000A\x000A\x0001\x0000\x0001\x000A\x0003\x0000\x0005\x000A\x0006\x0000\x0001\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0007\x000A\u0ecb\x0000" + "\x0002\x000A\x002A\x0000\x0005\x000A\x000A\x0000\x0001\x000B\x0054\x000B\x0008\x000B\x0002\x000B\x0002\x000B\x005A\x000B" + "\x0001\x000B\x0003\x000B\x0006\x000B\x0028\x000B\x0003\x000B\x0001\x0000\x005E\x000A\x0011\x0000\x0018\x000A\x0038\x0000" + "\x0010\x000B\u0100\x0000\x0080\x000B\x0080\x0000\u19b6\x000B\x000A\x000B\x0040\x0000\u51a6\x000B\x005A\x000B\u048d\x000A" + "\u0773\x0000\u2ba4\x000A\u215c\x0000\u012e\x000B\x00D2\x000B\x0007\x000A\x000C\x0000\x0005\x000A\x0005\x0000\x0001\x000A" + "\x0001\x0000\x000A\x000A\x0001\x0000\x000D\x000A\x0001\x0000\x0005\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0002\x000A" + "\x0001\x0000\x0002\x000A\x0001\x0000\x006C\x000A\x0021\x0000\u016b\x000A\x0012\x0000\x0040\x000A\x0002\x0000\x0036\x000A" + "\x0028\x0000\x000C\x000A\x0074\x0000\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0087\x000A\x0013\x0000\x000A\x0002" + "\x0007\x0000\x001A\x000A\x0006\x0000\x001A\x000A\x000A\x0000\x0001\x000B\x003A\x000B\x001F\x000A\x0003\x0000\x0006\x000A" + "\x0002\x0000\x0006\x000A\x0002\x0000\x0006\x000A\x0002\x0000\x0003\x000A\x0023\x0000";
+
+ /// <summary> Translates characters to character classes</summary>
+ private static readonly char[] ZZ_CMAP = ZzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /// <summary> Translates DFA states to action switch labels.</summary>
+ private static readonly int[] ZZ_ACTION = ZzUnpackAction();
+
+ private const System.String ZZ_ACTION_PACKED_0 = "\x0001\x0000\x0001\x0001\x0003\x0002\x0001\x0003\x0001\x0001\x000B\x0000\x0001\x0002\x0003\x0004" + "\x0002\x0000\x0001\x0005\x0001\x0000\x0001\x0005\x0003\x0004\x0006\x0005\x0001\x0006\x0001\x0004" + "\x0002\x0007\x0001\x0008\x0001\x0000\x0001\x0008\x0003\x0000\x0002\x0008\x0001\x0009\x0001\x000A" + "\x0001\x0004";
+
+ private static int[] ZzUnpackAction()
+ {
+ int[] result = new int[51];
+ int offset = 0;
+ offset = ZzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackAction(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value_Renamed = packed[i++];
+ do
+ result[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /// <summary> Translates a state to a row index in the transition table</summary>
+ private static readonly int[] ZZ_ROWMAP = ZzUnpackRowMap();
+
+ private const System.String ZZ_ROWMAP_PACKED_0 = "\x0000\x0000\x0000\x000E\x0000\x001C\x0000\x002A\x0000\x0038\x0000\x000E\x0000\x0046\x0000\x0054" + "\x0000\x0062\x0000\x0070\x0000\x007E\x0000\x008C\x0000\x009A\x0000\x00A8\x0000\x00B6\x0000\x00C4" + "\x0000\x00D2\x0000\x00E0\x0000\x00EE\x0000\x00FC\x0000\u010a\x0000\u0118\x0000\u0126\x0000\u0134" + "\x0000\u0142\x0000\u0150\x0000\u015e\x0000\u016c\x0000\u017a\x0000\u0188\x0000\u0196\x0000\u01a4" + "\x0000\u01b2\x0000\u01c0\x0000\u01ce\x0000\u01dc\x0000\u01ea\x0000\u01f8\x0000\x00D2\x0000\u0206" + "\x0000\u0214\x0000\u0222\x0000\u0230\x0000\u023e\x0000\u024c\x0000\u025a\x0000\x0054\x0000\x008C" + "\x0000\u0268\x0000\u0276\x0000\u0284";
+
+ private static int[] ZzUnpackRowMap()
+ {
+ int[] result = new int[51];
+ int offset = 0;
+ offset = ZzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackRowMap(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int high = packed[i++] << 16;
+ result[j++] = high | packed[i++];
+ }
+ return j;
+ }
+
+ /// <summary> The transition table of the DFA</summary>
+ private static readonly int[] ZZ_TRANS = ZzUnpackTrans();
+
+ private const System.String ZZ_TRANS_PACKED_0 = "\x0001\x0002\x0001\x0003\x0001\x0004\x0007\x0002\x0001\x0005\x0001\x0006\x0001\x0007\x0001\x0002" + "\x000F\x0000\x0002\x0003\x0001\x0000\x0001\x0008\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B" + "\x0001\x0003\x0004\x0000\x0001\x0003\x0001\x0004\x0001\x0000\x0001\x000C\x0001\x0000\x0001\x0009" + "\x0002\x000D\x0001\x000E\x0001\x0004\x0004\x0000\x0001\x0003\x0001\x0004\x0001\x000F\x0001\x0010" + "\x0001\x0011\x0001\x0012\x0002\x000A\x0001\x000B\x0001\x0013\x0010\x0000\x0001\x0002\x0001\x0000" + "\x0001\x0014\x0001\x0015\x0007\x0000\x0001\x0016\x0004\x0000\x0002\x0017\x0007\x0000\x0001\x0017" + "\x0004\x0000\x0001\x0018\x0001\x0019\x0007\x0000\x0001\x001A\x0005\x0000\x0001\x001B\x0007\x0000" + "\x0001\x000B\x0004\x0000\x0001\x001C\x0001\x001D\x0007\x0000\x0001\x001E\x0004\x0000\x0001\x001F" + "\x0001\x0020\x0007\x0000\x0001\x0021\x0004\x0000\x0001\x0022\x0001\x0023\x0007\x0000\x0001\x0024" + "\x000D\x0000\x0001\x0025\x0004\x0000\x0001\x0014\x0001\x0015\x0007\x0000\x0001\x0026\x000D\x0000" + "\x0001\x0027\x0004\x0000\x0002\x0017\x0007\x0000\x0001\x0028\x0004\x0000\x0001\x0003\x0001\x0004" + "\x0001\x000F\x0001\x0008\x0001\x0011\x0001\x0012\x0002\x000A\x0001\x000B\x0001\x0013\x0004\x0000" + "\x0002\x0014\x0001\x0000\x0001\x0029\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0014" + "\x0004\x0000\x0001\x0014\x0001\x0015\x0001\x0000\x0001\x002B\x0001\x0000\x0001\x0009\x0002\x002C" + "\x0001\x002D\x0001\x0015\x0004\x0000\x0001\x0014\x0001\x0015\x0001\x0000\x0001\x0029\x0001\x0000" + "\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0016\x0004\x0000\x0002\x0017\x0001\x0000\x0001\x002E" + "\x0002\x0000\x0001\x002E\x0002\x0000\x0001\x0017\x0004\x0000\x0002\x0018\x0001\x0000\x0001\x002A" + "\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0018\x0004\x0000\x0001\x0018\x0001\x0019" + "\x0001\x0000\x0001\x002C\x0001\x0000\x0001\x0009\x0002\x002C\x0001\x002D\x0001\x0019\x0004\x0000" +
+ "\x0001\x0018\x0001\x0019\x0001\x0000\x0001\x002A\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000" + "\x0001\x001A\x0005\x0000\x0001\x001B\x0001\x0000\x0001\x002D\x0002\x0000\x0003\x002D\x0001\x001B" + "\x0004\x0000\x0002\x001C\x0001\x0000\x0001\x002F\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B" + "\x0001\x001C\x0004\x0000\x0001\x001C\x0001\x001D\x0001\x0000\x0001\x0030\x0001\x0000\x0001\x0009" + "\x0002\x000D\x0001\x000E\x0001\x001D\x0004\x0000\x0001\x001C\x0001\x001D\x0001\x0000\x0001\x002F" + "\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B\x0001\x001E\x0004\x0000\x0002\x001F\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B\x0001\x001F\x0004\x0000\x0001\x001F" + "\x0001\x0020\x0001\x0000\x0001\x000D\x0001\x0000\x0001\x0009\x0002\x000D\x0001\x000E\x0001\x0020" + "\x0004\x0000\x0001\x001F\x0001\x0020\x0001\x0000\x0001\x000A\x0001\x0000\x0001\x0009\x0002\x000A" + "\x0001\x000B\x0001\x0021\x0004\x0000\x0002\x0022\x0001\x0000\x0001\x000B\x0002\x0000\x0003\x000B" + "\x0001\x0022\x0004\x0000\x0001\x0022\x0001\x0023\x0001\x0000\x0001\x000E\x0002\x0000\x0003\x000E" + "\x0001\x0023\x0004\x0000\x0001\x0022\x0001\x0023\x0001\x0000\x0001\x000B\x0002\x0000\x0003\x000B" + "\x0001\x0024\x0006\x0000\x0001\x000F\x0006\x0000\x0001\x0025\x0004\x0000\x0001\x0014\x0001\x0015" + "\x0001\x0000\x0001\x0031\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0016\x0004\x0000" + "\x0002\x0017\x0001\x0000\x0001\x002E\x0002\x0000\x0001\x002E\x0002\x0000\x0001\x0028\x0004\x0000" + "\x0002\x0014\x0007\x0000\x0001\x0014\x0004\x0000\x0002\x0018\x0007\x0000\x0001\x0018\x0004\x0000" + "\x0002\x001C\x0007\x0000\x0001\x001C\x0004\x0000\x0002\x001F\x0007\x0000\x0001\x001F\x0004\x0000" + "\x0002\x0022\x0007\x0000\x0001\x0022\x0004\x0000\x0002\x0032\x0007\x0000\x0001\x0032\x0004\x0000" + "\x0002\x0014\x0007\x0000\x0001\x0033\x0004\x0000\x0002\x0032\x0001\x0000\x0001\x002E\x0002\x0000" + 
"\x0001\x002E\x0002\x0000\x0001\x0032\x0004\x0000\x0002\x0014\x0001\x0000\x0001\x0031\x0001\x0000" +
+ "\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0014\x0003\x0000";
+
+ private static int[] ZzUnpackTrans()
+ {
+ int[] result = new int[658];
+ int offset = 0;
+ offset = ZzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackTrans(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value_Renamed = packed[i++];
+ value_Renamed--;
+ do
+ result[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private const int ZZ_UNKNOWN_ERROR = 0;
+ private const int ZZ_NO_MATCH = 1;
+ private const int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static readonly System.String[] ZZ_ERROR_MSG = new System.String[]{"Unkown internal scanner error", "Error: could not match input", "Error: pushback value was too large"};
+
+ /// <summary> ZZ_ATTRIBUTE[aState] contains the attributes of state <c>aState</c></summary>
+ private static readonly int[] ZZ_ATTRIBUTE = ZzUnpackAttribute();
+
+ private const System.String ZZ_ATTRIBUTE_PACKED_0 = "\x0001\x0000\x0001\x0009\x0003\x0001\x0001\x0009\x0001\x0001\x000B\x0000\x0004\x0001\x0002\x0000" + "\x0001\x0001\x0001\x0000\x000F\x0001\x0001\x0000\x0001\x0001\x0003\x0000\x0005\x0001";
+
+ private static int[] ZzUnpackAttribute()
+ {
+ int[] result = new int[51];
+ int offset = 0;
+ offset = ZzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackAttribute(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value_Renamed = packed[i++];
+ do
+ result[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return j;
+ }
+
+ /// <summary>the input device </summary>
+ private System.IO.TextReader zzReader;
+
+ /// <summary>the current state of the DFA </summary>
+ private int zzState;
+
+ /// <summary>the current lexical state </summary>
+ private int zzLexicalState = YYINITIAL;
+
+ /// <summary>this buffer contains the current text to be matched and is
+ /// the source of the yytext() string
+ /// </summary>
+ private char[] zzBuffer = new char[ZZ_BUFFERSIZE];
+
+ /// <summary>the textposition at the last accepting state </summary>
+ private int zzMarkedPos;
+
+ /// <summary>the textposition at the last state to be included in yytext </summary>
+ private int zzPushbackPos;
+
+ /// <summary>the current text position in the buffer </summary>
+ private int zzCurrentPos;
+
+ /// <summary>startRead marks the beginning of the yytext() string in the buffer </summary>
+ private int zzStartRead;
+
+ /// <summary>endRead marks the last character in the buffer, that has been read
+ /// from input
+ /// </summary>
+ private int zzEndRead;
+
+ /// <summary>number of newlines encountered up to the start of the matched text </summary>
+ private int yyline;
+
+ /// <summary>the number of characters up to the start of the matched text </summary>
+ private int yychar;
+
+ /// <summary> the number of characters from the last newline up to the start of the
+ /// matched text
+ /// </summary>
+ private int yycolumn;
+
+ /// <summary> zzAtBOL == true &lt;=&gt; the scanner is currently at the beginning of a line</summary>
+ private bool zzAtBOL = true;
+
+ /// <summary>zzAtEOF == true &lt;=&gt; the scanner is at the EOF </summary>
+ private bool zzAtEOF;
+
+ /* user code: */
+
+ public static readonly int ALPHANUM;
+ public static readonly int APOSTROPHE;
+ public static readonly int ACRONYM;
+ public static readonly int COMPANY;
+ public static readonly int EMAIL;
+ public static readonly int HOST;
+ public static readonly int NUM;
+ public static readonly int CJ;
+ /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
+ /// as ACRONYMs.
+ /// </deprecated>
+ [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs")]
+ public static readonly int ACRONYM_DEP;
+
+ public static readonly System.String[] TOKEN_TYPES;
+
+ public int Yychar()
+ {
+ return yychar;
+ }
+
+ /*
+ * Resets the Tokenizer to a new Reader.
+ */
+ internal void Reset(System.IO.TextReader r)
+ {
+ // reset to default buffer size, if buffer has grown
+ if (zzBuffer.Length > ZZ_BUFFERSIZE)
+ {
+ zzBuffer = new char[ZZ_BUFFERSIZE];
+ }
+ Yyreset(r);
+ }
+
+ /// <summary> Fills Lucene token with the current token text.</summary>
+ internal void GetText(Token t)
+ {
+ t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+
+ /// <summary> Fills TermAttribute with the current token text.</summary>
+ internal void GetText(ITermAttribute t)
+ {
+ t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+
+
+ /// <summary> Creates a new scanner
+ /// There is also a java.io.InputStream version of this constructor.
+ ///
+ /// </summary>
+ /// <param name="in_Renamed"> the java.io.Reader to read input from.
+ /// </param>
+ internal StandardTokenizerImpl(System.IO.TextReader in_Renamed)
+ {
+ this.zzReader = in_Renamed;
+ }
+
+ /// <summary> Creates a new scanner.
+	/// There is also a java.io.Reader version of this constructor.
+ ///
+ /// </summary>
+ /// <param name="in_Renamed"> the java.io.Inputstream to read input from.
+ /// </param>
+ internal StandardTokenizerImpl(System.IO.Stream in_Renamed):this(new System.IO.StreamReader(in_Renamed, System.Text.Encoding.Default))
+ {
+ }
+
+ /// <summary> Unpacks the compressed character translation table.
+ ///
+ /// </summary>
+ /// <param name="packed"> the packed character translation table
+ /// </param>
+ /// <returns> the unpacked character translation table
+ /// </returns>
+ private static char[] ZzUnpackCMap(System.String packed)
+ {
+ char[] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 1154)
+ {
+ int count = packed[i++];
+ char value_Renamed = packed[i++];
+ do
+ map[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /// <summary> Refills the input buffer.
+ /// </summary>
+ /// <returns><c>false</c>, iff there was new input.
+ ///
+ /// </returns>
+ /// <exception cref="System.IO.IOException"> if any I/O-Error occurs
+ /// </exception>
+ private bool ZzRefill()
+ {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0)
+ {
+ Array.Copy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead - zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead -= zzStartRead;
+ zzCurrentPos -= zzStartRead;
+ zzMarkedPos -= zzStartRead;
+ zzPushbackPos -= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.Length)
+ {
+ /* if not: blow it up */
+ char[] newBuffer = new char[zzCurrentPos * 2];
+ Array.Copy(zzBuffer, 0, newBuffer, 0, zzBuffer.Length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.Read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead);
+
+ if (numRead <= 0)
+ {
+ return true;
+ }
+ else
+ {
+ zzEndRead += numRead;
+ return false;
+ }
+ }
+
+
+ /// <summary> Closes the input stream.</summary>
+ public void Yyclose()
+ {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.Close();
+ }
+
+
+ /// <summary> Resets the scanner to read from a new input stream.
+ /// Does not close the old reader.
+ ///
+ /// All internal variables are reset, the old input stream
+ /// <b>cannot</b> be reused (internal buffer is discarded and lost).
+ /// Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ ///
+ /// </summary>
+ /// <param name="reader"> the new input stream
+ /// </param>
+ public void Yyreset(System.IO.TextReader reader)
+ {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /// <summary> Returns the current lexical state.</summary>
+ public int Yystate()
+ {
+ return zzLexicalState;
+ }
+
+
+ /// <summary> Enters a new lexical state
+ ///
+ /// </summary>
+ /// <param name="newState">the new lexical state
+ /// </param>
+ public void Yybegin(int newState)
+ {
+ zzLexicalState = newState;
+ }
+
+
+ /// <summary> Returns the text matched by the current regular expression.</summary>
+ public System.String Yytext()
+ {
+ return new System.String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+
+
+ /// <summary> Returns the character at position <tt>pos</tt> from the
+ /// matched text.
+ ///
+ /// It is equivalent to yytext().charAt(pos), but faster
+ ///
+ /// </summary>
+ /// <param name="pos">the position of the character to fetch.
+ /// A value from 0 to yylength()-1.
+ ///
+ /// </param>
+ /// <returns> the character at position pos
+ /// </returns>
+ public char Yycharat(int pos)
+ {
+ return zzBuffer[zzStartRead + pos];
+ }
+
+
+ /// <summary> Returns the length of the matched text region.</summary>
+ public int Yylength()
+ {
+ return zzMarkedPos - zzStartRead;
+ }
+
+
+		/// <summary> Reports an error that occurred while scanning.
+ ///
+ /// In a wellformed scanner (no or only correct usage of
+ /// yypushback(int) and a match-all fallback rule) this method
+ /// will only be called with things that "Can't Possibly Happen".
+ /// If this method is called, something is seriously wrong
+ /// (e.g. a JFlex bug producing a faulty scanner etc.).
+ ///
+ /// Usual syntax/scanner level error handling should be done
+ /// in error fallback rules.
+ ///
+ /// </summary>
+ /// <param name="errorCode"> the code of the errormessage to display
+ /// </param>
+ private void ZzScanError(int errorCode)
+ {
+ System.String message;
+ try
+ {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (System.IndexOutOfRangeException)
+ {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new System.ApplicationException(message);
+ }
+
+
+ /// <summary> Pushes the specified amount of characters back into the input stream.
+ ///
+ /// They will be read again by then next call of the scanning method
+ ///
+ /// </summary>
+ /// <param name="number"> the number of characters to be read again.
+ /// This number must not be greater than yylength()!
+ /// </param>
+ public virtual void Yypushback(int number)
+ {
+ if (number > Yylength())
+ ZzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /// <summary> Resumes scanning until the next regular expression is matched,
+ /// the end of input is encountered or an I/O-Error occurs.
+ ///
+ /// </summary>
+ /// <returns> the next token
+ /// </returns>
+ /// <exception cref="System.IO.IOException"> if any I/O-Error occurs
+ /// </exception>
+ public virtual int GetNextToken()
+ {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char[] zzBufferL = zzBuffer;
+ char[] zzCMapL = ZZ_CMAP;
+
+ int[] zzTransL = ZZ_TRANS;
+ int[] zzRowMapL = ZZ_ROWMAP;
+ int[] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true)
+ {
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar += zzMarkedPosL - zzStartRead;
+
+ zzAction = - 1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = zzLexicalState;
+
+
+ {
+ while (true)
+ {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF)
+ {
+ zzInput = YYEOF;
+						goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' may be in the wrong place
+ }
+ else
+ {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ bool eof = ZzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof)
+ {
+ zzInput = YYEOF;
+							goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' may be in the wrong place
+ }
+ else
+ {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]];
+ if (zzNext == - 1)
+ {
+						goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' may be in the wrong place
+ }
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ((zzAttributes & 1) == 1)
+ {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ((zzAttributes & 8) == 8)
+ {
+							goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' may be in the wrong place
+ }
+ }
+ }
+ }
+
+zzForAction_brk: ; // {{Aroush-2.9}} this 'label' may be in the wrong place
+
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0?zzAction:ZZ_ACTION[zzAction])
+ {
+
+ case 4:
+ {
+ return HOST;
+ }
+
+ case 11: break;
+
+ case 9:
+ {
+ return ACRONYM;
+ }
+
+ case 12: break;
+
+ case 8:
+ {
+ return ACRONYM_DEP;
+ }
+
+ case 13: break;
+
+ case 1:
+ {
+ /* ignore */
+ }
+ goto case 14;
+
+ case 14: break;
+
+ case 5:
+ {
+ return NUM;
+ }
+
+ case 15: break;
+
+ case 3:
+ {
+ return CJ;
+ }
+
+ case 16: break;
+
+ case 2:
+ {
+ return ALPHANUM;
+ }
+
+ case 17: break;
+
+ case 7:
+ {
+ return COMPANY;
+ }
+
+ case 18: break;
+
+ case 6:
+ {
+ return APOSTROPHE;
+ }
+
+ case 19: break;
+
+ case 10:
+ {
+ return EMAIL;
+ }
+
+ case 20: break;
+
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos)
+ {
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else
+ {
+ ZzScanError(ZZ_NO_MATCH);
+ }
+ break;
+
+ }
+ }
+ }
+ static StandardTokenizerImpl()
+ {
+ ALPHANUM = StandardTokenizer.ALPHANUM;
+ APOSTROPHE = StandardTokenizer.APOSTROPHE;
+ ACRONYM = StandardTokenizer.ACRONYM;
+ COMPANY = StandardTokenizer.COMPANY;
+ EMAIL = StandardTokenizer.EMAIL;
+ HOST = StandardTokenizer.HOST;
+ NUM = StandardTokenizer.NUM;
+ CJ = StandardTokenizer.CJ;
+ ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
+ TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/StopAnalyzer.cs b/src/core/Analysis/StopAnalyzer.cs
new file mode 100644
index 0000000..aabe197
--- /dev/null
+++ b/src/core/Analysis/StopAnalyzer.cs
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections.Generic;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Filters <see cref="LetterTokenizer" /> with <see cref="LowerCaseFilter" /> and
+ /// <see cref="StopFilter" />.
+ ///
+ /// <a name="version"/>
+ /// <p/>
+ /// You must specify the required <see cref="Version" /> compatibility when creating
+ /// StopAnalyzer:
+ /// <list type="bullet">
+ /// <item>As of 2.9, position increments are preserved</item>
+ /// </list>
+ /// </summary>
+
+ public sealed class StopAnalyzer:Analyzer
+ {
+ private readonly ISet<string> stopWords;
+ private readonly bool enablePositionIncrements;
+
+ /// <summary>An unmodifiable set containing some common English words that are not usually useful
+ /// for searching.
+ /// </summary>
+ public static ISet<string> ENGLISH_STOP_WORDS_SET;
+
+ /// <summary> Builds an analyzer which removes words in ENGLISH_STOP_WORDS.</summary>
+ public StopAnalyzer(Version matchVersion)
+ {
+ stopWords = ENGLISH_STOP_WORDS_SET;
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given set.</summary>
+ public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
+ {
+ this.stopWords = stopWords;
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
+
+ /// <summary> Builds an analyzer with the stop words from the given file.
+ ///
+ /// </summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)">
+ /// </seealso>
+ /// <param name="matchVersion">See <a href="#version">above</a>
+ /// </param>
+ /// <param name="stopwordsFile">File to load stop words from
+ /// </param>
+ public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
+ {
+ stopWords = WordlistLoader.GetWordSet(stopwordsFile);
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given reader. </summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)">
+ /// </seealso>
+ /// <param name="matchVersion">See <a href="#Version">above</a>
+ /// </param>
+ /// <param name="stopwords">Reader to load stop words from
+ /// </param>
+ public StopAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
+ {
+ stopWords = WordlistLoader.GetWordSet(stopwords);
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
+
+ /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
+ }
+
+ /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
+ private class SavedStreams
+ {
+ public SavedStreams(StopAnalyzer enclosingInstance)
+ {
+ InitBlock(enclosingInstance);
+ }
+ private void InitBlock(StopAnalyzer enclosingInstance)
+ {
+ this.enclosingInstance = enclosingInstance;
+ }
+ private StopAnalyzer enclosingInstance;
+ public StopAnalyzer Enclosing_Instance
+ {
+ get
+ {
+ return enclosingInstance;
+ }
+
+ }
+ internal Tokenizer source;
+ internal TokenStream result;
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ var streams = (SavedStreams) PreviousTokenStream;
+ if (streams == null)
+ {
+ streams = new SavedStreams(this) {source = new LowerCaseTokenizer(reader)};
+ streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
+ PreviousTokenStream = streams;
+ }
+ else
+ streams.source.Reset(reader);
+ return streams.result;
+ }
+ static StopAnalyzer()
+ {
+ {
+ var stopWords = new System.String[]{"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
+ var stopSet = new CharArraySet(stopWords.Length, false);
+ stopSet.AddAll(stopWords);
+ ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet);
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/StopFilter.cs b/src/core/Analysis/StopFilter.cs
new file mode 100644
index 0000000..81b7dd0
--- /dev/null
+++ b/src/core/Analysis/StopFilter.cs
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using QueryParser = Lucene.Net.QueryParsers.QueryParser;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Removes stop words from a token stream.</summary>
+
+ public sealed class StopFilter:TokenFilter
+ {
+ private readonly CharArraySet stopWords;
+ private bool enablePositionIncrements = false;
+
+ private readonly ITermAttribute termAtt;
+ private readonly IPositionIncrementAttribute posIncrAtt;
+
+ /// <summary> Construct a token stream filtering the given input.
+ /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
+ /// <c>makeStopSet()</c> was used to construct the set) it will be directly used
+ /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
+ /// directly controls case sensitivity.
+ /// <p/>
+ /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
+ /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
+ /// used to specify the case sensitivity of that set.
+ /// </summary>
+ /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
+ /// <param name="input">Input TokenStream</param>
+ /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
+ /// <param name="ignoreCase">if true, all words are lower cased first</param>
+ public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
+ : base(input)
+ {
+ if (stopWords is CharArraySet)
+ {
+ this.stopWords = (CharArraySet) stopWords;
+ }
+ else
+ {
+ this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
+ this.stopWords.AddAll(stopWords);
+ }
+ this.enablePositionIncrements = enablePositionIncrements;
+ termAtt = AddAttribute<ITermAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ }
+
+ /// <summary> Constructs a filter which removes words from the input
+ /// TokenStream that are named in the Set.
+ /// </summary>
+ /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
+ /// <param name="in">Input stream</param>
+ /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
+ /// <seealso cref="MakeStopSet(String[])"/>
+ public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords)
+ : this(enablePositionIncrements, @in, stopWords, false)
+ { }
+
+ /// <summary> Builds a Set from an array of stop words,
+ /// appropriate for passing into the StopFilter constructor.
+ /// This permits this stopWords construction to be cached once when
+ /// an Analyzer is constructed.
+ ///
+ /// </summary>
+ /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
+ public static ISet<string> MakeStopSet(params string[] stopWords)
+ {
+ return MakeStopSet(stopWords, false);
+ }
+
+ /// <summary> Builds a Set from an array of stop words,
+ /// appropriate for passing into the StopFilter constructor.
+ /// This permits this stopWords construction to be cached once when
+ /// an Analyzer is constructed.
+ /// </summary>
+ /// <param name="stopWords">A list of strings or char[] or any other ToString()-able list representing the stop words</param>
+ /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
+ public static ISet<string> MakeStopSet(IList<object> stopWords)
+ {
+ return MakeStopSet(stopWords, false);
+ }
+
+ /// <summary></summary>
+ /// <param name="stopWords">An array of stopwords</param>
+ /// <param name="ignoreCase">If true, all words are lower cased first.</param>
+ /// <returns> a Set containing the words</returns>
+ public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
+ {
+ var stopSet = new CharArraySet(stopWords.Length, ignoreCase);
+ stopSet.AddAll(stopWords);
+ return stopSet;
+ }
+
+ /// <summary> </summary>
+ /// <param name="stopWords">A List of Strings or char[] or any other toString()-able list representing the stopwords </param>
+ /// <param name="ignoreCase">if true, all words are lower cased first</param>
+ /// <returns>A Set (<see cref="CharArraySet"/>) containing the words</returns>
+ public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
+ {
+ var stopSet = new CharArraySet(stopWords.Count, ignoreCase);
+ foreach(var word in stopWords)
+ stopSet.Add(word.ToString());
+ return stopSet;
+ }
+
+ /// <summary> Returns the next input Token whose term() is not a stop word.</summary>
+ public override bool IncrementToken()
+ {
+ // return the first non-stop word found
+ int skippedPositions = 0;
+ while (input.IncrementToken())
+ {
+ if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength()))
+ {
+ if (enablePositionIncrements)
+ {
+ posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.PositionIncrement;
+ }
+ // reached EOS -- return false
+ return false;
+ }
+
+ /// <summary> Returns version-dependent default for enablePositionIncrements. Analyzers
+ /// that embed StopFilter use this method when creating the StopFilter. Prior
+ /// to 2.9, this returns false. On 2.9 or later, it returns true.
+ /// </summary>
+ public static bool GetEnablePositionIncrementsVersionDefault(Version matchVersion)
+ {
+ return matchVersion.OnOrAfter(Version.LUCENE_29);
+ }
+
+ /// <summary> If <c>true</c>, this StopFilter will preserve
+ /// positions of the incoming tokens (ie, accumulate and
+ /// set position increments of the removed stop tokens).
+ /// Generally, <c>true</c> is best as it does not
+ /// lose information (positions of the original tokens)
+ /// during indexing.
+ ///
+ /// <p/> When set, when a token is stopped
+ /// (omitted), the position increment of the following
+ /// token is incremented.
+ ///
+ /// <p/> <b>NOTE</b>: be sure to also
+ /// set <see cref="QueryParser.EnablePositionIncrements" /> if
+ /// you use QueryParser to create queries.
+ /// </summary>
+ public bool EnablePositionIncrements
+ {
+ get { return enablePositionIncrements; }
+ set { enablePositionIncrements = value; }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/TeeSinkTokenFilter.cs b/src/core/Analysis/TeeSinkTokenFilter.cs
new file mode 100644
index 0000000..bec605e
--- /dev/null
+++ b/src/core/Analysis/TeeSinkTokenFilter.cs
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using Attribute = Lucene.Net.Util.Attribute;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> This TokenFilter provides the ability to set aside attribute states
+ /// that have already been analyzed. This is useful in situations where multiple fields share
+ /// many common analysis steps and then go their separate ways.
+ /// <p/>
+ /// It is also useful for doing things like entity extraction or proper noun analysis as
+ /// part of the analysis workflow and saving off those tokens for use in another field.
+ ///
+ /// <code>
+ /// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
+ /// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+ /// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+ /// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
+ /// source2.addSinkTokenStream(sink1);
+ /// source2.addSinkTokenStream(sink2);
+ /// TokenStream final1 = new LowerCaseFilter(source1);
+ /// TokenStream final2 = source2;
+ /// TokenStream final3 = new EntityDetect(sink1);
+ /// TokenStream final4 = new URLDetect(sink2);
+ /// d.add(new Field("f1", final1));
+ /// d.add(new Field("f2", final2));
+ /// d.add(new Field("f3", final3));
+ /// d.add(new Field("f4", final4));
+ /// </code>
+ /// In this example, <c>sink1</c> and <c>sink2</c> will both get tokens from both
+ /// <c>reader1</c> and <c>reader2</c> after whitespace tokenizer
+ /// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ /// It is important, that tees are consumed before sinks (in the above example, the field names must be
+ /// less the sink's field names). If you are not sure, which stream is consumed first, you can simply
+ /// add another sink and then pass all tokens to the sinks at once using <see cref="ConsumeAllTokens" />.
+ /// This TokenFilter is exhausted after this. In the above example, change
+ /// the example above to:
+ /// <code>
+ /// ...
+ /// TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
+ /// TokenStream final2 = source2.newSinkTokenStream();
+ /// sink1.consumeAllTokens();
+ /// sink2.consumeAllTokens();
+ /// ...
+ /// </code>
+ /// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
+ /// <p/>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
+ /// </summary>
+ public sealed class TeeSinkTokenFilter:TokenFilter
+ {
+ public class AnonymousClassSinkFilter:SinkFilter
+ {
+ public override bool Accept(AttributeSource source)
+ {
+ return true;
+ }
+ }
+ private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>();
+
+ /// <summary> Instantiates a new TeeSinkTokenFilter.</summary>
+ public TeeSinkTokenFilter(TokenStream input):base(input)
+ {
+ }
+
+ /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream.</summary>
+ public SinkTokenStream NewSinkTokenStream()
+ {
+ return NewSinkTokenStream(ACCEPT_ALL_FILTER);
+ }
+
+ /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream
+ /// that pass the supplied filter.
+ /// </summary>
+ /// <seealso cref="SinkFilter">
+ /// </seealso>
+ public SinkTokenStream NewSinkTokenStream(SinkFilter filter)
+ {
+ var sink = new SinkTokenStream(this.CloneAttributes(), filter);
+ sinks.AddLast(new WeakReference(sink));
+ return sink;
+ }
+
+ /// <summary> Adds a <see cref="SinkTokenStream" /> created by another <c>TeeSinkTokenFilter</c>
+ /// to this one. The supplied stream will also receive all consumed tokens.
+ /// This method can be used to pass tokens from two different tees to one sink.
+ /// </summary>
+ public void AddSinkTokenStream(SinkTokenStream sink)
+ {
+ // check that sink has correct factory
+ if (!this.Factory.Equals(sink.Factory))
+ {
+ throw new System.ArgumentException("The supplied sink is not compatible to this tee");
+ }
+ // add eventually missing attribute impls to the existing sink
+ foreach (var impl in this.CloneAttributes().GetAttributeImplsIterator())
+ {
+ sink.AddAttributeImpl(impl);
+ }
+ sinks.AddLast(new WeakReference(sink));
+ }
+
+ /// <summary> <c>TeeSinkTokenFilter</c> passes all tokens to the added sinks
+ /// when itself is consumed. To be sure, that all tokens from the input
+ /// stream are passed to the sinks, you can call this methods.
+ /// This instance is exhausted after this, but all sinks are instant available.
+ /// </summary>
+ public void ConsumeAllTokens()
+ {
+ while (IncrementToken())
+ {
+ }
+ }
+
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ // capture state lazily - maybe no SinkFilter accepts this state
+ State state = null;
+ foreach(WeakReference wr in sinks)
+ {
+ var sink = (SinkTokenStream)wr.Target;
+ if (sink != null)
+ {
+ if (sink.Accept(this))
+ {
+ if (state == null)
+ {
+ state = this.CaptureState();
+ }
+ sink.AddState(state);
+ }
+ }
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ public override void End()
+ {
+ base.End();
+ State finalState = CaptureState();
+ foreach(WeakReference wr in sinks)
+ {
+ var sink = (SinkTokenStream)wr.Target;
+ if (sink != null)
+ {
+ sink.SetFinalState(finalState);
+ }
+ }
+ }
+
+ /// <summary> A filter that decides which <see cref="AttributeSource" /> states to store in the sink.</summary>
+ public abstract class SinkFilter
+ {
+ /// <summary> Returns true, iff the current state of the passed-in <see cref="AttributeSource" /> shall be stored
+ /// in the sink.
+ /// </summary>
+ public abstract bool Accept(AttributeSource source);
+
+ /// <summary> Called by <see cref="SinkTokenStream.Reset()" />. This method does nothing by default
+ /// and can optionally be overridden.
+ /// </summary>
+ public virtual void Reset()
+ {
+ // nothing to do; can be overridden
+ }
+ }
+
+ public sealed class SinkTokenStream : TokenStream
+ {
+ private readonly LinkedList<State> cachedStates = new LinkedList<State>();
+ private State finalState;
+ private IEnumerator<AttributeSource.State> it = null;
+ private readonly SinkFilter filter;
+
+ internal SinkTokenStream(AttributeSource source, SinkFilter filter)
+ : base(source)
+ {
+ this.filter = filter;
+ }
+
+ internal /*private*/ bool Accept(AttributeSource source)
+ {
+ return filter.Accept(source);
+ }
+
+ internal /*private*/ void AddState(AttributeSource.State state)
+ {
+ if (it != null)
+ {
+ throw new System.SystemException("The tee must be consumed before sinks are consumed.");
+ }
+ cachedStates.AddLast(state);
+ }
+
+ internal /*private*/ void SetFinalState(AttributeSource.State finalState)
+ {
+ this.finalState = finalState;
+ }
+
+ public override bool IncrementToken()
+ {
+ // lazy init the iterator
+ if (it == null)
+ {
+ it = cachedStates.GetEnumerator();
+ }
+
+ if (!it.MoveNext())
+ {
+ return false;
+ }
+
+ State state = it.Current;
+ RestoreState(state);
+ return true;
+ }
+
+ public override void End()
+ {
+ if (finalState != null)
+ {
+ RestoreState(finalState);
+ }
+ }
+
+ public override void Reset()
+ {
+ it = cachedStates.GetEnumerator();
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ // Do nothing.
+ }
+ }
+
+ private static readonly SinkFilter ACCEPT_ALL_FILTER;
+ static TeeSinkTokenFilter()
+ {
+ ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Token.cs b/src/core/Analysis/Token.cs
new file mode 100644
index 0000000..3357f34
--- /dev/null
+++ b/src/core/Analysis/Token.cs
@@ -0,0 +1,852 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Payload = Lucene.Net.Index.Payload;
+using TermPositions = Lucene.Net.Index.TermPositions;
+using ArrayUtil = Lucene.Net.Util.ArrayUtil;
+using Attribute = Lucene.Net.Util.Attribute;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>A Token is an occurrence of a term from the text of a field. It consists of
+ /// a term's text, the start and end offset of the term in the text of the field,
+ /// and a type string.
+ /// <p/>
+ /// The start and end offsets permit applications to re-associate a token with
+ /// its source text, e.g., to display highlighted query terms in a document
+ /// browser, or to show matching text fragments in a <abbr
+ /// title="KeyWord In Context">KWIC</abbr> display, etc.
+ /// <p/>
+ /// The type is a string, assigned by a lexical analyzer
+ /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
+ /// belongs to. For example an end of sentence marker token might be implemented
+ /// with type "eos". The default token type is "word".
+ /// <p/>
+ /// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+ /// length byte array. Use <see cref="TermPositions.PayloadLength" /> and
+ /// <see cref="TermPositions.GetPayload(byte[], int)" /> to retrieve the payloads from the index.
+ /// </summary>
+ /// <summary><br/><br/>
+ /// </summary>
+ /// <summary><p/><b>NOTE:</b> As of 2.9, Token implements all <see cref="IAttribute" /> interfaces
+ /// that are part of core Lucene and can be found in the <see cref="Lucene.Net.Analysis.Tokenattributes"/> namespace.
+ /// Even though it is not necessary to use Token anymore, with the new TokenStream API it can
+ /// be used as convenience class that implements all <see cref="IAttribute" />s, which is especially useful
+ /// to easily switch from the old to the new TokenStream API.
+ /// <br/><br/>
+ /// <p/>Tokenizers and TokenFilters should try to re-use a Token instance when
+ /// possible for best performance, by implementing the
+ /// <see cref="TokenStream.IncrementToken()" /> API.
+ /// Failing that, to create a new Token you should first use
+ /// one of the constructors that starts with null text. To load
+ /// the token from a char[] use <see cref="SetTermBuffer(char[], int, int)" />.
+ /// To load from a String use <see cref="SetTermBuffer(String)" /> or <see cref="SetTermBuffer(String, int, int)" />.
+ /// Alternatively you can get the Token's termBuffer by calling either <see cref="TermBuffer()" />,
+ /// if you know that your text is shorter than the capacity of the termBuffer
+ /// or <see cref="ResizeTermBuffer(int)" />, if there is any possibility
+ /// that you may need to grow the buffer. Fill in the characters of your term into this
+ /// buffer, with <see cref="string.ToCharArray(int, int)" /> if loading from a string,
+ /// or with <see cref="Array.Copy(Array, long, Array, long, long)" />, and finally call <see cref="SetTermLength(int)" /> to
+ /// set the length of the term text. See <a target="_top"
+ /// href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
+ /// for details.<p/>
+ /// <p/>Typical Token reuse patterns:
+ /// <list type="bullet">
+ /// <item> Copying text from a string (type is reset to <see cref="DEFAULT_TYPE" /> if not
+ /// specified):<br/>
+ /// <code>
+ /// return reusableToken.reinit(string, startOffset, endOffset[, type]);
+ /// </code>
+ /// </item>
+ /// <item> Copying some text from a string (type is reset to <see cref="DEFAULT_TYPE" />
+ /// if not specified):<br/>
+ /// <code>
+ /// return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
+ /// </code>
+ /// </item>
+ /// <item> Copying text from char[] buffer (type is reset to <see cref="DEFAULT_TYPE" />
+ /// if not specified):<br/>
+ /// <code>
+ /// return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
+ /// </code>
+ /// </item>
+ /// <item> Copying some text from a char[] buffer (type is reset to
+ /// <see cref="DEFAULT_TYPE" /> if not specified):<br/>
+ /// <code>
+ /// return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
+ /// </code>
+ /// </item>
+ /// <item> Copying from one Token to another (type is reset to
+ /// <see cref="DEFAULT_TYPE" /> if not specified):<br/>
+ /// <code>
+ /// return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
+ /// </code>
+ /// </item>
+ /// </list>
+ /// A few things to note:
+ /// <list type="bullet">
+ /// <item>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</item>
+ /// <item>Because <c>TokenStreams</c> can be chained, one cannot assume that the <c>Token's</c> current type is correct.</item>
+ /// <item>The startOffset and endOffset represent the start and offset in the
+ /// source text, so be careful in adjusting them.</item>
+ /// <item>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</item>
+ /// </list>
+ /// <p/>
+ /// </summary>
+ /// <seealso cref="Lucene.Net.Index.Payload">
+ /// </seealso>
+ [Serializable]
+ public class Token : Attribute, ITermAttribute, ITypeAttribute, IPositionIncrementAttribute, IFlagsAttribute, IOffsetAttribute, IPayloadAttribute
+ {
+ public const String DEFAULT_TYPE = "word";
+
+ private const int MIN_BUFFER_SIZE = 10;
+
+ private char[] termBuffer;
+ private int termLength;
+ private int startOffset, endOffset;
+ private string type = DEFAULT_TYPE;
+ private int flags;
+ private Payload payload;
+ private int positionIncrement = 1;
+
+ /// <summary>Constructs a Token with null text. </summary>
+ public Token()
+ {
+ }
+
+ /// <summary>Constructs a Token with null text and start &amp; end
+ /// offsets.
+ /// </summary>
+ /// <param name="start">start offset in the source text</param>
+ /// <param name="end">end offset in the source text</param>
+ public Token(int start, int end)
+ {
+ startOffset = start;
+ endOffset = end;
+ }
+
+ /// <summary>Constructs a Token with null text and start &amp; end
+ /// offsets plus the Token type.
+ /// </summary>
+ /// <param name="start">start offset in the source text</param>
+ /// <param name="end">end offset in the source text</param>
+ /// <param name="typ">the lexical type of this Token</param>
+ public Token(int start, int end, String typ)
+ {
+ startOffset = start;
+ endOffset = end;
+ type = typ;
+ }
+
+ /// <summary> Constructs a Token with null text and start &amp; end
+ /// offsets plus flags. NOTE: flags is EXPERIMENTAL.
+ /// </summary>
+ /// <param name="start">start offset in the source text</param>
+ /// <param name="end">end offset in the source text</param>
+ /// <param name="flags">The bits to set for this token</param>
+ public Token(int start, int end, int flags)
+ {
+ startOffset = start;
+ endOffset = end;
+ this.flags = flags;
+ }
+
+ /// <summary>Constructs a Token with the given term text, and start
+ /// &amp; end offsets. The type defaults to "word."
+ /// <b>NOTE:</b> for better indexing speed you should
+ /// instead use the char[] termBuffer methods to set the
+ /// term text.
+ /// </summary>
+ /// <param name="text">term text</param>
+ /// <param name="start">start offset</param>
+ /// <param name="end">end offset</param>
+ public Token(String text, int start, int end)
+ {
+ SetTermBuffer(text);
+ startOffset = start;
+ endOffset = end;
+ }
+
+ /// <summary>Constructs a Token with the given text, start and end
+ /// offsets, &amp; type. <b>NOTE:</b> for better indexing
+ /// speed you should instead use the char[] termBuffer
+ /// methods to set the term text.
+ /// </summary>
+ /// <param name="text">term text</param>
+ /// <param name="start">start offset</param>
+ /// <param name="end">end offset</param>
+ /// <param name="typ">token type</param>
+ public Token(System.String text, int start, int end, System.String typ)
+ {
+ SetTermBuffer(text);
+ startOffset = start;
+ endOffset = end;
+ type = typ;
+ }
+
+ /// <summary> Constructs a Token with the given text, start and end
+ /// offsets, &amp; type. <b>NOTE:</b> for better indexing
+ /// speed you should instead use the char[] termBuffer
+ /// methods to set the term text.
+ /// </summary>
+ /// <param name="text"></param>
+ /// <param name="start"></param>
+ /// <param name="end"></param>
+ /// <param name="flags">token type bits</param>
+ public Token(System.String text, int start, int end, int flags)
+ {
+ SetTermBuffer(text);
+ startOffset = start;
+ endOffset = end;
+ this.flags = flags;
+ }
+
+ /// <summary> Constructs a Token with the given term buffer (offset
+ /// &amp; length), start and end
+ /// offsets
+ /// </summary>
+ /// <param name="startTermBuffer"></param>
+ /// <param name="termBufferOffset"></param>
+ /// <param name="termBufferLength"></param>
+ /// <param name="start"></param>
+ /// <param name="end"></param>
+ public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end)
+ {
+ SetTermBuffer(startTermBuffer, termBufferOffset, termBufferLength);
+ startOffset = start;
+ endOffset = end;
+ }
+
+ /// <summary>Set the position increment. This determines the position of this token
+ /// relative to the previous Token in a <see cref="TokenStream" />, used in phrase
+ /// searching.
+ ///
+ /// <p/>The default value is one.
+ ///
+ /// <p/>Some common uses for this are:<list>
+ ///
+ /// <item>Set it to zero to put multiple terms in the same position. This is
+ /// useful if, e.g., a word has multiple stems. Searches for phrases
+ /// including either stem will match. In this case, all but the first stem's
+ /// increment should be set to zero: the increment of the first instance
+ /// should be one. Repeating a token with an increment of zero can also be
+ /// used to boost the scores of matches on that token.</item>
+ ///
+ /// <item>Set it to values greater than one to inhibit exact phrase matches.
+ /// If, for example, one does not want phrases to match across removed stop
+ /// words, then one could build a stop word filter that removes stop words and
+ /// also sets the increment to the number of stop words removed before each
+ /// non-stop word. Then exact phrase queries will only match when the terms
+ /// occur with no intervening stop words.</item>
+ ///
+ /// </list>
+ /// </summary>
+ /// <value> the distance from the prior term </value>
+ /// <exception cref="System.ArgumentException">if the value set is negative</exception>
+ /// <seealso cref="Lucene.Net.Index.TermPositions">
+ /// </seealso>
+ public virtual int PositionIncrement
+ {
+ set
+ {
+ // Negative increments are meaningless for positional indexing.
+ if (value < 0)
+ throw new System.ArgumentException("Increment must be zero or greater: " + value);
+ this.positionIncrement = value;
+ }
+ get { return positionIncrement; }
+ }
+
+ /// <summary>Returns the Token's term text.
+ ///
+ /// This method has a performance penalty
+ /// because the text is stored internally in a char[]. If
+ /// possible, use <see cref="TermBuffer()" /> and <see cref="TermLength()"/>
+ /// directly instead. If you really need a
+ /// String, use this method, which is nothing more than
+ /// a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
+ /// </summary>
+ public string Term
+ {
+ get
+ {
+ // InitTermBuffer guarantees termBuffer is non-null (possibly empty).
+ InitTermBuffer();
+ return new System.String(termBuffer, 0, termLength);
+ }
+ }
+
+ /// <summary>Copies the contents of buffer, starting at offset for
+ /// length characters, into the termBuffer array.
+ /// </summary>
+ /// <param name="buffer">the buffer to copy</param>
+ /// <param name="offset">the index in the buffer of the first character to copy</param>
+ /// <param name="length">the number of characters to copy</param>
+ public void SetTermBuffer(char[] buffer, int offset, int length)
+ {
+ // GrowTermBuffer does not preserve old content, which is fine here
+ // because the whole term is overwritten immediately.
+ GrowTermBuffer(length);
+ Array.Copy(buffer, offset, termBuffer, 0, length);
+ termLength = length;
+ }
+
+ /// <summary>Copies the contents of buffer into the termBuffer array.</summary>
+ /// <param name="buffer">the buffer to copy
+ /// </param>
+ public void SetTermBuffer(System.String buffer)
+ {
+ int length = buffer.Length;
+ GrowTermBuffer(length);
+ // TextSupport.GetCharsFromString copies the string's characters into
+ // the char[] without allocating an intermediate array.
+ TextSupport.GetCharsFromString(buffer, 0, length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /// <summary>Copies the contents of buffer, starting at offset and continuing
+ /// for length characters, into the termBuffer array.
+ /// </summary>
+ /// <param name="buffer">the buffer to copy
+ /// </param>
+ /// <param name="offset">the index in the buffer of the first character to copy
+ /// </param>
+ /// <param name="length">the number of characters to copy
+ /// </param>
+ public void SetTermBuffer(System.String buffer, int offset, int length)
+ {
+ // Debug-only bounds checks; release builds rely on Array/GetChars to throw.
+ System.Diagnostics.Debug.Assert(offset <= buffer.Length);
+ System.Diagnostics.Debug.Assert(offset + length <= buffer.Length);
+ GrowTermBuffer(length);
+ TextSupport.GetCharsFromString(buffer, offset, offset + length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /// <summary>Returns the internal termBuffer character array which
+ /// you can then directly alter. If the array is too
+ /// small for your token, use <see cref="ResizeTermBuffer(int)" />
+ /// to increase it. After
+ /// altering the buffer be sure to call <see cref="SetTermLength" />
+ /// to record the number of valid
+ /// characters that were placed into the termBuffer.
+ /// </summary>
+ /// <returns>the internal (never null) term buffer</returns>
+ public char[] TermBuffer()
+ {
+ InitTermBuffer();
+ return termBuffer;
+ }
+
+ /// <summary>Grows the termBuffer to at least size newSize, preserving the
+ /// existing content. Note: If the next operation is to change
+ /// the contents of the term buffer use
+ /// <see cref="SetTermBuffer(char[], int, int)" />,
+ /// <see cref="SetTermBuffer(String)" />, or
+ /// <see cref="SetTermBuffer(String, int, int)" />
+ /// to optimally combine the resize with the setting of the termBuffer.
+ /// </summary>
+ /// <param name="newSize">minimum size of the new termBuffer
+ /// </param>
+ /// <returns> newly created termBuffer with length >= newSize
+ /// </returns>
+ public virtual char[] ResizeTermBuffer(int newSize)
+ {
+ if (termBuffer == null)
+ {
+ // First allocation: never smaller than MIN_BUFFER_SIZE, and rounded
+ // up by ArrayUtil.GetNextSize to amortize future growth.
+ termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ }
+ else
+ {
+ if (termBuffer.Length < newSize)
+ {
+ // Not big enough; create a new array with slight
+ // over allocation and preserve content
+ var newCharBuffer = new char[ArrayUtil.GetNextSize(newSize)];
+ Array.Copy(termBuffer, 0, newCharBuffer, 0, termBuffer.Length);
+ termBuffer = newCharBuffer;
+ }
+ }
+ return termBuffer;
+ }
+
+ /// <summary>Allocates a buffer char[] of at least newSize, without preserving the existing content.
+ /// It is always used in places that immediately overwrite the buffer content,
+ /// so preserving the old characters would be wasted work.
+ /// </summary>
+ /// <param name="newSize">minimum size of the buffer
+ /// </param>
+ private void GrowTermBuffer(int newSize)
+ {
+ if (termBuffer == null)
+ {
+ // The buffer is always at least MIN_BUFFER_SIZE
+ termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE?MIN_BUFFER_SIZE:newSize)];
+ }
+ else
+ {
+ if (termBuffer.Length < newSize)
+ {
+ // Not big enough; create a new array with slight
+ // over allocation:
+ termBuffer = new char[ArrayUtil.GetNextSize(newSize)];
+ }
+ }
+ }
+
+ /// <summary>Lazily allocates the termBuffer at its minimum size and resets
+ /// the term length; a no-op if the buffer already exists.
+ /// </summary>
+ private void InitTermBuffer()
+ {
+ if (termBuffer == null)
+ {
+ termBuffer = new char[ArrayUtil.GetNextSize(MIN_BUFFER_SIZE)];
+ termLength = 0;
+ }
+ }
+
+ /// <summary>Return number of valid characters (length of the term)
+ /// in the termBuffer array.
+ /// </summary>
+ public int TermLength()
+ {
+ InitTermBuffer();
+ return termLength;
+ }
+
+ /// <summary>Set number of valid characters (length of the term) in
+ /// the termBuffer array. Use this to truncate the termBuffer
+ /// or to synchronize with external manipulation of the termBuffer.
+ /// Note: to grow the size of the array,
+ /// use <see cref="ResizeTermBuffer(int)" /> first.
+ /// </summary>
+ /// <param name="length">the truncated length
+ /// </param>
+ /// <exception cref="System.ArgumentException">if length exceeds the current buffer capacity</exception>
+ public void SetTermLength(int length)
+ {
+ InitTermBuffer();
+ if (length > termBuffer.Length)
+ throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")");
+ termLength = length;
+ }
+
+ /// <summary>Gets or sets this Token's starting offset, the position of the first character
+ /// corresponding to this token in the source text.
+ /// Note that the difference between endOffset() and startOffset() may not be
+ /// equal to <see cref="TermLength"/>, as the term text may have been altered by a
+ /// stemmer or some other filter.
+ /// </summary>
+ public virtual int StartOffset
+ {
+ get { return startOffset; }
+ set { this.startOffset = value; }
+ }
+
+ /// <summary>Gets or sets this Token's ending offset, one greater than the position of the
+ /// last character corresponding to this token in the source text. The length
+ /// of the token in the source text is (endOffset - startOffset).
+ /// </summary>
+ public virtual int EndOffset
+ {
+ get { return endOffset; }
+ set { this.endOffset = value; }
+ }
+
+ /// <summary>Set the starting and ending offset.
+ /// See StartOffset() and EndOffset()
+ /// </summary>
+ /// <param name="startOffset">start offset in the source text</param>
+ /// <param name="endOffset">end offset in the source text (exclusive)</param>
+ public virtual void SetOffset(int startOffset, int endOffset)
+ {
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ }
+
+ /// <summary>Returns this Token's lexical type. Defaults to "word". </summary>
+ public string Type
+ {
+ get { return type; }
+ set { this.type = value; }
+ }
+
+ /// <summary> EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
+ /// <p/>
+ ///
+ /// Get the bitset for any bits that have been set. This is completely distinct from <see cref="Type()" />, although they do share similar purposes.
+ /// The flags can be used to encode information about the token for use by other <see cref="TokenFilter"/>s.
+ ///
+ ///
+ /// </summary>
+ /// <value> The bits </value>
+ public virtual int Flags
+ {
+ get { return flags; }
+ set { flags = value; }
+ }
+
+ /// <summary> Returns this Token's payload. May be null if no payload was set.</summary>
+ public virtual Payload Payload
+ {
+ get { return payload; }
+ set { payload = value; }
+ }
+
+ /// <summary>Renders the token as "(term,startOffset,endOffset[,type=..][,posIncr=..])",
+ /// omitting type and position increment when they have their default values.
+ /// </summary>
+ public override String ToString()
+ {
+ var sb = new System.Text.StringBuilder();
+ sb.Append('(');
+ InitTermBuffer();
+ // Defensive check only: InitTermBuffer has just allocated the buffer,
+ // so this branch should never be taken.
+ if (termBuffer == null)
+ sb.Append("null");
+ else
+ sb.Append(termBuffer, 0, termLength);
+ sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
+ if (!type.Equals("word"))
+ sb.Append(",type=").Append(type);
+ if (positionIncrement != 1)
+ sb.Append(",posIncr=").Append(positionIncrement);
+ sb.Append(')');
+ return sb.ToString();
+ }
+
+ /// <summary>Resets the term text, payload, flags, and positionIncrement,
+ /// startOffset, endOffset and token type to default.
+ /// </summary>
+ public override void Clear()
+ {
+ payload = null;
+ // Leave termBuffer to allow re-use
+ termLength = 0;
+ positionIncrement = 1;
+ flags = 0;
+ startOffset = endOffset = 0;
+ type = DEFAULT_TYPE;
+ }
+
+ /// <summary>Returns a copy of this token. The term buffer and payload are
+ /// deep-cloned so the copy is independent of this instance.
+ /// </summary>
+ public override System.Object Clone()
+ {
+ var t = (Token) base.Clone();
+ // Do a deep clone
+ if (termBuffer != null)
+ {
+ t.termBuffer = new char[termBuffer.Length];
+ termBuffer.CopyTo(t.termBuffer, 0);
+ }
+ if (payload != null)
+ {
+ t.payload = (Payload) payload.Clone();
+ }
+ return t;
+ }
+
+ /// <summary>Makes a clone, but replaces the term buffer &amp;
+ /// start/end offset in the process. This is more
+ /// efficient than doing a full clone (and then calling
+ /// setTermBuffer) because it saves a wasted copy of the old
+ /// termBuffer.
+ /// </summary>
+ /// <returns>a new Token sharing no mutable state with this one
+ /// (the payload, if any, is deep-cloned)</returns>
+ public virtual Token Clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
+ {
+ var t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset)
+ {positionIncrement = positionIncrement, flags = flags, type = type};
+ if (payload != null)
+ t.payload = (Payload) payload.Clone();
+ return t;
+ }
+
+ /// <summary>Two Tokens are equal when their term text, offsets, flags,
+ /// position increment, type and payload are all equal.
+ /// </summary>
+ public override bool Equals(Object obj)
+ {
+ if (obj == this)
+ return true;
+
+ var other = obj as Token;
+ if (other == null)
+ return false;
+
+ // Normalize both buffers so the char-by-char comparison below is safe.
+ InitTermBuffer();
+ other.InitTermBuffer();
+
+ if (termLength == other.termLength && startOffset == other.startOffset && endOffset == other.endOffset &&
+ flags == other.flags && positionIncrement == other.positionIncrement && SubEqual(type, other.type) &&
+ SubEqual(payload, other.payload))
+ {
+ for (int i = 0; i < termLength; i++)
+ if (termBuffer[i] != other.termBuffer[i])
+ return false;
+ return true;
+ }
+ return false;
+ }
+
+ // Null-safe Equals helper for the type and payload fields.
+ private bool SubEqual(System.Object o1, System.Object o2)
+ {
+ if (o1 == null)
+ return o2 == null;
+ return o1.Equals(o2);
+ }
+
+ /// <summary>Hash code consistent with <see cref="Equals(Object)"/>: combines
+ /// term text, offsets, flags, position increment, type and payload.
+ /// </summary>
+ public override int GetHashCode()
+ {
+ InitTermBuffer();
+ int code = termLength;
+ code = code * 31 + startOffset;
+ code = code * 31 + endOffset;
+ code = code * 31 + flags;
+ code = code * 31 + positionIncrement;
+ code = code * 31 + type.GetHashCode();
+ code = (payload == null?code:code * 31 + payload.GetHashCode());
+ code = code * 31 + ArrayUtil.HashCode(termBuffer, 0, termLength);
+ return code;
+ }
+
+ /// <summary>Like <see cref="Clear"/> but doesn't reset termBuffer/termLength,
+ /// for callers that are about to overwrite the term text anyway.
+ /// </summary>
+ private void ClearNoTermBuffer()
+ {
+ payload = null;
+ positionIncrement = 1;
+ flags = 0;
+ startOffset = endOffset = 0;
+ type = DEFAULT_TYPE;
+ }
+
+ /// <summary>Shorthand for calling <see cref="Clear" />,
+ /// <see cref="SetTermBuffer(char[], int, int)" />,
+ /// <see cref="StartOffset" />,
+ /// <see cref="EndOffset" />,
+ /// <see cref="Type" />
+ /// </summary>
+ /// <returns> this Token instance
+ /// </returns>
+ public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType)
+ {
+ // ClearNoTermBuffer already resets payload, positionIncrement, flags,
+ // offsets and type; the previous explicit "payload = null" and
+ // "positionIncrement = 1" assignments here were redundant.
+ ClearNoTermBuffer();
+ SetTermBuffer(newTermBuffer, newTermOffset, newTermLength);
+ startOffset = newStartOffset;
+ endOffset = newEndOffset;
+ type = newType;
+ return this;
+ }
+
+ /// <summary>Shorthand for calling <see cref="Clear" />,
+ /// <see cref="SetTermBuffer(char[], int, int)" />,
+ /// <see cref="StartOffset" />,
+ /// <see cref="EndOffset" />
+ /// <see cref="Type" /> on Token.DEFAULT_TYPE
+ /// </summary>
+ /// <returns> this Token instance
+ /// </returns>
+ public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
+ {
+ // ClearNoTermBuffer resets everything except the term buffer, which is
+ // overwritten by SetTermBuffer right after.
+ ClearNoTermBuffer();
+ SetTermBuffer(newTermBuffer, newTermOffset, newTermLength);
+ startOffset = newStartOffset;
+ endOffset = newEndOffset;
+ type = DEFAULT_TYPE;
+ return this;
+ }
+
+ /// <summary>Shorthand for calling <see cref="Clear" />,
+ /// <see cref="SetTermBuffer(String)" />,
+ /// <see cref="StartOffset" />,
+ /// <see cref="EndOffset" />
+ /// <see cref="Type" />
+ /// </summary>
+ /// <returns> this Token instance
+ /// </returns>
+ public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset, System.String newType)
+ {
+ ClearNoTermBuffer();
+ SetTermBuffer(newTerm);
+ startOffset = newStartOffset;
+ endOffset = newEndOffset;
+ type = newType;
+ return this;
+ }
+
+ /// <summary>Shorthand for calling <see cref="Clear" />,
+ /// <see cref="SetTermBuffer(String, int, int)" />,
+ /// <see cref="StartOffset" />,
+ /// <see cref="EndOffset" />
+ /// <see cref="Type" />
+ /// </summary>
+ /// <returns> this Token instance
+ /// </returns>
+ public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType)
+ {
+ ClearNoTermBuffer();
+ SetTermBuffer(newTerm, newTermOffset, newTermLength);
+ startOffset = newStartOffset;
+ endOffset = newEndOffset;
+ type = newType;
+ return this;
+ }
+
+ /// <summary>Shorthand for calling <see cref="Clear" />,
+ /// <see cref="SetTermBuffer(String)" />,
+ /// <see cref="StartOffset" />,
+ /// <see cref="EndOffset" />
+ /// <see cref="Type" /> on Token.DEFAULT_TYPE
+ /// </summary>
+ /// <returns> this Token instance
+ /// </returns>
+ public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset)
+ {
+ ClearNoTermBuffer();
+ SetTermBuffer(newTerm);
+ startOffset = newStartOffset;
+ endOffset = newEndOffset;
+ type = DEFAULT_TYPE;
+ return this;
+ }
+
+ /// <summary>Shorthand for calling <see cref="Clear" />,
+ /// <see cref="SetTermBuffer(String, int, int)" />,
+ /// <see cref="StartOffset" />,
+ /// <see cref="EndOffset" />
+ /// <see cref="Type" /> on Token.DEFAULT_TYPE
+ /// </summary>
+ /// <returns> this Token instance
+ /// </returns>
+ public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
+ {
+ ClearNoTermBuffer();
+ SetTermBuffer(newTerm, newTermOffset, newTermLength);
+ startOffset = newStartOffset;
+ endOffset = newEndOffset;
+ type = DEFAULT_TYPE;
+ return this;
+ }
+
+ /// <summary> Copy the prototype token's fields into this one. Note: Payloads are shared
+ /// (the payload reference is copied, not cloned).</summary>
+ /// <param name="prototype">token whose state is copied into this one
+ /// </param>
+ public virtual void Reinit(Token prototype)
+ {
+ prototype.InitTermBuffer();
+ SetTermBuffer(prototype.termBuffer, 0, prototype.termLength);
+ positionIncrement = prototype.positionIncrement;
+ flags = prototype.flags;
+ startOffset = prototype.startOffset;
+ endOffset = prototype.endOffset;
+ type = prototype.type;
+ payload = prototype.payload;
+ }
+
+ /// <summary> Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.</summary>
+ /// <param name="prototype">token supplying everything except the term text
+ /// </param>
+ /// <param name="newTerm">replacement term text
+ /// </param>
+ public virtual void Reinit(Token prototype, System.String newTerm)
+ {
+ // No ClearNoTermBuffer here: every field is overwritten below.
+ SetTermBuffer(newTerm);
+ positionIncrement = prototype.positionIncrement;
+ flags = prototype.flags;
+ startOffset = prototype.startOffset;
+ endOffset = prototype.endOffset;
+ type = prototype.type;
+ payload = prototype.payload;
+ }
+
+ /// <summary> Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.</summary>
+ /// <param name="prototype">token supplying everything except the term text
+ /// </param>
+ /// <param name="newTermBuffer">buffer containing the replacement term text
+ /// </param>
+ /// <param name="offset">index of the first character of the term in the buffer
+ /// </param>
+ /// <param name="length">number of characters making up the term
+ /// </param>
+ public virtual void Reinit(Token prototype, char[] newTermBuffer, int offset, int length)
+ {
+ SetTermBuffer(newTermBuffer, offset, length);
+ positionIncrement = prototype.positionIncrement;
+ flags = prototype.flags;
+ startOffset = prototype.startOffset;
+ endOffset = prototype.endOffset;
+ type = prototype.type;
+ payload = prototype.payload;
+ }
+
+ /// <summary>Copies this Token's state into the target attribute. If the target
+ /// is itself a Token the whole state is transferred via <see cref="Reinit(Token)"/>
+ /// (with the payload cloned); otherwise the individual attribute interfaces
+ /// are populated one by one.
+ /// </summary>
+ public override void CopyTo(Attribute target)
+ {
+ if (target is Token)
+ {
+ var to = (Token) target;
+ to.Reinit(this);
+ // reinit shares the payload, so clone it:
+ if (payload != null)
+ {
+ to.payload = (Payload) payload.Clone();
+ }
+ }
+ else
+ {
+ InitTermBuffer();
+ ((ITermAttribute) target).SetTermBuffer(termBuffer, 0, termLength);
+ ((IOffsetAttribute) target).SetOffset(startOffset, endOffset);
+ ((IPositionIncrementAttribute) target).PositionIncrement = positionIncrement;
+ ((IPayloadAttribute) target).Payload = (payload == null)?null:(Payload) payload.Clone();
+ ((IFlagsAttribute) target).Flags = flags;
+ ((ITypeAttribute) target).Type = type;
+ }
+ }
+
+ ///<summary>
+ /// Convenience factory that returns <c>Token</c> as implementation for the basic
+ /// attributes and return the default impl (with &quot;Impl&quot; appended) for all other
+ /// attributes.
+ /// @since 3.0
+ /// </summary>
+ // readonly: this is a shared singleton; leaving it mutable would let any
+ // caller swap the factory globally (CA2211). Reads are unaffected.
+ public static readonly AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
+ new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+
+ /// <summary>
+ /// <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
+ /// and for all other attributes calls the given delegate factory.
+ /// </summary>
+ public class TokenAttributeFactory : AttributeSource.AttributeFactory
+ {
+
+ private readonly AttributeSource.AttributeFactory _delegateFactory;
+
+ /// <summary>
+ /// <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
+ /// and for all other attributes calls the given delegate factory.
+ /// </summary>
+ /// <param name="delegateFactory">factory used for attributes Token does not implement</param>
+ public TokenAttributeFactory(AttributeSource.AttributeFactory delegateFactory)
+ {
+ this._delegateFactory = delegateFactory;
+ }
+
+ /// <summary>Returns a new Token when it satisfies the requested attribute
+ /// interface; otherwise defers to the delegate factory.
+ /// </summary>
+ public override Attribute CreateAttributeInstance<T>()
+ {
+ return typeof(T).IsAssignableFrom(typeof(Token))
+ ? new Token()
+ : _delegateFactory.CreateAttributeInstance<T>();
+ }
+
+ /// <summary>Factories are equal when their delegate factories are equal.</summary>
+ public override bool Equals(Object other)
+ {
+ if (this == other) return true;
+
+ var af = other as TokenAttributeFactory;
+ return af != null && _delegateFactory.Equals(af._delegateFactory);
+ }
+
+ public override int GetHashCode()
+ {
+ // Arbitrary constant XORed in so this factory's hash differs from the
+ // delegate's own hash, consistent with Equals above.
+ return _delegateFactory.GetHashCode() ^ 0x0a45aa31;
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/TokenFilter.cs b/src/core/Analysis/TokenFilter.cs
new file mode 100644
index 0000000..7483c82
--- /dev/null
+++ b/src/core/Analysis/TokenFilter.cs
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> A TokenFilter is a TokenStream whose input is another TokenStream.
+ /// <p/>
+ /// This is an abstract class; subclasses must override <see cref="TokenStream.IncrementToken()" />.
+ ///
+ /// </summary>
+ /// <seealso cref="TokenStream">
+ /// </seealso>
+ public abstract class TokenFilter:TokenStream
+ {
+ /// <summary>The source of tokens for this filter. </summary>
+ protected internal TokenStream input;
+
+ // Guards against running the disposal logic twice.
+ private bool isDisposed;
+
+ /// <summary>Construct a token stream filtering the given input. </summary>
+ protected internal TokenFilter(TokenStream input):base(input)
+ {
+ this.input = input;
+ }
+
+ /// <summary>Performs end-of-stream operations, if any, and calls then <c>end()</c> on the
+ /// input TokenStream.<p/>
+ /// <b>NOTE:</b> Be sure to call <c>super.end()</c> first when overriding this method.
+ /// </summary>
+ public override void End()
+ {
+ input.End();
+ }
+
+ /// <summary>Disposes the wrapped input stream exactly once.</summary>
+ protected override void Dispose(bool disposing)
+ {
+ if (isDisposed) return;
+
+ if (disposing)
+ {
+ if (input != null)
+ {
+ // Call Dispose() directly instead of the [Obsolete] Close() alias.
+ input.Dispose();
+ }
+ }
+
+ // The input reference is intentionally kept (not nulled) so chains that
+ // are reset and reused after disposal keep working, as before.
+ //input = null;
+ isDisposed = true;
+ }
+
+ /// <summary>Reset the filter as well as the input TokenStream. </summary>
+ public override void Reset()
+ {
+ input.Reset();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/TokenStream.cs b/src/core/Analysis/TokenStream.cs
new file mode 100644
index 0000000..c624696
--- /dev/null
+++ b/src/core/Analysis/TokenStream.cs
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+using Document = Lucene.Net.Documents.Document;
+using Field = Lucene.Net.Documents.Field;
+using IndexWriter = Lucene.Net.Index.IndexWriter;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> A <c>TokenStream</c> enumerates the sequence of tokens, either from
+ /// <see cref="Field" />s of a <see cref="Document" /> or from query text.
+ /// <p/>
+ /// This is an abstract class. Concrete subclasses are:
+ /// <list type="bullet">
+ /// <item><see cref="Tokenizer" />, a <c>TokenStream</c> whose input is a Reader; and</item>
+ /// <item><see cref="TokenFilter" />, a <c>TokenStream</c> whose input is another
+ /// <c>TokenStream</c>.</item>
+ /// </list>
+ /// A new <c>TokenStream</c> API has been introduced with Lucene 2.9. This API
+ /// has moved from being <see cref="Token" /> based to <see cref="IAttribute" /> based. While
+ /// <see cref="Token" /> still exists in 2.9 as a convenience class, the preferred way
+ /// to store the information of a <see cref="Token" /> is to use <see cref="Util.Attribute" />s.
+ /// <p/>
+ /// <c>TokenStream</c> now extends <see cref="AttributeSource" />, which provides
+ /// access to all of the token <see cref="IAttribute" />s for the <c>TokenStream</c>.
+ /// Note that only one instance per <see cref="Util.Attribute" /> is created and reused
+ /// for every token. This approach reduces object creation and allows local
+ /// caching of references to the <see cref="Util.Attribute" />s. See
+ /// <see cref="IncrementToken()" /> for further details.
+ /// <p/>
+ /// <b>The workflow of the new <c>TokenStream</c> API is as follows:</b>
+ /// <list type="bullet">
+ /// <item>Instantiation of <c>TokenStream</c>/<see cref="TokenFilter" />s which add/get
+ /// attributes to/from the <see cref="AttributeSource" />.</item>
+ /// <item>The consumer calls <see cref="TokenStream.Reset()" />.</item>
+ /// <item>The consumer retrieves attributes from the stream and stores local
+ /// references to all attributes it wants to access</item>
+ /// <item>The consumer calls <see cref="IncrementToken()" /> until it returns false and
+ /// consumes the attributes after each call.</item>
+ /// <item>The consumer calls <see cref="End()" /> so that any end-of-stream operations
+ /// can be performed.</item>
+ /// <item>The consumer calls <see cref="Close()" /> to release any resource when finished
+ /// using the <c>TokenStream</c></item>
+ /// </list>
+ /// To make sure that filters and consumers know which attributes are available,
+ /// the attributes must be added during instantiation. Filters and consumers are
+ /// not required to check for availability of attributes in
+ /// <see cref="IncrementToken()" />.
+ /// <p/>
+ /// You can find some example code for the new API in the analysis package level
+ /// Javadoc.
+ /// <p/>
+ /// Sometimes it is desirable to capture a current state of a <c>TokenStream</c>
+ /// , e. g. for buffering purposes (see <see cref="CachingTokenFilter" />,
+ /// <see cref="TeeSinkTokenFilter" />). For this usecase
+ /// <see cref="AttributeSource.CaptureState" /> and <see cref="AttributeSource.RestoreState" />
+ /// can be used.
+ /// </summary>
+ public abstract class TokenStream : AttributeSource, IDisposable
+ {
+ /// <summary> A TokenStream using the default attribute factory.</summary>
+ protected internal TokenStream()
+ { }
+
+ /// <summary> A TokenStream that uses the same attributes as the supplied one.</summary>
+ protected internal TokenStream(AttributeSource input)
+ : base(input)
+ { }
+
+ /// <summary> A TokenStream using the supplied AttributeFactory for creating new <see cref="IAttribute" /> instances.</summary>
+ protected internal TokenStream(AttributeFactory factory)
+ : base(factory)
+ { }
+
+ /// <summary> Consumers (i.e., <see cref="IndexWriter" />) use this method to advance the stream to
+ /// the next token. Implementing classes must implement this method and update
+ /// the appropriate <see cref="Util.Attribute" />s with the attributes of the next
+ /// token.
+ ///
+ /// The producer must make no assumptions about the attributes after the
+ /// method has been returned: the caller may arbitrarily change it. If the
+ /// producer needs to preserve the state for subsequent calls, it can use
+ /// <see cref="AttributeSource.CaptureState" /> to create a copy of the current attribute state.
+ ///
+ /// This method is called for every token of a document, so an efficient
+ /// implementation is crucial for good performance. To avoid calls to
+ /// <see cref="AttributeSource.AddAttribute{T}()" /> and <see cref="AttributeSource.GetAttribute{T}()" />,
+ /// references to all <see cref="Util.Attribute" />s that this stream uses should be
+ /// retrieved during instantiation.
+ ///
+ /// To ensure that filters and consumers know which attributes are available,
+ /// the attributes must be added during instantiation. Filters and consumers
+ /// are not required to check for availability of attributes in
+ /// <see cref="IncrementToken()" />.
+ ///
+ /// </summary>
+ /// <returns> false for end of stream; true otherwise</returns>
+ public abstract bool IncrementToken();
+
+ /// <summary> This method is called by the consumer after the last token has been
+ /// consumed, after <see cref="IncrementToken" /> returned <c>false</c>
+ /// (using the new <c>TokenStream</c> API). Streams implementing the old API
+ /// should upgrade to use this feature.
+ /// <p/>
+ /// This method can be used to perform any end-of-stream operations, such as
+ /// setting the final offset of a stream. The final offset of a stream might
+ /// differ from the offset of the last token eg in case one or more whitespaces
+ /// followed after the last token, but a <see cref="WhitespaceTokenizer" /> was used.
+ ///
+ /// </summary>
+ /// <throws> IOException </throws>
+ public virtual void End()
+ {
+ // do nothing by default
+ }
+
+ /// <summary> Resets this stream to the beginning. This is an optional operation, so
+ /// subclasses may or may not implement this method. <see cref="Reset()" /> is not needed for
+ /// the standard indexing process. However, if the tokens of a
+ /// <c>TokenStream</c> are intended to be consumed more than once, it is
+ /// necessary to implement <see cref="Reset()" />. Note that if your TokenStream
+ /// caches tokens and feeds them back again after a reset, it is imperative
+ /// that you clone the tokens when you store them away (on the first pass) as
+ /// well as when you return them (on future passes after <see cref="Reset()" />).
+ /// </summary>
+ public virtual void Reset()
+ {
+ }
+
+ /// <summary>Releases resources associated with this stream. </summary>
+ [Obsolete("Use Dispose() instead")]
+ public void Close()
+ {
+ Dispose();
+ }
+
+ public void Dispose()
+ {
+ Dispose(true);
+ // Standard dispose pattern (CA1816): suppress finalization in case a
+ // subclass introduces a finalizer.
+ GC.SuppressFinalize(this);
+ }
+
+ protected abstract void Dispose(bool disposing);
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/FlagsAttribute.cs b/src/core/Analysis/Tokenattributes/FlagsAttribute.cs
new file mode 100644
index 0000000..b5c4b7b
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/FlagsAttribute.cs
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Attribute = Lucene.Net.Util.Attribute;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> This attribute can be used to pass different flags down the tokenizer chain,
+ /// eg from one TokenFilter to another one.
+ /// </summary>
+ [Serializable]
+ public class FlagsAttribute:Util.Attribute, IFlagsAttribute, System.ICloneable
+ {
+ // Backing field; 0 means no flags set.
+ private int flags = 0;
+
+ /// <summary> EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
+ /// <p/>
+ ///
+ /// Get the bitset for any bits that have been set. This is completely distinct from <see cref="ITypeAttribute.Type()" />, although they do share similar purposes.
+ /// The flags can be used to encode information about the token for use by other <see cref="Lucene.Net.Analysis.TokenFilter" />s.
+ ///
+ ///
+ /// </summary>
+ /// <value> The bits </value>
+ public virtual int Flags
+ {
+ get { return flags; }
+ set { this.flags = value; }
+ }
+
+ /// <summary>Resets the flags to their default (0).</summary>
+ public override void Clear()
+ {
+ flags = 0;
+ }
+
+ /// <summary>Two FlagsAttributes are equal when their flag bits are equal.</summary>
+ public override bool Equals(System.Object other)
+ {
+ if (this == other)
+ {
+ return true;
+ }
+
+ if (other is FlagsAttribute)
+ {
+ return ((FlagsAttribute) other).flags == flags;
+ }
+
+ return false;
+ }
+
+ public override int GetHashCode()
+ {
+ // flags fully determines equality, so it is a valid hash by itself.
+ return flags;
+ }
+
+ /// <summary>Copies the flag bits into the target attribute.</summary>
+ public override void CopyTo(Attribute target)
+ {
+ IFlagsAttribute t = (IFlagsAttribute) target;
+ t.Flags = flags;
+ }
+
+ /// <summary>Returns an independent copy carrying the same flag bits.</summary>
+ override public System.Object Clone()
+ {
+ FlagsAttribute impl = new FlagsAttribute();
+ impl.flags = this.flags;
+ return impl;
+ }
+ }
diff --git a/src/core/Analysis/Tokenattributes/IFlagsAttribute.cs b/src/core/Analysis/Tokenattributes/IFlagsAttribute.cs
new file mode 100644
index 0000000..24b2bea
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/IFlagsAttribute.cs
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+using Tokenizer = Lucene.Net.Analysis.Tokenizer;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> This attribute can be used to pass different flags down the <see cref="Tokenizer" /> chain,
+ /// eg from one TokenFilter to another one.
+ /// </summary>
+ /// <summary> This attribute can be used to pass different flags down the <see cref="Tokenizer" /> chain,
+ /// eg from one TokenFilter to another one. The flag value itself is an opaque bitset
+ /// whose meaning is defined by the filters that produce and consume it.
+ /// </summary>
+ public interface IFlagsAttribute:IAttribute
+ {
+ /// <summary> EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
+ /// <p/>
+ ///
+ /// Gets or sets the bitset for any bits that have been set. This is completely distinct from <see cref="ITypeAttribute.Type()" />, although they do share similar purposes.
+ /// The flags can be used to encode information about the token for use by other <see cref="Lucene.Net.Analysis.TokenFilter" />s.
+ ///
+ /// </summary>
+ /// <value> The bits </value>
+ int Flags { get; set; }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/IOffsetAttribute.cs b/src/core/Analysis/Tokenattributes/IOffsetAttribute.cs
new file mode 100644
index 0000000..ffbbe02
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/IOffsetAttribute.cs
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> The start and end character offset of a Token. </summary>
+ /// <summary> The start and end character offset of a Token, measured in characters
+ /// within the original source text. </summary>
+ public interface IOffsetAttribute : IAttribute
+ {
+ /// <summary>Returns this Token's starting offset, the position of the first character
+ /// corresponding to this token in the source text.
+ /// Note that the difference between <see cref="EndOffset" /> and <see cref="StartOffset" /> may not be
+ /// equal to the term text's length, as the term text may have been altered by a
+ /// stemmer or some other filter.
+ /// </summary>
+ int StartOffset { get; }
+
+
+ /// <summary>Sets the starting and ending offset in one call.
+ /// See <see cref="StartOffset" /> and <see cref="EndOffset" /> for the meaning of each value.
+ /// </summary>
+ void SetOffset(int startOffset, int endOffset);
+
+
+ /// <summary>Returns this Token's ending offset, one greater than the position of the
+ /// last character corresponding to this token in the source text. The length
+ /// of the token in the source text is (endOffset - startOffset).
+ /// </summary>
+ int EndOffset { get; }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/IPayloadAttribute.cs b/src/core/Analysis/Tokenattributes/IPayloadAttribute.cs
new file mode 100644
index 0000000..7e313ce
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/IPayloadAttribute.cs
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+using Payload = Lucene.Net.Index.Payload;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> The payload of a Token. See also <see cref="Payload" />.</summary>
+ /// <summary> The payload of a Token. See also <see cref="Payload" />.</summary>
+ public interface IPayloadAttribute:IAttribute
+ {
+ /// <summary> Gets or sets this Token's payload. May be null when no payload has been set.</summary>
+ Payload Payload { get; set; }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/IPositionIncrementAttribute.cs b/src/core/Analysis/Tokenattributes/IPositionIncrementAttribute.cs
new file mode 100644
index 0000000..6c2a131
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/IPositionIncrementAttribute.cs
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary>The positionIncrement determines the position of this token
+ /// relative to the previous Token in a TokenStream, used in phrase
+ /// searching.
+ ///
+ /// <p/>The default value is one.
+ ///
+ /// <p/>Some common uses for this are:<list>
+ ///
+ /// <item>Set it to zero to put multiple terms in the same position. This is
+ /// useful if, e.g., a word has multiple stems. Searches for phrases
+ /// including either stem will match. In this case, all but the first stem's
+ /// increment should be set to zero: the increment of the first instance
+ /// should be one. Repeating a token with an increment of zero can also be
+ /// used to boost the scores of matches on that token.</item>
+ ///
+ /// <item>Set it to values greater than one to inhibit exact phrase matches.
+ /// If, for example, one does not want phrases to match across removed stop
+ /// words, then one could build a stop word filter that removes stop words and
+ /// also sets the increment to the number of stop words removed before each
+ /// non-stop word. Then exact phrase queries will only match when the terms
+ /// occur with no intervening stop words.</item>
+ ///
+ /// </list>
+ ///
+ /// </summary>
+ /// <seealso cref="Lucene.Net.Index.TermPositions">
+ /// </seealso>
+ public interface IPositionIncrementAttribute:IAttribute
+ {
+ /// <summary>Gets or sets the position increment relative to the previous token.
+ /// The default value is one.
+ /// </summary>
+ /// <value> the distance from the prior term </value>
+ int PositionIncrement { set; get; }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/ITermAttribute.cs b/src/core/Analysis/Tokenattributes/ITermAttribute.cs
new file mode 100644
index 0000000..8f9b030
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/ITermAttribute.cs
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> The term text of a Token.</summary>
+ /// <summary> The term text of a Token, exposed both as a String and as a reusable char[] buffer.</summary>
+ public interface ITermAttribute:IAttribute
+ {
+ /// <summary>Returns the Token's term text.
+ ///
+ /// This method has a performance penalty
+ /// because the text is stored internally in a char[]. If
+ /// possible, use <see cref="TermBuffer()" /> and <see cref="TermLength()" />
+ /// directly instead. If you really need a
+ /// String, use this method, which is nothing more than
+ /// a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
+ /// </summary>
+ string Term { get; }
+
+ /// <summary>Copies the contents of buffer, starting at offset for
+ /// length characters, into the termBuffer array.
+ /// </summary>
+ /// <param name="buffer">the buffer to copy
+ /// </param>
+ /// <param name="offset">the index in the buffer of the first character to copy
+ /// </param>
+ /// <param name="length">the number of characters to copy
+ /// </param>
+ void SetTermBuffer(char[] buffer, int offset, int length);
+
+ /// <summary>Copies the entire contents of buffer into the termBuffer array.</summary>
+ /// <param name="buffer">the buffer to copy
+ /// </param>
+ void SetTermBuffer(System.String buffer);
+
+ /// <summary>Copies the contents of buffer, starting at offset and continuing
+ /// for length characters, into the termBuffer array.
+ /// </summary>
+ /// <param name="buffer">the buffer to copy
+ /// </param>
+ /// <param name="offset">the index in the buffer of the first character to copy
+ /// </param>
+ /// <param name="length">the number of characters to copy
+ /// </param>
+ void SetTermBuffer(System.String buffer, int offset, int length);
+
+ /// <summary>Returns the internal termBuffer character array which
+ /// you can then directly alter. If the array is too
+ /// small for your token, use <see cref="ResizeTermBuffer(int)" />
+ /// to increase it. After
+ /// altering the buffer be sure to call <see cref="SetTermLength" />
+ /// to record the number of valid
+ /// characters that were placed into the termBuffer.
+ /// </summary>
+ char[] TermBuffer();
+
+ /// <summary>Grows the termBuffer to at least size newSize, preserving the
+ /// existing content. Note: If the next operation is to change
+ /// the contents of the term buffer use
+ /// <see cref="SetTermBuffer(char[], int, int)" />,
+ /// <see cref="SetTermBuffer(String)" />, or
+ /// <see cref="SetTermBuffer(String, int, int)" />
+ /// to optimally combine the resize with the setting of the termBuffer.
+ /// </summary>
+ /// <param name="newSize">minimum size of the new termBuffer
+ /// </param>
+ /// <returns> newly created termBuffer with length >= newSize
+ /// </returns>
+ char[] ResizeTermBuffer(int newSize);
+
+ /// <summary>Return number of valid characters (length of the term)
+ /// in the termBuffer array.
+ /// </summary>
+ int TermLength();
+
+ /// <summary>Set number of valid characters (length of the term) in
+ /// the termBuffer array. Use this to truncate the termBuffer
+ /// or to synchronize with external manipulation of the termBuffer.
+ /// Note: to grow the size of the array,
+ /// use <see cref="ResizeTermBuffer(int)" /> first.
+ /// </summary>
+ /// <param name="length">the truncated length
+ /// </param>
+ void SetTermLength(int length);
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/ITypeAttribute.cs b/src/core/Analysis/Tokenattributes/ITypeAttribute.cs
new file mode 100644
index 0000000..48bcc10
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/ITypeAttribute.cs
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> A Token's lexical type. The Default value is "word". </summary>
+ /// <summary> A Token's lexical type. The Default value is "word". </summary>
+ public interface ITypeAttribute:IAttribute
+ {
+ /// <summary>Gets or sets this Token's lexical type. Defaults to "word". </summary>
+ string Type { get; set; }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/OffsetAttribute.cs b/src/core/Analysis/Tokenattributes/OffsetAttribute.cs
new file mode 100644
index 0000000..5149559
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/OffsetAttribute.cs
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Attribute = Lucene.Net.Util.Attribute;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> The start and end character offset of a Token. </summary>
+ [Serializable]
+ /// <summary> Default implementation of <see cref="IOffsetAttribute" />: records the
+ /// start and end character offset of a Token within the source text.
+ /// </summary>
+ [Serializable]
+ public class OffsetAttribute:Attribute, IOffsetAttribute, System.ICloneable
+ {
+ 	private int startOffset;
+ 	private int endOffset;
+ 	
+ 	/// <summary>This Token's starting offset: the position of the first character
+ 	/// corresponding to this token in the source text. Note that (endOffset - startOffset)
+ 	/// may differ from the term text's length, as the term may have been altered by a
+ 	/// stemmer or some other filter.
+ 	/// </summary>
+ 	public virtual int StartOffset
+ 	{
+ 		get { return startOffset; }
+ 	}
+ 	
+ 	/// <summary>Sets the starting and ending offset in a single call.</summary>
+ 	public virtual void SetOffset(int startOffset, int endOffset)
+ 	{
+ 		this.startOffset = startOffset;
+ 		this.endOffset = endOffset;
+ 	}
+ 	
+ 	/// <summary>This Token's ending offset: one greater than the position of the last
+ 	/// character of the token in the source text. The token's length in the source
+ 	/// text is (endOffset - startOffset).
+ 	/// </summary>
+ 	public virtual int EndOffset
+ 	{
+ 		get { return endOffset; }
+ 	}
+ 	
+ 	/// <summary>Resets both offsets to zero.</summary>
+ 	public override void Clear()
+ 	{
+ 		startOffset = 0;
+ 		endOffset = 0;
+ 	}
+ 	
+ 	public override bool Equals(System.Object other)
+ 	{
+ 		if (ReferenceEquals(other, this))
+ 		{
+ 			return true;
+ 		}
+ 		
+ 		OffsetAttribute that = other as OffsetAttribute;
+ 		return that != null && that.startOffset == startOffset && that.endOffset == endOffset;
+ 	}
+ 	
+ 	public override int GetHashCode()
+ 	{
+ 		// Same combination as before: startOffset * 31 + endOffset.
+ 		return startOffset * 31 + endOffset;
+ 	}
+ 	
+ 	/// <summary>Copies both offsets into <paramref name="target" />.</summary>
+ 	public override void CopyTo(Attribute target)
+ 	{
+ 		((IOffsetAttribute) target).SetOffset(startOffset, endOffset);
+ 	}
+ 	
+ 	override public System.Object Clone()
+ 	{
+ 		OffsetAttribute clone = new OffsetAttribute();
+ 		clone.startOffset = startOffset;
+ 		clone.endOffset = endOffset;
+ 		return clone;
+ 	}
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/PayloadAttribute.cs b/src/core/Analysis/Tokenattributes/PayloadAttribute.cs
new file mode 100644
index 0000000..ae1c4d9
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/PayloadAttribute.cs
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Attribute = Lucene.Net.Util.Attribute;
+using Payload = Lucene.Net.Index.Payload;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> The payload of a Token. See also <see cref="Payload" />.</summary>
+ [Serializable]
+ /// <summary> Default implementation of <see cref="IPayloadAttribute" />: the payload
+ /// of a Token. See also <see cref="Payload" />. The payload may be null.
+ /// </summary>
+ [Serializable]
+ public class PayloadAttribute:Attribute, IPayloadAttribute, System.ICloneable
+ {
+ 	private Payload payload;
+ 	
+ 	/// <summary> Initialize this attribute with no payload.</summary>
+ 	public PayloadAttribute()
+ 	{
+ 	}
+ 	
+ 	/// <summary> Initialize this attribute with the given payload. </summary>
+ 	public PayloadAttribute(Payload payload)
+ 	{
+ 		this.payload = payload;
+ 	}
+ 	
+ 	/// <summary> Gets or sets this Token's payload. May be null.</summary>
+ 	public virtual Payload Payload
+ 	{
+ 		get { return this.payload; }
+ 		set { this.payload = value; }
+ 	}
+ 	
+ 	/// <summary>Resets the payload to null.</summary>
+ 	public override void Clear()
+ 	{
+ 		payload = null;
+ 	}
+ 	
+ 	/// <summary>Deep-clones this attribute: the payload itself is cloned when present,
+ 	/// so the copy does not share payload state with the original.</summary>
+ 	public override System.Object Clone()
+ 	{
+ 		var clone = (PayloadAttribute) base.Clone();
+ 		if (payload != null)
+ 		{
+ 			clone.payload = (Payload) payload.Clone();
+ 		}
+ 		return clone;
+ 	}
+ 	
+ 	public override bool Equals(System.Object other)
+ 	{
+ 		if (other == this)
+ 		{
+ 			return true;
+ 		}
+ 		
+ 		// Check the concrete type, not the interface: the previous test for
+ 		// IPayloadAttribute followed by a hard cast to PayloadAttribute threw
+ 		// InvalidCastException for other implementations of the interface.
+ 		// This also matches the sibling attribute classes (eg OffsetAttribute).
+ 		if (other is PayloadAttribute)
+ 		{
+ 			PayloadAttribute o = (PayloadAttribute) other;
+ 			if (o.payload == null || payload == null)
+ 			{
+ 				// Equal only when both payloads are null.
+ 				return o.payload == null && payload == null;
+ 			}
+ 			
+ 			return o.payload.Equals(payload);
+ 		}
+ 		
+ 		return false;
+ 	}
+ 	
+ 	public override int GetHashCode()
+ 	{
+ 		// Null payload hashes to 0, consistent with Equals above.
+ 		return (payload == null)?0:payload.GetHashCode();
+ 	}
+ 	
+ 	/// <summary>Copies a clone of the payload (or null) into <paramref name="target" />.</summary>
+ 	public override void CopyTo(Attribute target)
+ 	{
+ 		IPayloadAttribute t = (IPayloadAttribute) target;
+ 		t.Payload = (payload == null)?null:(Payload) payload.Clone();
+ 	}
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/PositionIncrementAttribute.cs b/src/core/Analysis/Tokenattributes/PositionIncrementAttribute.cs
new file mode 100644
index 0000000..4f7a04f
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/PositionIncrementAttribute.cs
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Attribute = Lucene.Net.Util.Attribute;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary>The positionIncrement determines the position of this token
+ /// relative to the previous Token in a <see cref="TokenStream" />, used in phrase
+ /// searching.
+ ///
+ /// <p/>The default value is one.
+ ///
+ /// <p/>Some common uses for this are:<list>
+ ///
+ /// <item>Set it to zero to put multiple terms in the same position. This is
+ /// useful if, e.g., a word has multiple stems. Searches for phrases
+ /// including either stem will match. In this case, all but the first stem's
+ /// increment should be set to zero: the increment of the first instance
+ /// should be one. Repeating a token with an increment of zero can also be
+ /// used to boost the scores of matches on that token.</item>
+ ///
+ /// <item>Set it to values greater than one to inhibit exact phrase matches.
+ /// If, for example, one does not want phrases to match across removed stop
+ /// words, then one could build a stop word filter that removes stop words and
+ /// also sets the increment to the number of stop words removed before each
+ /// non-stop word. Then exact phrase queries will only match when the terms
+ /// occur with no intervening stop words.</item>
+ ///
+ /// </list>
+ /// </summary>
+ [Serializable]
+ /// <summary> Default implementation of <see cref="IPositionIncrementAttribute" />:
+ /// the position of this token relative to the previous Token in a
+ /// <see cref="TokenStream" />, used in phrase searching. Defaults to one; zero puts
+ /// multiple terms at the same position, values greater than one leave gaps
+ /// (eg for removed stop words).
+ /// </summary>
+ [Serializable]
+ public class PositionIncrementAttribute:Attribute, IPositionIncrementAttribute, System.ICloneable
+ {
+ 	private int positionIncrement = 1;
+ 	
+ 	/// <summary>Gets or sets the position increment. The default value is one.
+ 	/// </summary>
+ 	/// <value> the distance from the prior term </value>
+ 	public virtual int PositionIncrement
+ 	{
+ 		set
+ 		{
+ 			// Negative increments are meaningless; reject them up front.
+ 			if (value < 0)
+ 				throw new System.ArgumentException("Increment must be zero or greater: " + value);
+ 			this.positionIncrement = value;
+ 		}
+ 		get { return positionIncrement; }
+ 	}
+ 	
+ 	/// <summary>Resets the increment to its default of one.</summary>
+ 	public override void Clear()
+ 	{
+ 		this.positionIncrement = 1;
+ 	}
+ 	
+ 	public override bool Equals(System.Object other)
+ 	{
+ 		if (ReferenceEquals(other, this))
+ 		{
+ 			return true;
+ 		}
+ 		
+ 		PositionIncrementAttribute that = other as PositionIncrementAttribute;
+ 		return that != null && that.positionIncrement == positionIncrement;
+ 	}
+ 	
+ 	public override int GetHashCode()
+ 	{
+ 		return positionIncrement;
+ 	}
+ 	
+ 	/// <summary>Copies the increment into <paramref name="target" />.</summary>
+ 	public override void CopyTo(Attribute target)
+ 	{
+ 		((IPositionIncrementAttribute) target).PositionIncrement = positionIncrement;
+ 	}
+ 	
+ 	override public System.Object Clone()
+ 	{
+ 		PositionIncrementAttribute clone = new PositionIncrementAttribute();
+ 		clone.positionIncrement = positionIncrement;
+ 		return clone;
+ 	}
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/TermAttribute.cs b/src/core/Analysis/Tokenattributes/TermAttribute.cs
new file mode 100644
index 0000000..f95402c
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/TermAttribute.cs
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Support;
+using ArrayUtil = Lucene.Net.Util.ArrayUtil;
+using Attribute = Lucene.Net.Util.Attribute;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> The term text of a Token.</summary>
+ [Serializable]
+ /// <summary> Default implementation of <see cref="ITermAttribute" />: the term text of a
+ /// Token, held in a reusable char[] buffer of which only the first termLength
+ /// characters are valid.
+ /// </summary>
+ [Serializable]
+ public class TermAttribute:Attribute, ITermAttribute, System.ICloneable
+ {
+ 	// Smallest buffer ever allocated; avoids repeated growth for short terms.
+ 	private const int MIN_BUFFER_SIZE = 10;
+ 	
+ 	private char[] termBuffer;
+ 	private int termLength;
+ 	
+ 	/// <summary>Returns the Token's term text.
+ 	///
+ 	/// This method has a performance penalty
+ 	/// because the text is stored internally in a char[]. If
+ 	/// possible, use <see cref="TermBuffer()" /> and
+ 	/// <see cref="TermLength()" /> directly instead. If you
+ 	/// really need a String, use this method, which is nothing more than
+ 	/// a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
+ 	/// </summary>
+ 	public virtual string Term
+ 	{
+ 		get
+ 		{
+ 			InitTermBuffer();
+ 			return new System.String(termBuffer, 0, termLength);
+ 		}
+ 	}
+ 	
+ 	/// <summary>Copies the contents of buffer, starting at offset for
+ 	/// length characters, into the termBuffer array.
+ 	/// </summary>
+ 	/// <param name="buffer">the buffer to copy
+ 	/// </param>
+ 	/// <param name="offset">the index in the buffer of the first character to copy
+ 	/// </param>
+ 	/// <param name="length">the number of characters to copy
+ 	/// </param>
+ 	public virtual void SetTermBuffer(char[] buffer, int offset, int length)
+ 	{
+ 		GrowTermBuffer(length);
+ 		Array.Copy(buffer, offset, termBuffer, 0, length);
+ 		termLength = length;
+ 	}
+ 	
+ 	/// <summary>Copies the contents of buffer into the termBuffer array.</summary>
+ 	/// <param name="buffer">the buffer to copy
+ 	/// </param>
+ 	public virtual void SetTermBuffer(System.String buffer)
+ 	{
+ 		int length = buffer.Length;
+ 		GrowTermBuffer(length);
+ 		TextSupport.GetCharsFromString(buffer, 0, length, termBuffer, 0);
+ 		termLength = length;
+ 	}
+ 	
+ 	/// <summary>Copies the contents of buffer, starting at offset and continuing
+ 	/// for length characters, into the termBuffer array.
+ 	/// </summary>
+ 	/// <param name="buffer">the buffer to copy
+ 	/// </param>
+ 	/// <param name="offset">the index in the buffer of the first character to copy
+ 	/// </param>
+ 	/// <param name="length">the number of characters to copy
+ 	/// </param>
+ 	public virtual void SetTermBuffer(System.String buffer, int offset, int length)
+ 	{
+ 		System.Diagnostics.Debug.Assert(offset <= buffer.Length);
+ 		System.Diagnostics.Debug.Assert(offset + length <= buffer.Length);
+ 		GrowTermBuffer(length);
+ 		TextSupport.GetCharsFromString(buffer, offset, offset + length, termBuffer, 0);
+ 		termLength = length;
+ 	}
+ 	
+ 	/// <summary>Returns the internal termBuffer character array which
+ 	/// you can then directly alter. If the array is too
+ 	/// small for your token, use <see cref="ResizeTermBuffer(int)" />
+ 	/// to increase it. After
+ 	/// altering the buffer be sure to call <see cref="SetTermLength" />
+ 	/// to record the number of valid
+ 	/// characters that were placed into the termBuffer.
+ 	/// </summary>
+ 	public virtual char[] TermBuffer()
+ 	{
+ 		InitTermBuffer();
+ 		return termBuffer;
+ 	}
+ 	
+ 	/// <summary>Grows the termBuffer to at least size newSize, preserving the
+ 	/// existing content. Note: If the next operation is to change
+ 	/// the contents of the term buffer use
+ 	/// <see cref="SetTermBuffer(char[], int, int)" />,
+ 	/// <see cref="SetTermBuffer(String)" />, or
+ 	/// <see cref="SetTermBuffer(String, int, int)" />
+ 	/// to optimally combine the resize with the setting of the termBuffer.
+ 	/// </summary>
+ 	/// <param name="newSize">minimum size of the new termBuffer
+ 	/// </param>
+ 	/// <returns> newly created termBuffer with length >= newSize
+ 	/// </returns>
+ 	public virtual char[] ResizeTermBuffer(int newSize)
+ 	{
+ 		if (termBuffer == null)
+ 		{
+ 			// The buffer is always at least MIN_BUFFER_SIZE
+ 			termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE?MIN_BUFFER_SIZE:newSize)];
+ 		}
+ 		else
+ 		{
+ 			if (termBuffer.Length < newSize)
+ 			{
+ 				// Not big enough; create a new array with slight
+ 				// over allocation and preserve content
+ 				char[] newCharBuffer = new char[ArrayUtil.GetNextSize(newSize)];
+ 				Array.Copy(termBuffer, 0, newCharBuffer, 0, termBuffer.Length);
+ 				termBuffer = newCharBuffer;
+ 			}
+ 		}
+ 		return termBuffer;
+ 	}
+ 	
+ 	/// <summary>Allocates a buffer char[] of at least newSize, without preserving the
+ 	/// existing content. Only used from the SetTermBuffer overloads, which overwrite
+ 	/// the content immediately afterwards.
+ 	/// </summary>
+ 	/// <param name="newSize">minimum size of the buffer
+ 	/// </param>
+ 	private void GrowTermBuffer(int newSize)
+ 	{
+ 		if (termBuffer == null)
+ 		{
+ 			// The buffer is always at least MIN_BUFFER_SIZE
+ 			termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE?MIN_BUFFER_SIZE:newSize)];
+ 		}
+ 		else
+ 		{
+ 			if (termBuffer.Length < newSize)
+ 			{
+ 				// Not big enough; create a new array with slight
+ 				// over allocation:
+ 				termBuffer = new char[ArrayUtil.GetNextSize(newSize)];
+ 			}
+ 		}
+ 	}
+ 	
+ 	// Lazily allocates the buffer on first use so an unused attribute stays cheap.
+ 	private void InitTermBuffer()
+ 	{
+ 		if (termBuffer == null)
+ 		{
+ 			termBuffer = new char[ArrayUtil.GetNextSize(MIN_BUFFER_SIZE)];
+ 			termLength = 0;
+ 		}
+ 	}
+ 	
+ 	/// <summary>Return number of valid characters (length of the term)
+ 	/// in the termBuffer array.
+ 	/// </summary>
+ 	public virtual int TermLength()
+ 	{
+ 		return termLength;
+ 	}
+ 	
+ 	/// <summary>Set number of valid characters (length of the term) in
+ 	/// the termBuffer array. Use this to truncate the termBuffer
+ 	/// or to synchronize with external manipulation of the termBuffer.
+ 	/// Note: to grow the size of the array,
+ 	/// use <see cref="ResizeTermBuffer(int)" /> first.
+ 	/// </summary>
+ 	/// <param name="length">the truncated length
+ 	/// </param>
+ 	/// <exception cref="System.ArgumentException">if length exceeds the current buffer size</exception>
+ 	public virtual void SetTermLength(int length)
+ 	{
+ 		InitTermBuffer();
+ 		if (length > termBuffer.Length)
+ 			throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")");
+ 		termLength = length;
+ 	}
+ 	
+ 	public override int GetHashCode()
+ 	{
+ 		InitTermBuffer();
+ 		int code = termLength;
+ 		code = code * 31 + ArrayUtil.HashCode(termBuffer, 0, termLength);
+ 		return code;
+ 	}
+ 	
+ 	/// <summary>Resets the term to empty; the buffer itself is kept for reuse.</summary>
+ 	public override void Clear()
+ 	{
+ 		termLength = 0;
+ 	}
+ 	
+ 	/// <summary>Deep-clones this attribute so the copy does not share the term buffer.</summary>
+ 	public override System.Object Clone()
+ 	{
+ 		TermAttribute t = (TermAttribute) base.Clone();
+ 		// Do a deep clone
+ 		if (termBuffer != null)
+ 		{
+ 			t.termBuffer = new char[termBuffer.Length];
+ 			termBuffer.CopyTo(t.termBuffer, 0);
+ 		}
+ 		return t;
+ 	}
+ 	
+ 	public override bool Equals(System.Object other)
+ 	{
+ 		if (other == this)
+ 		{
+ 			return true;
+ 		}
+ 		
+ 		// Check the concrete type, not the interface: the previous test for
+ 		// ITermAttribute followed by a hard cast to TermAttribute threw
+ 		// InvalidCastException for other implementations of the interface.
+ 		// This also matches the sibling attribute classes (eg OffsetAttribute).
+ 		if (other is TermAttribute)
+ 		{
+ 			InitTermBuffer();
+ 			TermAttribute o = (TermAttribute) other;
+ 			o.InitTermBuffer();
+ 			
+ 			if (termLength != o.termLength)
+ 				return false;
+ 			for (int i = 0; i < termLength; i++)
+ 			{
+ 				if (termBuffer[i] != o.termBuffer[i])
+ 				{
+ 					return false;
+ 				}
+ 			}
+ 			return true;
+ 		}
+ 		
+ 		return false;
+ 	}
+ 	
+ 	public override System.String ToString()
+ 	{
+ 		InitTermBuffer();
+ 		return "term=" + new System.String(termBuffer, 0, termLength);
+ 	}
+ 	
+ 	/// <summary>Copies the current term text into <paramref name="target" />.</summary>
+ 	public override void CopyTo(Attribute target)
+ 	{
+ 		InitTermBuffer();
+ 		ITermAttribute t = (ITermAttribute) target;
+ 		t.SetTermBuffer(termBuffer, 0, termLength);
+ 	}
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenattributes/TypeAttribute.cs b/src/core/Analysis/Tokenattributes/TypeAttribute.cs
new file mode 100644
index 0000000..1da1c50
--- /dev/null
+++ b/src/core/Analysis/Tokenattributes/TypeAttribute.cs
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Attribute = Lucene.Net.Util.Attribute;
+
+namespace Lucene.Net.Analysis.Tokenattributes
+{
+
+ /// <summary> A Token's lexical type. The Default value is "word". </summary>
+ [Serializable]
+ public class TypeAttribute:Attribute, ITypeAttribute, System.ICloneable
+ {
+ private System.String type;
+ public const System.String DEFAULT_TYPE = "word";
+
+ public TypeAttribute():this(DEFAULT_TYPE)
+ {
+ }
+
+ public TypeAttribute(System.String type)
+ {
+ this.type = type;
+ }
+
+ /// <summary>Returns this Token's lexical type. Defaults to "word". </summary>
+ public virtual string Type
+ {
+ get { return type; }
+ set { this.type = value; }
+ }
+
+ public override void Clear()
+ {
+ type = DEFAULT_TYPE;
+ }
+
+ public override bool Equals(System.Object other)
+ {
+ if (other == this)
+ {
+ return true;
+ }
+
+ if (other is TypeAttribute)
+ {
+ return type.Equals(((TypeAttribute) other).type);
+ }
+
+ return false;
+ }
+
+ public override int GetHashCode()
+ {
+ return type.GetHashCode();
+ }
+
+ public override void CopyTo(Attribute target)
+ {
+ ITypeAttribute t = (ITypeAttribute) target;
+ t.Type = type;
+ }
+
+ override public System.Object Clone()
+ {
+ TypeAttribute impl = new TypeAttribute();
+ impl.type = type;
+ return impl;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/Tokenizer.cs b/src/core/Analysis/Tokenizer.cs
new file mode 100644
index 0000000..5ab741e
--- /dev/null
+++ b/src/core/Analysis/Tokenizer.cs
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> A Tokenizer is a TokenStream whose input is a Reader.
+ /// <p/>
+ /// This is an abstract class; subclasses must override <see cref="TokenStream.IncrementToken()" />
+ /// <p/>
+ /// NOTE: Subclasses overriding <see cref="TokenStream.IncrementToken()" /> must call
+ /// <see cref="AttributeSource.ClearAttributes()" /> before setting attributes.
+ /// </summary>
+
+ public abstract class Tokenizer:TokenStream
+ {
+ /// <summary>The text source for this Tokenizer. </summary>
+ protected internal System.IO.TextReader input;
+
+ private bool isDisposed;
+
+ /// <summary>Construct a tokenizer with null input. </summary>
+ protected internal Tokenizer()
+ {
+ }
+
+ /// <summary>Construct a token stream processing the given input. </summary>
+ protected internal Tokenizer(System.IO.TextReader input)
+ {
+ this.input = CharReader.Get(input);
+ }
+
+ /// <summary>Construct a tokenizer with null input using the given AttributeFactory. </summary>
+ protected internal Tokenizer(AttributeFactory factory):base(factory)
+ {
+ }
+
+ /// <summary>Construct a token stream processing the given input using the given AttributeFactory. </summary>
+ protected internal Tokenizer(AttributeFactory factory, System.IO.TextReader input):base(factory)
+ {
+ this.input = CharReader.Get(input);
+ }
+
+ /// <summary>Construct a token stream processing the given input using the given AttributeSource. </summary>
+ protected internal Tokenizer(AttributeSource source):base(source)
+ {
+ }
+
+ /// <summary>Construct a token stream processing the given input using the given AttributeSource. </summary>
+ protected internal Tokenizer(AttributeSource source, System.IO.TextReader input):base(source)
+ {
+ this.input = CharReader.Get(input);
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (isDisposed) return;
+
+ if (disposing)
+ {
+ if (input != null)
+ {
+ input.Close();
+ }
+ }
+
+ // LUCENE-2387: don't hold onto Reader after close, so
+ // GC can reclaim
+ input = null;
+ isDisposed = true;
+ }
+
+ /// <summary>Return the corrected offset. If <see cref="input" /> is a <see cref="CharStream" /> subclass
+ /// this method calls <see cref="CharStream.CorrectOffset" />, else returns <c>currentOff</c>.
+ /// </summary>
+ /// <param name="currentOff">offset as seen in the output
+ /// </param>
+ /// <returns> corrected offset based on the input
+ /// </returns>
+ /// <seealso cref="CharStream.CorrectOffset">
+ /// </seealso>
+ protected internal int CorrectOffset(int currentOff)
+ {
+ return (input is CharStream)?((CharStream) input).CorrectOffset(currentOff):currentOff;
+ }
+
+ /// <summary>Expert: Reset the tokenizer to a new reader. Typically, an
+ /// analyzer (in its reusableTokenStream method) will use
+ /// this to re-use a previously created tokenizer.
+ /// </summary>
+ public virtual void Reset(System.IO.TextReader input)
+ {
+ this.input = input;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/WhitespaceAnalyzer.cs b/src/core/Analysis/WhitespaceAnalyzer.cs
new file mode 100644
index 0000000..77dbaa3
--- /dev/null
+++ b/src/core/Analysis/WhitespaceAnalyzer.cs
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>An Analyzer that uses <see cref="WhitespaceTokenizer" />. </summary>
+
+ public sealed class WhitespaceAnalyzer:Analyzer
+ {
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new WhitespaceTokenizer(reader);
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ var tokenizer = (Tokenizer) PreviousTokenStream;
+ if (tokenizer == null)
+ {
+ tokenizer = new WhitespaceTokenizer(reader);
+ PreviousTokenStream = tokenizer;
+ }
+ else
+ tokenizer.Reset(reader);
+ return tokenizer;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/WhitespaceTokenizer.cs b/src/core/Analysis/WhitespaceTokenizer.cs
new file mode 100644
index 0000000..c96ad50
--- /dev/null
+++ b/src/core/Analysis/WhitespaceTokenizer.cs
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ /// Adjacent sequences of non-Whitespace characters form tokens.
+ /// </summary>
+
+ public class WhitespaceTokenizer:CharTokenizer
+ {
+ /// <summary>Construct a new WhitespaceTokenizer. </summary>
+ public WhitespaceTokenizer(System.IO.TextReader @in)
+ : base(@in)
+ {
+ }
+
+ /// <summary>Construct a new WhitespaceTokenizer using a given <see cref="AttributeSource" />. </summary>
+ public WhitespaceTokenizer(AttributeSource source, System.IO.TextReader @in)
+ : base(source, @in)
+ {
+ }
+
+ /// <summary>Construct a new WhitespaceTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. </summary>
+ public WhitespaceTokenizer(AttributeFactory factory, System.IO.TextReader @in)
+ : base(factory, @in)
+ {
+ }
+
+ /// <summary>Collects only characters which do not satisfy
+ /// <see cref="char.IsWhiteSpace(char)" />.
+ /// </summary>
+ protected internal override bool IsTokenChar(char c)
+ {
+ return !System.Char.IsWhiteSpace(c);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/core/Analysis/WordlistLoader.cs b/src/core/Analysis/WordlistLoader.cs
new file mode 100644
index 0000000..bfd1b07
--- /dev/null
+++ b/src/core/Analysis/WordlistLoader.cs
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> Loader for text files that represent a list of stopwords.</summary>
+ public class WordlistLoader
+ {
+
+ /// <summary> Loads a text file and adds every line as an entry to a HashSet (omitting
+ /// leading and trailing whitespace). Every line of the file should contain only
+ /// one word. The words need to be in lowercase if you make use of an
+ /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ /// </summary>
+ /// <param name="wordfile">File containing the wordlist</param>
+ /// <returns> A HashSet with the file's words</returns>
+ public static ISet<string> GetWordSet(System.IO.FileInfo wordfile)
+ {
+ using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
+ {
+ return GetWordSet(reader);
+ }
+ }
+
+ /// <summary> Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
+ /// leading and trailing whitespace). Every line of the file should contain only
+ /// one word. The words need to be in lowercase if you make use of an
+ /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ /// </summary>
+ /// <param name="wordfile">File containing the wordlist</param>
+ /// <param name="comment">The comment string to ignore</param>
+ /// <returns> A HashSet with the file's words</returns>
+ public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, System.String comment)
+ {
+ using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
+ {
+ return GetWordSet(reader, comment);
+ }
+ }
+
+
+ /// <summary> Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
+ /// leading and trailing whitespace). Every line of the Reader should contain only
+ /// one word. The words need to be in lowercase if you make use of an
+ /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ /// </summary>
+ /// <param name="reader">Reader containing the wordlist</param>
+ /// <returns>A HashSet with the reader's words</returns>
+ public static ISet<string> GetWordSet(System.IO.TextReader reader)
+ {
+ var result = Support.Compatibility.SetFactory.CreateHashSet<string>();
+
+ System.String word;
+ while ((word = reader.ReadLine()) != null)
+ {
+ result.Add(word.Trim());
+ }
+
+ return result;
+ }
+
+ /// <summary> Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
+ /// leading and trailing whitespace). Every line of the Reader should contain only
+ /// one word. The words need to be in lowercase if you make use of an
+ /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ ///
+ /// </summary>
+ /// <param name="reader">Reader containing the wordlist
+ /// </param>
+ /// <param name="comment">The string representing a comment.
+ /// </param>
+ /// <returns> A HashSet with the reader's words
+ /// </returns>
+ public static ISet<string> GetWordSet(System.IO.TextReader reader, System.String comment)
+ {
+ var result = Support.Compatibility.SetFactory.CreateHashSet<string>();
+
+ System.String word = null;
+ while ((word = reader.ReadLine()) != null)
+ {
+ if (word.StartsWith(comment) == false)
+ {
+ result.Add(word.Trim());
+ }
+ }
+
+ return result;
+ }
+
+
+
+ /// <summary> Reads a stem dictionary. Each line contains:
+ /// <c>word<b>\t</b>stem</c>
+ /// (i.e. two tab seperated words)
+ ///
+ /// </summary>
+ /// <returns> stem dictionary that overrules the stemming algorithm
+ /// </returns>
+ /// <throws> IOException </throws>
+ public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile)
+ {
+ if (wordstemfile == null)
+ throw new System.NullReferenceException("wordstemfile may not be null");
+ var result = new Dictionary<string, string>();
+ System.IO.StreamReader br = null;
+ System.IO.StreamReader fr = null;
+ try
+ {
+ fr = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default);
+ br = new System.IO.StreamReader(fr.BaseStream, fr.CurrentEncoding);
+ System.String line;
+ char[] tab = {'\t'};
+ while ((line = br.ReadLine()) != null)
+ {
+ System.String[] wordstem = line.Split(tab, 2);
+ result[wordstem[0]] = wordstem[1];
+ }
+ }
+ finally
+ {
+ if (fr != null)
+ fr.Close();
+ if (br != null)
+ br.Close();
+ }
+ return result;
+ }
+ }
+} \ No newline at end of file