Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/Lucene.Net.Light.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/Analysis/Standard/StandardTokenizer.cs')
-rw-r--r-- src/core/Analysis/Standard/StandardTokenizer.cs 232
1 files changed, 232 insertions, 0 deletions
diff --git a/src/core/Analysis/Standard/StandardTokenizer.cs b/src/core/Analysis/Standard/StandardTokenizer.cs
new file mode 100644
index 0000000..dca409d
--- /dev/null
+++ b/src/core/Analysis/Standard/StandardTokenizer.cs
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using CharReader = Lucene.Net.Analysis.CharReader;
+using Token = Lucene.Net.Analysis.Token;
+using Tokenizer = Lucene.Net.Analysis.Tokenizer;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+
+ /// <summary>A grammar-based tokenizer constructed with JFlex
+ ///
+ /// <p/> This should be a good tokenizer for most European-language documents:
+ ///
+ /// <list type="bullet">
+ /// <item>Splits words at punctuation characters, removing punctuation. However, a
+ /// dot that's not followed by whitespace is considered part of a token.</item>
+ /// <item>Splits words at hyphens, unless there's a number in the token, in which case
+ /// the whole token is interpreted as a product number and is not split.</item>
+ /// <item>Recognizes email addresses and internet hostnames as one token.</item>
+ /// </list>
+ ///
+ /// <p/>Many applications have specific tokenizer needs. If this tokenizer does
+ /// not suit your application, please consider copying this source code
+ /// directory to your project and maintaining your own grammar-based tokenizer.
+ ///
+ /// <a name="version"/>
+ /// <p/>
+ /// You must specify the required <see cref="Version" /> compatibility when creating
+ /// StandardTokenizer:
+ /// <list type="bullet">
+ /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
+ /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
+ /// </list>
+ /// </summary>
+
+ public sealed class StandardTokenizer:Tokenizer
+ {
+ private void InitBlock()
+ {
+ maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; // default limit; can be changed through MaxTokenLength
+ }
+ /// <summary>The JFlex-constructed scanner that IncrementToken pulls raw tokens from </summary>
+ private StandardTokenizerImpl scanner;
+
+ public const int ALPHANUM = 0; // token type ids; each value is the index of its label in TOKEN_TYPES
+ public const int APOSTROPHE = 1;
+ public const int ACRONYM = 2;
+ public const int COMPANY = 3;
+ public const int EMAIL = 4;
+ public const int HOST = 5;
+ public const int NUM = 6;
+ public const int CJ = 7;
+
+ /// <deprecated> This solves a bug where HOSTs that end with '.' are identified
+ /// as ACRONYMs. See the ACRONYM_DEP handling in IncrementToken.
+ /// </deprecated>
+ [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
+ public const int ACRONYM_DEP = 8;
+
+ /// <summary>String labels for the token types, in the same order as the token type int constants </summary>
+ public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
+
+ private bool replaceInvalidAcronym; // when true, ACRONYM_DEP tokens are emitted as HOST with the trailing '.' dropped
+
+ private int maxTokenLength; // tokens longer than this are skipped by IncrementToken
+
+ /// <summary>Gets or sets the max allowed token length. Any token longer than this
+ /// is skipped; each skipped token still adds one to the next emitted token's position increment.
+ /// </summary>
+ public int MaxTokenLength
+ {
+ get { return maxTokenLength; }
+ set { this.maxTokenLength = value; }
+ }
+
+ /// <summary> Creates a new instance of the
+ /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" />. Attaches
+ /// the <c>input</c> to the newly created JFlex scanner.
+ ///
+ /// </summary>
+ /// <param name="matchVersion">Compatibility version. On or after <c>LUCENE_24</c>, tokens
+ /// mis-identified as acronyms are emitted as HOST instead; see
+ /// http://issues.apache.org/jira/browse/LUCENE-1068
+ /// </param>
+ /// <param name="input">The input reader</param>
+ public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
+ {
+ InitBlock();
+ this.scanner = new StandardTokenizerImpl(input);
+ Init(input, matchVersion);
+ }
+
+ /// <summary> Creates a new StandardTokenizer with a given <see cref="AttributeSource" />, which is passed to the base constructor.</summary>
+ public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
+ {
+ InitBlock();
+ this.scanner = new StandardTokenizerImpl(input);
+ Init(input, matchVersion);
+ }
+
+ /// <summary> Creates a new StandardTokenizer with a given
+ /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />, which is passed to the base constructor.
+ /// </summary>
+ public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
+ {
+ InitBlock();
+ this.scanner = new StandardTokenizerImpl(input);
+ Init(input, matchVersion);
+ }
+
+ private void Init(System.IO.TextReader input, Version matchVersion)
+ {
+ if (matchVersion.OnOrAfter(Version.LUCENE_24)) // the invalid-acronym correction (LUCENE-1068) is on from 2.4
+ {
+ replaceInvalidAcronym = true;
+ }
+ else
+ {
+ replaceInvalidAcronym = false;
+ }
+ this.input = input;
+ termAtt = AddAttribute<ITermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+ }
+
+ // this tokenizer generates four attributes:
+ // term, offset, positionIncrement and type
+ private ITermAttribute termAtt;
+ private IOffsetAttribute offsetAtt;
+ private IPositionIncrementAttribute posIncrAtt;
+ private ITypeAttribute typeAtt;
+
+ ///<summary>
+ /// Advances to the next token whose length does not exceed <see cref="MaxTokenLength" /> and fills the term, offset, position increment and type attributes.
+ /// Returns false once the scanner reports end of input. See <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
+ ///</summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ int posIncr = 1; // grows by one for every too-long token skipped below
+
+ while (true)
+ {
+ int tokenType = scanner.GetNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF)
+ {
+ return false;
+ }
+
+ if (scanner.Yylength() <= maxTokenLength)
+ {
+ posIncrAtt.PositionIncrement = posIncr;
+ scanner.GetText(termAtt);
+ int start = scanner.Yychar();
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength())); // offsets are taken before any trailing '.' is trimmed below
+ // This 'if' should be removed in the next release. For now, it converts
+ // invalid acronyms to HOST. When removed, only the 'else' part should
+ // remain.
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
+ {
+ if (replaceInvalidAcronym)
+ {
+ typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
+ termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
+ }
+ else
+ {
+ typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+ }
+ }
+ else
+ {
+ typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
+ }
+ return true;
+ }
+ // When we skip a too-long term, we still increment the
+ // position increment
+ else
+ posIncr++;
+ }
+ }
+
+ public override void End()
+ {
+ // set final offset: start and end are both the corrected value of scanner.Yychar() + scanner.Yylength()
+ int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset(System.IO.TextReader reader)
+ {
+ base.Reset(reader);
+ scanner.Reset(reader); // hand the new reader to the scanner as well
+ }
+
+ /// <summary>
+ /// Overrides the acronym handling that the constructor selected from the match version.
+ /// Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068
+ /// </summary>
+ /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
+ /// </param>
+ [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
+ public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+ }
+} \ No newline at end of file