diff options
Diffstat (limited to 'src/core/Analysis/Standard/StandardTokenizer.cs')
-rw-r--r-- | src/core/Analysis/Standard/StandardTokenizer.cs | 232 |
1 files changed, 232 insertions, 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using CharReader = Lucene.Net.Analysis.CharReader;
using Token = Lucene.Net.Analysis.Token;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary>A grammar-based tokenizer constructed with JFlex
    ///
    /// <p/> This should be a good tokenizer for most European-language documents:
    ///
    /// <list type="bullet">
    /// <item>Splits words at punctuation characters, removing punctuation. However, a
    /// dot that's not followed by whitespace is considered part of a token.</item>
    /// <item>Splits words at hyphens, unless there's a number in the token, in which case
    /// the whole token is interpreted as a product number and is not split.</item>
    /// <item>Recognizes email addresses and internet hostnames as one token.</item>
    /// </list>
    ///
    /// <p/>Many applications have specific tokenizer needs.  If this tokenizer does
    /// not suit your application, please consider copying this source code
    /// directory to your project and maintaining your own grammar-based tokenizer.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StandardAnalyzer:
    /// <list type="bullet">
    /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
    /// </list>
    /// </summary>
    public sealed class StandardTokenizer : Tokenizer
    {
        /// <summary>A private instance of the JFlex-constructed scanner </summary>
        private StandardTokenizerImpl scanner;

        // Token type ids; these index into TOKEN_TYPES and mirror the
        // constants produced by the generated StandardTokenizerImpl scanner.
        public const int ALPHANUM = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM = 2;
        public const int COMPANY = 3;
        public const int EMAIL = 4;
        public const int HOST = 5;
        public const int NUM = 6;
        public const int CJ = 7;

        /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
        /// as ACRONYMs.
        /// </deprecated>
        [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
        public const int ACRONYM_DEP = 8;

        /// <summary>String token types that correspond to token type int constants </summary>
        public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};

        // When true, tokens the scanner flags as ACRONYM_DEP are re-typed as
        // HOST and their trailing '.' is dropped (LUCENE-1068 fix).
        private bool replaceInvalidAcronym;

        private int maxTokenLength;

        /// <summary>Set the max allowed token length.  Any token longer
        /// than this is skipped.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { this.maxTokenLength = value; }
        }

        /// <summary> Creates a new instance of the
        /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" />. Attaches
        /// the <c>input</c> to the newly created JFlex scanner.
        ///
        /// </summary>
        /// <param name="matchVersion">Lucene version to match; controls acronym handling (see
        /// <a href="version">above</a>)</param>
        /// <param name="input">The input reader
        ///
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary> Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.</summary>
        public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary> Creates a new StandardTokenizer with a given
        /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />
        /// </summary>
        public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary>Shared constructor body: stores the reader, selects version-dependent
        /// acronym handling, and registers the attributes this stream produces.</summary>
        private void Init(System.IO.TextReader input, Version matchVersion)
        {
            maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
            // As of 2.4 tokens mis-identified as acronyms are corrected to HOST
            // (LUCENE-1068); earlier versions keep the old buggy behavior.
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
            this.input = input;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        // this tokenizer generates four attributes:
        // term, offset, positionIncrement and type
        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private IPositionIncrementAttribute posIncrAtt;
        private ITypeAttribute typeAtt;

        ///<summary>
        /// Advances to the next token produced by the scanner, skipping any token
        /// longer than <see cref="MaxTokenLength" /> (each skipped token still bumps
        /// the position increment of the following one).
        /// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
        ///</summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    posIncrAtt.PositionIncrement = posIncr;
                    scanner.GetText(termAtt);
                    int start = scanner.Yychar();
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                            termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                        }
                        else
                        {
                            typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                        }
                    }
                    else
                    {
                        typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
                    }
                    return true;
                }
                // When we skip a too-long term, we still increment the
                // position increment
                else
                    posIncr++;
            }
        }

        /// <summary>Records the final offset (end of the last character read) so
        /// consumers can account for trailing skipped text.</summary>
        public override void End()
        {
            // set final offset
            int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>Resets this stream (and the underlying scanner) to read from a new reader.</summary>
        public override void Reset(System.IO.TextReader reader)
        {
            base.Reset(reader);
            scanner.Reset(reader);
        }

        /// <summary>
        /// Remove in 3.X and make true the only valid value
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
        /// </param>
        [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
        public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }
    }
}