diff options
Diffstat (limited to 'src/core/Analysis/CharTokenizer.cs')
-rw-r--r-- | src/core/Analysis/CharTokenizer.cs | 135 |
1 files changed, 135 insertions, 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using Lucene.Net.Analysis.Tokenattributes;
using AttributeSource = Lucene.Net.Util.AttributeSource;

namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Abstract base class for simple, character-oriented tokenizers. A token is a
    /// maximal run of adjacent characters accepted by <see cref="IsTokenChar"/>;
    /// each accepted character is passed through <see cref="Normalize"/> before it
    /// is stored in the term buffer.
    /// </summary>
    public abstract class CharTokenizer : Tokenizer
    {
        /// <summary>
        /// Upper bound on the number of characters collected into one token. A run
        /// that reaches this length is emitted as a token; the characters that
        /// follow are picked up by the next <see cref="IncrementToken"/> call.
        /// </summary>
        private const int MaxWordLen = 255;

        /// <summary>Number of characters requested from the reader per refill.</summary>
        private const int IoBufferSize = 4096;

        /// <summary>Sum of the lengths of all read chunks that preceded the current one.</summary>
        private int _offset = 0;

        /// <summary>Index of the next unread character inside <see cref="_ioBuffer"/>.</summary>
        private int _bufferIndex = 0;

        /// <summary>Number of valid characters currently held in <see cref="_ioBuffer"/>.</summary>
        private int _dataLen = 0;

        /// <summary>Scratch buffer that the reader is drained into.</summary>
        private readonly char[] _ioBuffer = new char[IoBufferSize];

        private readonly ITermAttribute _termAtt;
        private readonly IOffsetAttribute _offsetAtt;

        /// <summary>Creates a tokenizer that reads from <paramref name="input"/>.</summary>
        protected CharTokenizer(System.IO.TextReader input)
            : base(input)
        {
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Creates a tokenizer that reads from <paramref name="input"/> and is
        /// constructed on top of the given attribute <paramref name="source"/>.
        /// </summary>
        protected CharTokenizer(AttributeSource source, System.IO.TextReader input)
            : base(source, input)
        {
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Creates a tokenizer that reads from <paramref name="input"/> and is
        /// constructed with the given attribute <paramref name="factory"/>.
        /// </summary>
        protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input)
            : base(factory, input)
        {
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Decides whether a character belongs inside a token. Runs of adjacent
        /// characters for which this returns true become tokens; characters for
        /// which it returns false act as token boundaries and are never part of
        /// a token.
        /// </summary>
        protected internal abstract bool IsTokenChar(char c);

        /// <summary>
        /// Hook invoked for every token character just before it is written to the
        /// term buffer. This default returns the character unchanged; subclasses
        /// may override it to, e.g., lowercase tokens.
        /// </summary>
        protected internal virtual char Normalize(char c)
        {
            return c;
        }

        /// <summary>
        /// Advances to the next token, filling the term and offset attributes.
        /// </summary>
        /// <returns>
        /// true when a token was produced; false when the reader is exhausted and
        /// no token characters are pending.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            int tokenLength = 0;
            int tokenStart = _bufferIndex; // placeholder; reassigned on the first token character
            char[] termBuffer = _termAtt.TermBuffer();

            for (;;)
            {
                if (_bufferIndex >= _dataLen)
                {
                    // Current chunk is used up: account for it, then pull in the next one.
                    _offset += _dataLen;
                    _dataLen = input.Read(_ioBuffer, 0, _ioBuffer.Length);

                    if (_dataLen <= 0)
                    {
                        // Clamp to zero so a later "_offset += _dataLen" cannot decrement the offset.
                        _dataLen = 0;

                        if (tokenLength == 0)
                        {
                            return false;
                        }

                        // End of input reached while a token is pending: emit it.
                        break;
                    }

                    _bufferIndex = 0;
                }

                char current = _ioBuffer[_bufferIndex];
                _bufferIndex++;

                if (!IsTokenChar(current))
                {
                    if (tokenLength > 0)
                    {
                        // A boundary character terminates the token collected so far.
                        break;
                    }

                    // Still skipping boundary characters in front of the next token.
                    continue;
                }

                if (tokenLength == 0)
                {
                    // First character of the token: remember its absolute position.
                    tokenStart = _offset + _bufferIndex - 1;
                }
                else if (tokenLength == termBuffer.Length)
                {
                    // Term buffer is full: grow it so that one more character fits.
                    termBuffer = _termAtt.ResizeTermBuffer(tokenLength + 1);
                }

                termBuffer[tokenLength] = Normalize(current);
                tokenLength++;

                if (tokenLength == MaxWordLen)
                {
                    // Length cap reached: emit what has been collected.
                    break;
                }
            }

            _termAtt.SetTermLength(tokenLength);
            _offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenStart + tokenLength));
            return true;
        }

        /// <summary>
        /// Publishes the final offset: both the start and the end offset are set
        /// to the corrected value of the running offset counter.
        /// </summary>
        public override void End()
        {
            int endOffset = CorrectOffset(_offset);
            _offsetAtt.SetOffset(endOffset, endOffset);
        }

        /// <summary>
        /// Switches this tokenizer to a new reader and clears all buffering state
        /// so that tokenization starts again from offset zero.
        /// </summary>
        public override void Reset(System.IO.TextReader input)
        {
            base.Reset(input);
            _dataLen = 0;
            _offset = 0;
            _bufferIndex = 0;
        }
    }
}
\ No newline at end of file |