Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/Lucene.Net.Light.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/Analysis/CharTokenizer.cs')
-rw-r--r-- src/core/Analysis/CharTokenizer.cs 135
1 files changed, 135 insertions, 0 deletions
diff --git a/src/core/Analysis/CharTokenizer.cs b/src/core/Analysis/CharTokenizer.cs
new file mode 100644
index 0000000..22423ec
--- /dev/null
+++ b/src/core/Analysis/CharTokenizer.cs
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary>
+ /// Abstract base class for simple, character-oriented tokenizers. A subclass
+ /// decides, one character at a time, what belongs to a token
+ /// (<see cref="IsTokenChar"/>) and how each accepted character is stored
+ /// (<see cref="Normalize"/>). Input is consumed in chunks of up to 4096
+ /// characters, and a token is emitted as soon as it reaches 255 characters.
+ /// </summary>
+ public abstract class CharTokenizer : Tokenizer
+ {
+     // Longest token emitted in one piece; a longer run of token characters is
+     // cut at this length and the remainder starts the next token.
+     private const int MaxWordLength = 255;
+
+     // Number of characters requested from the reader per refill.
+     private const int IoBufferSize = 4096;
+
+     // Chunk most recently read from the input.
+     private readonly char[] ioBuffer = new char[IoBufferSize];
+
+     // offset:      number of input characters consumed before the current chunk.
+     // bufferIndex: position of the next unread character inside ioBuffer.
+     // dataLen:     number of valid characters currently held in ioBuffer.
+     private int offset = 0;
+     private int bufferIndex = 0;
+     private int dataLen = 0;
+
+     private readonly ITermAttribute termAtt;
+     private readonly IOffsetAttribute offsetAtt;
+
+     /// <summary>Creates a tokenizer that reads from <paramref name="input"/>.</summary>
+     protected CharTokenizer(System.IO.TextReader input)
+         : base(input)
+     {
+         offsetAtt = AddAttribute<IOffsetAttribute>();
+         termAtt = AddAttribute<ITermAttribute>();
+     }
+
+     /// <summary>
+     /// Creates a tokenizer that reads from <paramref name="input"/>, passing
+     /// <paramref name="source"/> on to the base constructor.
+     /// </summary>
+     protected CharTokenizer(AttributeSource source, System.IO.TextReader input)
+         : base(source, input)
+     {
+         offsetAtt = AddAttribute<IOffsetAttribute>();
+         termAtt = AddAttribute<ITermAttribute>();
+     }
+
+     /// <summary>
+     /// Creates a tokenizer that reads from <paramref name="input"/>, passing
+     /// <paramref name="factory"/> on to the base constructor.
+     /// </summary>
+     protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input)
+         : base(factory, input)
+     {
+         offsetAtt = AddAttribute<IOffsetAttribute>();
+         termAtt = AddAttribute<ITermAttribute>();
+     }
+
+     /// <summary>
+     /// Decides whether <paramref name="c"/> belongs inside a token. Tokens are
+     /// the maximal runs of adjacent characters for which this returns true;
+     /// every other character acts as a boundary and is dropped.
+     /// </summary>
+     protected internal abstract bool IsTokenChar(char c);
+
+     /// <summary>
+     /// Hook applied to every token character just before it is stored in the
+     /// term buffer. This base version returns the character unchanged;
+     /// subclasses may override it to, e.g., lowercase tokens.
+     /// </summary>
+     protected internal virtual char Normalize(char c)
+     {
+         return c;
+     }
+
+     /// <summary>
+     /// Advances to the next token, filling the term and offset attributes.
+     /// </summary>
+     /// <returns>
+     /// true when a token was produced; false when the input ran out before
+     /// any token character was found.
+     /// </returns>
+     public override bool IncrementToken()
+     {
+         ClearAttributes();
+
+         int termLength = 0;
+         int tokenStart = bufferIndex;
+         char[] termBuffer = termAtt.TermBuffer();
+
+         for (;;)
+         {
+             if (bufferIndex >= dataLen)
+             {
+                 // Current chunk is used up: account for it and fetch the next one.
+                 offset += dataLen;
+                 dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+                 if (dataLen <= 0)
+                 {
+                     // End of input. Zero dataLen so that a later
+                     // "offset += dataLen" cannot move the offset backwards.
+                     dataLen = 0;
+                     if (termLength == 0)
+                     {
+                         return false;
+                     }
+                     break; // flush the token collected so far
+                 }
+                 bufferIndex = 0;
+             }
+
+             char current = ioBuffer[bufferIndex];
+             bufferIndex++;
+
+             if (!IsTokenChar(current))
+             {
+                 if (termLength > 0)
+                 {
+                     // A separator right after token characters ends the token.
+                     break;
+                 }
+                 // Separator before any token character: keep scanning.
+                 continue;
+             }
+
+             if (termLength == 0)
+             {
+                 // First character of the token: remember its absolute position.
+                 tokenStart = offset + bufferIndex - 1;
+             }
+             else if (termLength == termBuffer.Length)
+             {
+                 // Term buffer is full: ask for room for one more character.
+                 termBuffer = termAtt.ResizeTermBuffer(termLength + 1);
+             }
+
+             termBuffer[termLength] = Normalize(current);
+             termLength++;
+
+             if (termLength == MaxWordLength)
+             {
+                 // Token reached the maximum length: emit what we have.
+                 break;
+             }
+         }
+
+         termAtt.SetTermLength(termLength);
+         offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenStart + termLength));
+         return true;
+     }
+
+     /// <summary>
+     /// Sets both the start and the end offset to the corrected value of the
+     /// running input offset.
+     /// </summary>
+     public override void End()
+     {
+         int endOffset = CorrectOffset(offset);
+         offsetAtt.SetOffset(endOffset, endOffset);
+     }
+
+     /// <summary>
+     /// Switches to a new <paramref name="input"/> and zeroes the buffer
+     /// position, buffered length and running offset.
+     /// </summary>
+     public override void Reset(System.IO.TextReader input)
+     {
+         base.Reset(input);
+         offset = 0;
+         dataLen = 0;
+         bufferIndex = 0;
+     }
+ }
+}
\ No newline at end of file