1 files changed, 135 insertions, 0 deletions
diff --git a/src/core/Analysis/CharTokenizer.cs b/src/core/Analysis/CharTokenizer.cs
new file mode 100644
index 0000000..22423ec
--- /dev/null
+++ b/src/core/Analysis/CharTokenizer.cs
@@ -0,0 +1,135 @@
+/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis.Tokenattributes;
+using AttributeSource = Lucene.Net.Util.AttributeSource;
+
+namespace Lucene.Net.Analysis
+{
+	
+	/// <summary>An abstract base class for simple, character-oriented tokenizers: subclasses decide which characters belong to tokens via IsTokenChar and may rewrite them via Normalize.</summary>
+	public abstract class CharTokenizer:Tokenizer
+	{
+	    protected CharTokenizer(System.IO.TextReader input):base(input)
+		{
+			offsetAtt = AddAttribute<IOffsetAttribute>(); // attribute handles are obtained once here and reused by IncrementToken and End
+            termAtt = AddAttribute<ITermAttribute>();
+		}
+	    // Same attribute setup; forwards an existing AttributeSource to the base constructor.
+	    protected CharTokenizer(AttributeSource source, System.IO.TextReader input):base(source, input)
+		{
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+            termAtt = AddAttribute<ITermAttribute>();
+		}
+	    // Same attribute setup; forwards an AttributeFactory to the base constructor.
+	    protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input):base(factory, input)
+		{
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+            termAtt = AddAttribute<ITermAttribute>();
+		}
+		// offset: chars held by all previously consumed buffers; bufferIndex: next unread slot in ioBuffer; dataLen: chars placed in ioBuffer by the last Read.
+		private int offset = 0, bufferIndex = 0, dataLen = 0;
+		private const int MAX_WORD_LEN = 255; // IncrementToken emits a token as soon as it reaches this many chars
+		private const int IO_BUFFER_SIZE = 4096; // length of ioBuffer, i.e. the most chars requested per Read call
+		private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+		// Attribute handles; assigned in every constructor.
+		private readonly ITermAttribute termAtt;
+		private readonly IOffsetAttribute offsetAtt;
+		
+		/// <summary>Returns true if and only if a character should be included in a token.
+		/// This tokenizer generates as tokens adjacent sequences of characters which
+		/// satisfy this predicate.  Characters for which this is false are used to
+		/// define token boundaries and are not included in tokens.
+		/// </summary>
+		protected internal abstract bool IsTokenChar(char c);
+		
+		/// <summary>Called on each token character to normalize it before it is added to the
+		/// token.  The default implementation returns the character unchanged.  Subclasses
+		/// may override this to, e.g., lowercase tokens.
+		/// </summary>
+		protected internal virtual char Normalize(char c)
+		{
+			return c;
+		}
+		/// <summary>Buffers the next run of token characters into the term attribute and sets its offsets; returns false once the input is exhausted with no token pending.</summary>
+		public override bool IncrementToken()
+		{
+			ClearAttributes();
+			int length = 0; // number of chars buffered for the current token so far
+			int start = bufferIndex; // placeholder; replaced by the absolute start offset at the token's first char
+			char[] buffer = termAtt.TermBuffer();
+			while (true)
+			{
+				// Refill ioBuffer once every char in it has been consumed.
+				if (bufferIndex >= dataLen)
+				{
+					offset += dataLen; // account for the buffer that was just consumed
+					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+					if (dataLen <= 0)
+					{
+						dataLen = 0; // so next offset += dataLen won't decrement offset
+						if (length > 0)
+							break; // end of input terminates the pending token
+						return false; // end of input and nothing buffered
+					}
+					bufferIndex = 0;
+				}
+				
+				char c = ioBuffer[bufferIndex++];
+				
+				if (IsTokenChar(c))
+				{
+					// if it's a token char
+					
+					if (length == 0)
+					// start of token: record its absolute position (bufferIndex was already advanced, hence the -1)
+						start = offset + bufferIndex - 1;
+					else if (length == buffer.Length) // term buffer is full: ask for room for one more char
+						buffer = termAtt.ResizeTermBuffer(1 + length);
+					
+					buffer[length++] = Normalize(c); // buffer it, normalized
+					
+					if (length == MAX_WORD_LEN)
+					// token reached MAX_WORD_LEN: emit it now; any token chars that follow start a new token on the next call
+						break;
+				}
+				else if (length > 0)
+				// at a non-token char with chars already buffered
+					break; // emit the buffered token
+			}
+			// Publish the token: its length and its corrected start/end offsets.
+			termAtt.SetTermLength(length);
+			offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+			return true;
+		}
+		/// <summary>Sets both the start and end offset of the offset attribute to CorrectOffset(offset).</summary>
+		public override void  End()
+		{
+			// set final offset
+			int finalOffset = CorrectOffset(offset);
+			offsetAtt.SetOffset(finalOffset, finalOffset);
+		}
+		/// <summary>Resets the base tokenizer with the new reader and clears the buffer position state so offsets restart at 0.</summary>
+		public override void  Reset(System.IO.TextReader input)
+		{
+			base.Reset(input);
+			bufferIndex = 0;
+			offset = 0;
+			dataLen = 0; // with bufferIndex == dataLen == 0 the next IncrementToken call refills ioBuffer before reading
+		}
+	}
+}
+\ No newline at end of file