src/core/Analysis/CharTokenizer.cs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

/* 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using Lucene.Net.Analysis.Tokenattributes;
using AttributeSource = Lucene.Net.Util.AttributeSource;

namespace Lucene.Net.Analysis
{
	
	/// <summary>An abstract base class for simple, character-oriented tokenizers.</summary>
	public abstract class CharTokenizer:Tokenizer
	{
	    protected CharTokenizer(System.IO.TextReader input):base(input)
		{
			offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
		}

	    protected CharTokenizer(AttributeSource source, System.IO.TextReader input):base(source, input)
		{
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
		}

	    protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input):base(factory, input)
		{
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
		}
		
		private int offset = 0, bufferIndex = 0, dataLen = 0;
		private const int MAX_WORD_LEN = 255;
		private const int IO_BUFFER_SIZE = 4096;
		private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
		
		private readonly ITermAttribute termAtt;
		private readonly IOffsetAttribute offsetAtt;
		
		/// <summary>Returns true iff a character should be included in a token.  This
		/// tokenizer generates as tokens adjacent sequences of characters which
		/// satisfy this predicate.  Characters for which this is false are used to
		/// define token boundaries and are not included in tokens. 
		/// </summary>
		protected internal abstract bool IsTokenChar(char c);
		
		/// <summary>Called on each token character to normalize it before it is added to the
		/// token.  The default implementation does nothing.  Subclasses may use this
		/// to, e.g., lowercase tokens. 
		/// </summary>
		protected internal virtual char Normalize(char c)
		{
			return c;
		}
		
		public override bool IncrementToken()
		{
			ClearAttributes();
			int length = 0;
			int start = bufferIndex;
			char[] buffer = termAtt.TermBuffer();
			while (true)
			{
				
				if (bufferIndex >= dataLen)
				{
					offset += dataLen;
					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
					if (dataLen <= 0)
					{
						dataLen = 0; // so next offset += dataLen won't decrement offset
						if (length > 0)
							break;
						return false;
					}
					bufferIndex = 0;
				}
				
				char c = ioBuffer[bufferIndex++];
				
				if (IsTokenChar(c))
				{
					// if it's a token char
					
					if (length == 0)
					// start of token
						start = offset + bufferIndex - 1;
					else if (length == buffer.Length)
						buffer = termAtt.ResizeTermBuffer(1 + length);
					
					buffer[length++] = Normalize(c); // buffer it, normalized
					
					if (length == MAX_WORD_LEN)
					// buffer overflow!
						break;
				}
				else if (length > 0)
				// at non-Letter w/ chars
					break; // return 'em
			}
			
			termAtt.SetTermLength(length);
			offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
			return true;
		}
		
		public override void  End()
		{
			// set final offset
			int finalOffset = CorrectOffset(offset);
			offsetAtt.SetOffset(finalOffset, finalOffset);
		}
		
		public override void  Reset(System.IO.TextReader input)
		{
			base.Reset(input);
			bufferIndex = 0;
			offset = 0;
			dataLen = 0;
		}
	}
}