1 files changed, 731 insertions, 0 deletions
diff --git a/src/core/Index/TermVectorsReader.cs b/src/core/Index/TermVectorsReader.cs
new file mode 100644
index 0000000..56cf764
--- /dev/null
+++ b/src/core/Index/TermVectorsReader.cs
@@ -0,0 +1,731 @@
+/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
+using Directory = Lucene.Net.Store.Directory;
+using IndexInput = Lucene.Net.Store.IndexInput;
+
+namespace Lucene.Net.Index
+{
+	class TermVectorsReader : System.ICloneable, IDisposable
+	{
+		
+		// Term vector file format versions.  Each of the tvx/tvd/tvf files starts
+		// with an int holding its version, which CheckValidFormat reads.
+		//
+		// NOTE: if you make a new format, it must be larger than
+		// the current format
+		// From this version on, field numbers in tvd are read as absolute values
+		// (older formats store deltas) and each tvf entry carries a flags byte.
+		internal const int FORMAT_VERSION = 2;
+		
+		// Changes to speed up bulk merging of term vectors:
+		// from this version on SeekTvx treats every tvx entry as 16 bytes
+		// (8 bytes for older formats).
+		internal const int FORMAT_VERSION2 = 3;
+		
+		// Changed strings to UTF8 with length-in-bytes not length-in-chars
+		internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
+		
+		// NOTE: always change this if you switch to a new format!
+		// CheckValidFormat rejects any file whose version is larger than this.
+		internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
+		
+		// The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
+		internal const int FORMAT_SIZE = 4;
+		
+		// Bit flags tested against the per-field flags byte read from tvf.
+		internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1);
+		internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2);
+		
+		// Used to translate between field names and field numbers.
+		private FieldInfos fieldInfos;
+		
+		// Streams over the index (tvx), documents (tvd) and fields (tvf) files.
+		// All three stay null when the segment has no term vector index file.
+		private IndexInput tvx;
+		private IndexInput tvd;
+		private IndexInput tvf;
+		// Number of documents exposed through Size().
+		private int size;
+		// Number of entries in the tvx file, derived from its length.
+		private int numTotalDocs;
+		
+		// The docID offset where our docs begin in the index
+		// file.  This will be 0 if we have our own private file.
+		private int docStoreOffset;
+		
+		// Format version read from the tvx header; 0 when the files do not exist.
+		private int format;
+		// Set once Dispose(bool) has completed; makes further calls no-ops.
+	    private bool isDisposed;
+
+	    /// <summary>Creates a reader for the given segment, using
+	    /// <c>BufferedIndexInput.BUFFER_SIZE</c> as the read buffer size.
+	    /// </summary>
+	    /// <param name="d">Directory holding the term vector files</param>
+	    /// <param name="segment">Segment name; the file extensions are appended to it</param>
+	    /// <param name="fieldInfos">Field name/number mapping for the segment</param>
+	    internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos):this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
+		{
+		}
+		
+		/// <summary>Creates a reader for the given segment with an explicit read
+		/// buffer size.  Delegates with docStoreOffset = -1 and size = 0, which the
+		/// full constructor turns into an offset of 0 and a size taken from the
+		/// length of the tvx file.
+		/// </summary>
+		/// <param name="d">Directory holding the term vector files</param>
+		/// <param name="segment">Segment name; the file extensions are appended to it</param>
+		/// <param name="fieldInfos">Field name/number mapping for the segment</param>
+		/// <param name="readBufferSize">Buffer size passed to <c>Directory.OpenInput</c></param>
+		internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize):this(d, segment, fieldInfos, readBufferSize, - 1, 0)
+		{
+		}
+		
+		/// <summary>Opens the tvx, tvd and tvf files of a segment and validates their
+		/// format headers.  When the tvx file does not exist no stream is opened
+		/// and the reader behaves as "no term vectors".
+		/// </summary>
+		/// <param name="d">Directory holding the term vector files</param>
+		/// <param name="segment">Segment name; the file extensions are appended to it</param>
+		/// <param name="fieldInfos">Field name/number mapping for the segment</param>
+		/// <param name="readBufferSize">Buffer size passed to <c>Directory.OpenInput</c></param>
+		/// <param name="docStoreOffset">First document of this segment inside the files,
+		/// or -1 when the files are private to the segment</param>
+		/// <param name="size">Number of documents of this segment; ignored when
+		/// <paramref name="docStoreOffset"/> is -1</param>
+		internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
+		{
+			bool opened = false;
+			
+			try
+			{
+				System.String indexFileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
+				
+				if (!d.FileExists(indexFileName))
+				{
+					// If all documents flushed in a segment had hit
+					// non-aborting exceptions, it's possible that
+					// FieldInfos.hasVectors returns true yet the term
+					// vector files don't exist.
+					format = 0;
+				}
+				else
+				{
+					tvx = d.OpenInput(indexFileName, readBufferSize);
+					format = CheckValidFormat(tvx);
+					tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
+					int tvdFormat = CheckValidFormat(tvd);
+					tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
+					int tvfFormat = CheckValidFormat(tvf);
+					
+					System.Diagnostics.Debug.Assert(format == tvdFormat);
+					System.Diagnostics.Debug.Assert(format == tvfFormat);
+					
+					// 16-byte entries (shift by 4) from FORMAT_VERSION2 on, 8-byte
+					// entries (shift by 3) before.  The 4-byte header is dropped by
+					// the shift because it is smaller than one entry.
+					int entryShift = (format >= FORMAT_VERSION2) ? 4 : 3;
+					System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % (1 << entryShift) == 0);
+					numTotalDocs = (int) (tvx.Length() >> entryShift);
+					
+					if (docStoreOffset == -1)
+					{
+						// Private files: every entry in tvx belongs to this segment.
+						this.docStoreOffset = 0;
+						this.size = numTotalDocs;
+						System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
+					}
+					else
+					{
+						this.docStoreOffset = docStoreOffset;
+						this.size = size;
+						// Verify the file is long enough to hold all of our
+						// docs
+						System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
+					}
+				}
+				
+				this.fieldInfos = fieldInfos;
+				opened = true;
+			}
+			finally
+			{
+				// With lock-less commits, it's entirely possible (and
+				// fine) to hit a FileNotFound exception above. In
+				// this case, we want to explicitly close any subset
+				// of things that were opened so that we don't have to
+				// wait for a GC to do so.
+				if (!opened)
+				{
+					Dispose();
+				}
+			}
+		}
+		
+		/// <summary>Returns the tvd stream held by this reader (the instance itself,
+		/// not a clone).  It is null when the constructor found no tvx file.
+		/// Used for bulk copy when merging.
+		/// </summary>
+		internal virtual IndexInput GetTvdStream()
+		{
+			return tvd;
+		}
+		
+		/// <summary>Returns the tvf stream held by this reader (the instance itself,
+		/// not a clone).  It is null when the constructor found no tvx file.
+		/// Used for bulk copy when merging.
+		/// </summary>
+		internal virtual IndexInput GetTvfStream()
+		{
+			return tvf;
+		}
+		
+		/// <summary>Positions the tvx stream on the entry of the given document,
+		/// taking the doc store offset and the file header into account.
+		/// </summary>
+		/// <param name="docNum">Document number relative to this segment</param>
+		private void  SeekTvx(int docNum)
+		{
+			// Entries are 8 bytes wide before FORMAT_VERSION2 and 16 bytes from then on.
+			long bytesPerEntry = (format < FORMAT_VERSION2) ? 8L : 16L;
+			tvx.Seek((docNum + docStoreOffset) * bytesPerEntry + FORMAT_SIZE);
+		}
+		
+		/// <summary>Tells whether the files are in a format of at least
+		/// FORMAT_UTF8_LENGTH_IN_BYTES.  Note that RawDocs itself only rejects
+		/// formats older than FORMAT_VERSION2, so this check is the stricter one.
+		/// </summary>
+		/// <returns> true if the format version is FORMAT_UTF8_LENGTH_IN_BYTES or newer
+		/// </returns>
+		internal virtual bool CanReadRawDocs()
+		{
+			return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
+		}
+		
+		/// <summary>Retrieve the length (in bytes) of the tvd and tvf
+		/// entries for the next numDocs starting with
+		/// startDocID.  This is used for bulk copying when
+		/// merging segments, if the field numbers are
+		/// congruent.  Once this returns, the tvf and tvd streams
+		/// are positioned at the data of startDocID.
+		/// When the reader has no term vector files, both arrays are
+		/// filled with zeros and no stream is touched.
+		/// </summary>
+		/// <param name="tvdLengths">Receives the tvd length of each document; needs at least numDocs slots</param>
+		/// <param name="tvfLengths">Receives the tvf length of each document; needs at least numDocs slots</param>
+		/// <param name="startDocID">First document, relative to this segment</param>
+		/// <param name="numDocs">Number of consecutive documents to measure</param>
+		/// <exception cref="System.InvalidOperationException">The files are in a format
+		/// older than FORMAT_VERSION2</exception>
+		internal void  RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
+		{
+			
+			if (tvx == null)
+			{
+				System.Array.Clear(tvdLengths, 0, tvdLengths.Length);
+				System.Array.Clear(tvfLengths, 0, tvfLengths.Length);
+				return ;
+			}
+			
+			// SegmentMerger calls canReadRawDocs() first and should
+			// not call us if that returns false.
+			// InvalidOperationException derives from SystemException, so callers
+			// that caught the previously thrown base type still catch this.
+			if (format < FORMAT_VERSION2)
+				throw new System.InvalidOperationException("cannot read raw docs with older term vector formats");
+			
+			SeekTvx(startDocID);
+			
+			long tvdPosition = tvx.ReadLong();
+			tvd.Seek(tvdPosition);
+			
+			long tvfPosition = tvx.ReadLong();
+			tvf.Seek(tvfPosition);
+			
+			long lastTvdPosition = tvdPosition;
+			long lastTvfPosition = tvfPosition;
+			
+			// The length of a document is the distance to the start of the next
+			// one; for the very last document in the file it is the distance to
+			// the end of the tvd/tvf files.
+			for (int count = 0; count < numDocs; count++)
+			{
+				int docID = docStoreOffset + startDocID + count + 1;
+				System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
+				if (docID < numTotalDocs)
+				{
+					tvdPosition = tvx.ReadLong();
+					tvfPosition = tvx.ReadLong();
+				}
+				else
+				{
+					tvdPosition = tvd.Length();
+					tvfPosition = tvf.Length();
+					System.Diagnostics.Debug.Assert(count == numDocs - 1);
+				}
+				tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
+				tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
+				lastTvdPosition = tvdPosition;
+				lastTvfPosition = tvfPosition;
+			}
+		}
+		
+		/// <summary>Reads the format version int from the current position of the
+		/// given stream and makes sure this reader understands it.
+		/// </summary>
+		/// <param name="in_Renamed">Stream positioned on its format header</param>
+		/// <returns> The format version that was read
+		/// </returns>
+		/// <exception cref="CorruptIndexException">The version is newer than FORMAT_CURRENT</exception>
+		private int CheckValidFormat(IndexInput in_Renamed)
+		{
+			int version = in_Renamed.ReadInt();
+			if (version <= FORMAT_CURRENT)
+			{
+				return version;
+			}
+			throw new CorruptIndexException("Incompatible format version: " + version + " expected " + FORMAT_CURRENT + " or less");
+		}
+		
+        /// <summary>Closes the tvx, tvd and tvf streams held by this reader.
+        /// Calling it again after it has run is a no-op.
+        /// </summary>
+        /// <exception cref="System.IO.IOException">At least one stream failed to
+        /// close; the first failure is available as the inner exception</exception>
+        public void Dispose()
+        {
+            Dispose(true);
+            System.GC.SuppressFinalize(this);
+        }
+
+        /// <summary>Releases the streams.  Every stream gets a close attempt even
+        /// when an earlier one fails; the first failure is rethrown at the end.
+        /// </summary>
+        /// <param name="disposing">true when called from <see cref="Dispose()"/>;
+        /// the streams are only closed in that case</param>
+        protected virtual void Dispose(bool disposing)
+        {
+            if (isDisposed) return;
+
+            // Mark the reader as disposed before closing anything, so that a
+            // failing Close() cannot cause the streams to be closed a second
+            // time by a later Dispose() call.
+            isDisposed = true;
+
+            if (!disposing) return;
+
+            System.IO.IOException first = null;
+            first = CloseKeepingFirstError(tvx, first);
+            first = CloseKeepingFirstError(tvd, first);
+            first = CloseKeepingFirstError(tvf, first);
+
+            if (first != null)
+            {
+                // Keep the original message and attach the original exception,
+                // so that its type details and stack trace stay reachable.
+                throw new System.IO.IOException(first.Message, first);
+            }
+        }
+
+        /// <summary>Closes one stream (if any) and returns the first IOException
+        /// seen so far: <paramref name="first"/> if it is already set, otherwise
+        /// the exception raised by this close, otherwise null.
+        /// </summary>
+        private static System.IO.IOException CloseKeepingFirstError(IndexInput stream, System.IO.IOException first)
+        {
+            if (stream == null) return first;
+
+            try
+            {
+                stream.Close();
+            }
+            catch (System.IO.IOException e)
+            {
+                if (first == null)
+                    first = e;
+            }
+            return first;
+        }
+		
+		/// <summary>Returns the document count established by the constructor: the
+		/// size it was given, or the number of tvx entries when it was called with
+		/// a docStoreOffset of -1.  Stays 0 when no term vector files were found.
+		/// </summary>
+		/// <returns> The number of documents in the reader
+		/// </returns>
+		internal virtual int Size()
+		{
+			return size;
+		}
+		
+		/// <summary>Feeds the term vector of one field of one document to the given
+		/// mapper.  Nothing happens when the reader has no term vector files or
+		/// when the document has no vector for that field.
+		/// </summary>
+		/// <param name="docNum">The document number to retrieve the vector for
+		/// </param>
+		/// <param name="field">The field within the document to retrieve
+		/// </param>
+		/// <param name="mapper">Receives the document number and the term vector data
+		/// </param>
+		public virtual void  Get(int docNum, System.String field, TermVectorMapper mapper)
+		{
+			if (tvx == null)
+			{
+				// No tvx file
+				return ;
+			}
+			
+			int wantedNumber = fieldInfos.FieldNumber(field);
+			
+			// SeekTvx accounts for the format header of the tvx file.  The other
+			// seeks need no such correction: they use file pointers that were
+			// written with the header already included.
+			SeekTvx(docNum);
+			tvd.Seek(tvx.ReadLong());
+			int fieldCount = tvd.ReadVInt();
+			
+			// There are only a few fields per document. We opt for a full scan
+			// rather than requiring that they be ordered. We need to read through
+			// all of the fields anyway to get to the tvf pointers.
+			int currentNumber = 0;
+			int matchIndex = -1;
+			for (int idx = 0; idx < fieldCount; idx++)
+			{
+				int value = tvd.ReadVInt();
+				// Absolute field numbers from FORMAT_VERSION on, deltas before.
+				currentNumber = (format >= FORMAT_VERSION) ? value : currentNumber + value;
+				
+				if (currentNumber == wantedNumber)
+				{
+					matchIndex = idx;
+				}
+			}
+			
+			if (matchIndex == -1)
+			{
+				// This field, although valid in the segment, was not found in this
+				// document
+				return ;
+			}
+			
+			// Compute position in the tvf file: a base pointer followed by one
+			// delta per preceding field.
+			long tvfPosition = (format >= FORMAT_VERSION2) ? tvx.ReadLong() : tvd.ReadVLong();
+			for (int idx = 1; idx <= matchIndex; idx++)
+			{
+				tvfPosition += tvd.ReadVLong();
+			}
+			
+			mapper.SetDocumentNumber(docNum);
+			ReadTermVector(field, tvfPosition, mapper);
+		}
+		
+		
+		
+		/// <summary> Retrieve the term vector for the given document and field,
+		/// materialized through a <see cref="ParallelArrayTermVectorMapper" />.
+		/// </summary>
+		/// <param name="docNum">The document number to retrieve the vector for
+		/// </param>
+		/// <param name="field">The field within the document to retrieve
+		/// </param>
+		/// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
+		/// </returns>
+		/// <exception cref="System.IO.IOException">if there is an error reading the term vector files</exception>
+		public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field)
+		{
+			// The mapper-based overload does nothing when no vector is available,
+			// in which case the untouched mapper materializes to null.
+			var collector = new ParallelArrayTermVectorMapper();
+			Get(docNum, field, collector);
+			ITermFreqVector vector = collector.MaterializeVector();
+			return vector;
+		}
+		
+		/// <summary>Reads the names of the vectorized fields of a document from tvd.
+		/// The tvd stream has to be positioned on the first field number already.
+		/// </summary>
+		/// <param name="fieldCount">Number of field entries to read</param>
+		/// <returns> The field names, in the order they are stored
+		/// </returns>
+		private System.String[] ReadFields(int fieldCount)
+		{
+			// Absolute field numbers from FORMAT_VERSION on, deltas before.
+			bool absoluteNumbers = format >= FORMAT_VERSION;
+			var names = new System.String[fieldCount];
+			int fieldNumber = 0;
+			
+			for (int idx = 0; idx < fieldCount; idx++)
+			{
+				int value = tvd.ReadVInt();
+				fieldNumber = absoluteNumbers ? value : fieldNumber + value;
+				names[idx] = fieldInfos.FieldName(fieldNumber);
+			}
+			
+			return names;
+		}
+		
+		/// <summary>Reads the tvf file pointers of all fields of a document.  The
+		/// tvx and tvd streams have to be positioned right before the pointer data.
+		/// </summary>
+		/// <param name="fieldCount">Number of fields of the document; must be at least 1</param>
+		/// <returns> One absolute tvf position per field
+		/// </returns>
+		private long[] ReadTvfPointers(int fieldCount)
+		{
+			// The first pointer is stored in full (in tvx from FORMAT_VERSION2 on,
+			// in tvd before); the remaining ones are deltas stored in tvd.
+			long filePointer = (format >= FORMAT_VERSION2) ? tvx.ReadLong() : tvd.ReadVLong();
+			
+			var pointers = new long[fieldCount];
+			pointers[0] = filePointer;
+			
+			int idx = 1;
+			while (idx < fieldCount)
+			{
+				filePointer += tvd.ReadVLong();
+				pointers[idx] = filePointer;
+				idx++;
+			}
+			
+			return pointers;
+		}
+		
+		/// <summary> Return all term vectors stored for this document, or null if
+		/// the reader has no term vector files or the document has no vectorized field.
+		/// </summary>
+		/// <param name="docNum">The document number to retrieve the vector for
+		/// </param>
+		/// <returns> All term frequency vectors
+		/// </returns>
+		/// <exception cref="System.IO.IOException">if there is an error reading the term vector files</exception>
+		public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
+		{
+			if (tvx == null)
+			{
+				// No tvx file
+				return null;
+			}
+			
+			SeekTvx(docNum);
+			tvd.Seek(tvx.ReadLong());
+			
+			int fieldCount = tvd.ReadVInt();
+			if (fieldCount == 0)
+			{
+				// No fields are vectorized for this document
+				return null;
+			}
+			
+			System.String[] fieldNames = ReadFields(fieldCount);
+			long[] pointers = ReadTvfPointers(fieldCount);
+			return ReadTermVectors(docNum, fieldNames, pointers);
+		}
+		
+		/// <summary>Feeds the term vectors of all vectorized fields of a document to
+		/// the given mapper.  Nothing happens when the reader has no term vector
+		/// files or the document has no vectorized field.
+		/// </summary>
+		/// <param name="docNumber">The document number to retrieve the vectors for
+		/// </param>
+		/// <param name="mapper">Receives the document number and the data of every field
+		/// </param>
+		public virtual void  Get(int docNumber, TermVectorMapper mapper)
+		{
+			if (tvx == null)
+			{
+				// No term vectors are available for this segment at all
+				return ;
+			}
+			
+			SeekTvx(docNumber);
+			tvd.Seek(tvx.ReadLong());
+			
+			int fieldCount = tvd.ReadVInt();
+			if (fieldCount == 0)
+			{
+				// No fields are vectorized for this document
+				return ;
+			}
+			
+			System.String[] fieldNames = ReadFields(fieldCount);
+			long[] pointers = ReadTvfPointers(fieldCount);
+			mapper.SetDocumentNumber(docNumber);
+			ReadTermVectors(fieldNames, pointers, mapper);
+		}
+		
+		
+		/// <summary>Materializes one vector per field, using a fresh
+		/// <see cref="ParallelArrayTermVectorMapper" /> for each of them.
+		/// </summary>
+		/// <param name="docNum">Document number handed to every mapper</param>
+		/// <param name="fields">Names of the fields to read</param>
+		/// <param name="tvfPointers">tvf position of each field, parallel to <paramref name="fields"/></param>
+		/// <returns> The vectors, parallel to <paramref name="fields"/>
+		/// </returns>
+		private SegmentTermVector[] ReadTermVectors(int docNum, System.String[] fields, long[] tvfPointers)
+		{
+			int fieldTotal = fields.Length;
+			var vectors = new SegmentTermVector[fieldTotal];
+			
+			for (int idx = 0; idx < fieldTotal; idx++)
+			{
+				ParallelArrayTermVectorMapper collector = new ParallelArrayTermVectorMapper();
+				collector.SetDocumentNumber(docNum);
+				ReadTermVector(fields[idx], tvfPointers[idx], collector);
+				
+				ITermFreqVector materialized = collector.MaterializeVector();
+				vectors[idx] = (SegmentTermVector) materialized;
+			}
+			
+			return vectors;
+		}
+		
+		/// <summary>Feeds the vectors of all given fields, in order, to one mapper.
+		/// </summary>
+		/// <param name="fields">Names of the fields to read</param>
+		/// <param name="tvfPointers">tvf position of each field, parallel to <paramref name="fields"/></param>
+		/// <param name="mapper">The mapper receiving the data of every field</param>
+		private void  ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
+		{
+			int idx = 0;
+			while (idx < fields.Length)
+			{
+				ReadTermVector(fields[idx], tvfPointers[idx], mapper);
+				idx++;
+			}
+		}
+		
+		
+		/// <summary>Decodes the term vector stored at the given tvf position and
+		/// reports it to the mapper: one <c>SetExpectations</c> call followed by one
+		/// <c>Map</c> call per term.  When the stored term count is 0 the mapper is
+		/// not called at all.
+		/// </summary>
+		/// <param name="field">The field to read in
+		/// </param>
+		/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
+		/// </param>
+		/// <param name="mapper">The mapper used to map the TermVector
+		/// </param>
+		/// <exception cref="System.IO.IOException">if reading from the tvf stream fails</exception>
+		private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
+		{
+			
+			// Now read the data from specified position
+			//We don't need to offset by the FORMAT here since the pointer already includes the offset
+			tvf.Seek(tvfPointer);
+			
+			int numTerms = tvf.ReadVInt();
+			// If no terms - return without notifying the mapper. However, this should never occur!
+			if (numTerms == 0)
+				return ;
+			
+			bool storePositions;
+			bool storeOffsets;
+			
+			if (format >= FORMAT_VERSION)
+			{
+				// A flags byte tells whether positions and/or offsets follow each term.
+				byte bits = tvf.ReadByte();
+				storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+				storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
+			}
+			else
+			{
+				// Older formats: one VInt is read and discarded, and neither positions
+				// nor offsets are expected.
+				// NOTE(review): the meaning of the skipped value is not visible here.
+				tvf.ReadVInt();
+				storePositions = false;
+				storeOffsets = false;
+			}
+			mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
+			int start = 0;
+			int deltaLength = 0;
+			int totalLength = 0;
+			byte[] byteBuffer;
+			char[] charBuffer;
+			bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
+			
+			// init the buffers; only the one matching the term encoding is used
+			if (preUTF8)
+			{
+				charBuffer = new char[10];
+				byteBuffer = null;
+			}
+			else
+			{
+				charBuffer = null;
+				byteBuffer = new byte[20];
+			}
+			
+			for (int i = 0; i < numTerms; i++)
+			{
+				// Terms share a prefix with their predecessor: the first 'start'
+				// units are reused from the buffer as left by the previous term,
+				// and 'deltaLength' new units are read behind them.
+				start = tvf.ReadVInt();
+				deltaLength = tvf.ReadVInt();
+				totalLength = start + deltaLength;
+				
+				System.String term;
+				
+				if (preUTF8)
+				{
+					// Term stored as java chars
+					if (charBuffer.Length < totalLength)
+					{
+						// Grow to 1.5x the needed length, keeping only the shared prefix.
+						char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
+						Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
+						charBuffer = newCharBuffer;
+					}
+					tvf.ReadChars(charBuffer, start, deltaLength);
+					term = new System.String(charBuffer, 0, totalLength);
+				}
+				else
+				{
+					// Term stored as utf8 bytes
+					if (byteBuffer.Length < totalLength)
+					{
+						// Grow to 1.5x the needed length, keeping only the shared prefix.
+						byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
+						Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
+						byteBuffer = newByteBuffer;
+					}
+					tvf.ReadBytes(byteBuffer, start, deltaLength);
+                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
+				}
+				int freq = tvf.ReadVInt();
+				int[] positions = null;
+				if (storePositions)
+				{
+					//read in the positions
+					//does the mapper even care about positions?
+					if (mapper.IsIgnoringPositions == false)
+					{
+						// Each stored value is added to the previous position.
+						positions = new int[freq];
+						int prevPosition = 0;
+						for (int j = 0; j < freq; j++)
+						{
+							positions[j] = prevPosition + tvf.ReadVInt();
+							prevPosition = positions[j];
+						}
+					}
+					else
+					{
+						//we need to skip over the positions.  Since these are VInts, I don't believe there is any way to know for sure how far to skip,
+						//so they are read one by one and discarded
+						for (int j = 0; j < freq; j++)
+						{
+							tvf.ReadVInt();
+						}
+					}
+				}
+				TermVectorOffsetInfo[] offsets = null;
+				if (storeOffsets)
+				{
+					//does the mapper even care about offsets?
+					if (mapper.IsIgnoringOffsets == false)
+					{
+						// Start offsets are added to the previous end offset,
+						// end offsets are added to their own start offset.
+						offsets = new TermVectorOffsetInfo[freq];
+						int prevOffset = 0;
+						for (int j = 0; j < freq; j++)
+						{
+							int startOffset = prevOffset + tvf.ReadVInt();
+							int endOffset = startOffset + tvf.ReadVInt();
+							offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
+							prevOffset = endOffset;
+						}
+					}
+					else
+					{
+						// Skip the two VInts of every offset pair.
+						for (int j = 0; j < freq; j++)
+						{
+							tvf.ReadVInt();
+							tvf.ReadVInt();
+						}
+					}
+				}
+				// positions/offsets stay null when they are not stored or the mapper ignores them
+				mapper.Map(term, freq, offsets, positions);
+			}
+		}
+		
+		/// <summary>Creates a copy of this reader.  All fields are copied member-wise;
+		/// when all three streams are present, the copy gets its own clones of them.
+		/// </summary>
+		/// <returns> The cloned <see cref="TermVectorsReader" />
+		/// </returns>
+		public virtual System.Object Clone()
+		{
+			var copy = (TermVectorsReader) MemberwiseClone();
+			
+			// These are null when a TermVectorsReader was created
+			// on a segment that did not have term vectors saved
+			bool hasStreams = tvx != null && tvd != null && tvf != null;
+			if (hasStreams)
+			{
+				copy.tvx = (IndexInput) tvx.Clone();
+				copy.tvd = (IndexInput) tvd.Clone();
+				copy.tvf = (IndexInput) tvf.Clone();
+			}
+			
+			return copy;
+		}
+	}
+	
+	
+	/// <summary> Models the existing parallel array structure: collects the terms,
+	/// frequencies, positions and offsets of one field into parallel arrays and
+	/// turns them into a <see cref="SegmentTermVector" />.  Every call to
+	/// <see cref="SetExpectations" /> starts a new field and discards whatever was
+	/// collected before.
+	/// </summary>
+	class ParallelArrayTermVectorMapper:TermVectorMapper
+	{
+		
+		private System.String[] terms;
+		private int[] termFreqs;
+		private int[][] positions;
+		private TermVectorOffsetInfo[][] offsets;
+		// Index of the slot the next Map call writes to
+		private int currentPosition;
+		private bool storingOffsets;
+		private bool storingPositions;
+		private System.String field;
+		
+		/// <summary>Prepares the mapper for the terms of one field.</summary>
+		/// <param name="field">Name of the field whose terms follow</param>
+		/// <param name="numTerms">Number of <see cref="Map" /> calls to expect</param>
+		/// <param name="storeOffsets">Whether offsets are stored for the field</param>
+		/// <param name="storePositions">Whether positions are stored for the field</param>
+		public override void  SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions)
+		{
+			this.field = field;
+			terms = new System.String[numTerms];
+			termFreqs = new int[numTerms];
+			this.storingOffsets = storeOffsets;
+			this.storingPositions = storePositions;
+			
+			// Reset everything left over from a previous field.  Without this, a
+			// reused mapper would keep writing behind the terms of the previous
+			// field (running past the freshly sized arrays) and could hand stale
+			// positions/offsets to the materialized vector.
+			this.positions = storePositions ? new int[numTerms][] : null;
+			this.offsets = storeOffsets ? new TermVectorOffsetInfo[numTerms][] : null;
+			currentPosition = 0;
+		}
+		
+		/// <summary>Stores one term in the next free slot.</summary>
+		/// <param name="term">The term text</param>
+		/// <param name="frequency">Frequency of the term</param>
+		/// <param name="offsets">Offsets of the term; only kept when offsets are expected</param>
+		/// <param name="positions">Positions of the term; only kept when positions are expected</param>
+		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
+		{
+			terms[currentPosition] = term;
+			termFreqs[currentPosition] = frequency;
+			if (storingOffsets)
+			{
+				this.offsets[currentPosition] = offsets;
+			}
+			if (storingPositions)
+			{
+				this.positions[currentPosition] = positions;
+			}
+			currentPosition++;
+		}
+		
+		/// <summary> Construct the vector</summary>
+		/// <returns> The <see cref="ITermFreqVector" /> based on the mappings, or null
+		/// when <see cref="SetExpectations" /> was never called.
+		/// </returns>
+		public virtual ITermFreqVector MaterializeVector()
+		{
+			if (field == null || terms == null)
+			{
+				return null;
+			}
+			
+			if (storingPositions || storingOffsets)
+			{
+				return new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+			}
+			return new SegmentTermVector(field, terms, termFreqs);
+		}
+	}
+}
+\ No newline at end of file