github.com/mono/Lucene.Net.Light.git

Diffstat (limited to 'src/core/Index/TermVectorsReader.cs')
-rw-r--r--  src/core/Index/TermVectorsReader.cs  731
1 file changed, 731 insertions, 0 deletions
diff --git a/src/core/Index/TermVectorsReader.cs b/src/core/Index/TermVectorsReader.cs
new file mode 100644
index 0000000..56cf764
--- /dev/null
+++ b/src/core/Index/TermVectorsReader.cs
@@ -0,0 +1,731 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
+using Directory = Lucene.Net.Store.Directory;
+using IndexInput = Lucene.Net.Store.IndexInput;
+
+namespace Lucene.Net.Index
+{
+ class TermVectorsReader : System.ICloneable, IDisposable
+ {
+
+ // NOTE: if you make a new format, it must be larger than
+ // the current format
+ internal const int FORMAT_VERSION = 2;
+
+ // Changes to speed up bulk merging of term vectors:
+ internal const int FORMAT_VERSION2 = 3;
+
+ // Changed strings to UTF8 with length-in-bytes not length-in-chars
+ internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
+
+ // NOTE: always change this if you switch to a new format!
+ internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
+
+ //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
+ internal const int FORMAT_SIZE = 4;
+
+ internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1);
+ internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2);
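+
+ // Illustrative note, not part of the original source: these flags are packed
+ // into the single byte read per field in ReadTermVector. For example, a flag
+ // byte of 0x3 has both bits set, so that field stores positions and offsets;
+ // 0x2 would mean offsets only.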
+
+ private FieldInfos fieldInfos;
+
+ private IndexInput tvx;
+ private IndexInput tvd;
+ private IndexInput tvf;
+ private int size;
+ private int numTotalDocs;
+
+ // The docID offset where our docs begin in the index
+ // file. This will be 0 if we have our own private file.
+ private int docStoreOffset;
+
+ private int format;
+ private bool isDisposed;
+
+ internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos):this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
+ {
+ }
+
+ internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize):this(d, segment, fieldInfos, readBufferSize, - 1, 0)
+ {
+ }
+
+ internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
+ {
+ bool success = false;
+
+ try
+ {
+ if (d.FileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION))
+ {
+ tvx = d.OpenInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
+ format = CheckValidFormat(tvx);
+ tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
+ int tvdFormat = CheckValidFormat(tvd);
+ tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
+ int tvfFormat = CheckValidFormat(tvf);
+
+ System.Diagnostics.Debug.Assert(format == tvdFormat);
+ System.Diagnostics.Debug.Assert(format == tvfFormat);
+
+ if (format >= FORMAT_VERSION2)
+ {
+ System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 16 == 0);
+ numTotalDocs = (int)(tvx.Length() >> 4);
+ }
+ else
+ {
+ System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 8 == 0);
+ numTotalDocs = (int)(tvx.Length() >> 3);
+ }
+
+ if (-1 == docStoreOffset)
+ {
+ this.docStoreOffset = 0;
+ this.size = numTotalDocs;
+ System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
+ }
+ else
+ {
+ this.docStoreOffset = docStoreOffset;
+ this.size = size;
+ // Verify the file is long enough to hold all of our
+ // docs
+ System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
+ }
+ }
+ else
+ {
+ // If all documents flushed in a segment had hit
+ // non-aborting exceptions, it's possible that
+ // FieldInfos.hasVectors returns true yet the term
+ // vector files don't exist.
+ format = 0;
+ }
+
+
+ this.fieldInfos = fieldInfos;
+ success = true;
+ }
+ finally
+ {
+ // With lock-less commits, it's entirely possible (and
+ // fine) to hit a FileNotFound exception above. In
+ // this case, we want to explicitly close any subset
+ // of things that were opened so that we don't have to
+ // wait for a GC to do so.
+ if (!success)
+ {
+ Dispose();
+ }
+ }
+ }
+
+ // Used for bulk copy when merging
+ internal virtual IndexInput GetTvdStream()
+ {
+ return tvd;
+ }
+
+ // Used for bulk copy when merging
+ internal virtual IndexInput GetTvfStream()
+ {
+ return tvf;
+ }
+
+ private void SeekTvx(int docNum)
+ {
+ if (format < FORMAT_VERSION2)
+ tvx.Seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
+ else
+ tvx.Seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+ }
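+
+ // Illustrative note, not part of the original source: for FORMAT_VERSION2 and
+ // later, each tvx entry holds two longs (a tvd pointer followed by a tvf
+ // pointer), i.e. 16 bytes per document; older formats store a single 8-byte
+ // pointer. As a worked example, assuming docNum = 5 and docStoreOffset = 2,
+ // the newer format seeks to (5 + 2) * 16 + FORMAT_SIZE = 112 + 4 = 116.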
+
+ internal virtual bool CanReadRawDocs()
+ {
+ return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
+ }
+
+ /// <summary>Retrieve the length (in bytes) of the tvd and tvf
+ /// entries for the next numDocs starting with
+ /// startDocID. This is used for bulk copying when
+ /// merging segments, if the field numbers are
+ /// congruent. Once this returns, the tvf &amp; tvd streams
+ /// are positioned at the startDocID.
+ /// </summary>
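+ /// <example>
+ /// A hypothetical caller, sketched here for illustration (array names and
+ /// sizes are assumptions):
+ /// <code>
+ /// int[] tvdLengths = new int[numDocs];
+ /// int[] tvfLengths = new int[numDocs];
+ /// reader.RawDocs(tvdLengths, tvfLengths, startDocID, numDocs);
+ /// // each entry now holds the number of bytes to bulk-copy from
+ /// // GetTvdStream() / GetTvfStream() for the corresponding document
+ /// </code>
+ /// </example>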
+ internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
+ {
+
+ if (tvx == null)
+ {
+ for (int i = 0; i < tvdLengths.Length; i++)
+ {
+ tvdLengths[i] = 0;
+ }
+ for (int i = 0; i < tvfLengths.Length; i++)
+ {
+ tvfLengths[i] = 0;
+ }
+ return ;
+ }
+
+ // SegmentMerger calls CanReadRawDocs() first and should
+ // not call us if that returns false.
+ if (format < FORMAT_VERSION2)
+ throw new System.SystemException("cannot read raw docs with older term vector formats");
+
+ SeekTvx(startDocID);
+
+ long tvdPosition = tvx.ReadLong();
+ tvd.Seek(tvdPosition);
+
+ long tvfPosition = tvx.ReadLong();
+ tvf.Seek(tvfPosition);
+
+ long lastTvdPosition = tvdPosition;
+ long lastTvfPosition = tvfPosition;
+
+ int count = 0;
+ while (count < numDocs)
+ {
+ int docID = docStoreOffset + startDocID + count + 1;
+ System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
+ if (docID < numTotalDocs)
+ {
+ tvdPosition = tvx.ReadLong();
+ tvfPosition = tvx.ReadLong();
+ }
+ else
+ {
+ tvdPosition = tvd.Length();
+ tvfPosition = tvf.Length();
+ System.Diagnostics.Debug.Assert(count == numDocs - 1);
+ }
+ tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
+ tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
+ count++;
+ lastTvdPosition = tvdPosition;
+ lastTvfPosition = tvfPosition;
+ }
+ }
+
+ private int CheckValidFormat(IndexInput in_Renamed)
+ {
+ int format = in_Renamed.ReadInt();
+ if (format > FORMAT_CURRENT)
+ {
+ throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FORMAT_CURRENT + " or less");
+ }
+ return format;
+ }
+
+ public void Dispose()
+ {
+ Dispose(true);
+ }
+
+ protected virtual void Dispose(bool disposing)
+ {
+ if (isDisposed) return;
+
+ if (disposing)
+ {
+ // make all effort to close up. Keep the first exception
+ // and throw it as a new one.
+ System.IO.IOException keep = null;
+ if (tvx != null)
+ try
+ {
+ tvx.Close();
+ }
+ catch (System.IO.IOException e)
+ {
+ if (keep == null)
+ keep = e;
+ }
+ if (tvd != null)
+ try
+ {
+ tvd.Close();
+ }
+ catch (System.IO.IOException e)
+ {
+ if (keep == null)
+ keep = e;
+ }
+ if (tvf != null)
+ try
+ {
+ tvf.Close();
+ }
+ catch (System.IO.IOException e)
+ {
+ if (keep == null)
+ keep = e;
+ }
+ if (keep != null)
+ {
+ throw new System.IO.IOException(keep.StackTrace);
+ }
+ }
+
+ isDisposed = true;
+ }
+
+ /// <summary>Returns the number of documents in the reader.</summary>
+ internal virtual int Size()
+ {
+ return size;
+ }
+
+ public virtual void Get(int docNum, System.String field, TermVectorMapper mapper)
+ {
+ if (tvx != null)
+ {
+ int fieldNumber = fieldInfos.FieldNumber(field);
+ //We need to account for the FORMAT_SIZE when seeking in the tvx.
+ //We don't need to do this in other seeks because we already have the
+ //file pointer that was written in another file.
+ SeekTvx(docNum);
+ //System.out.println("TVX Pointer: " + tvx.getFilePointer());
+ long tvdPosition = tvx.ReadLong();
+
+ tvd.Seek(tvdPosition);
+ int fieldCount = tvd.ReadVInt();
+ //System.out.println("Num Fields: " + fieldCount);
+ // There are only a few fields per document. We opt for a full scan
+ // rather than requiring that they be ordered. We need to read through
+ // all of the fields anyway to get to the tvf pointers.
+ int number = 0;
+ int found = - 1;
+ for (int i = 0; i < fieldCount; i++)
+ {
+ if (format >= FORMAT_VERSION)
+ number = tvd.ReadVInt();
+ else
+ number += tvd.ReadVInt();
+
+ if (number == fieldNumber)
+ found = i;
+ }
+
+ // This field, although valid in the segment, may not have been found in
+ // this particular document; only read its vector if it was found
+ if (found != -1)
+ {
+ // Compute position in the tvf file
+ long position;
+ if (format >= FORMAT_VERSION2)
+ position = tvx.ReadLong();
+ else
+ position = tvd.ReadVLong();
+ for (int i = 1; i <= found; i++)
+ position += tvd.ReadVLong();
+
+ mapper.SetDocumentNumber(docNum);
+ ReadTermVector(field, position, mapper);
+ }
+ else
+ {
+ //System.out.println("Fieldable not found");
+ }
+ }
+ else
+ {
+ //System.out.println("No tvx file");
+ }
+ }
+
+
+
+ /// <summary> Retrieve the term vector for the given document and field</summary>
+ /// <param name="docNum">The document number to retrieve the vector for
+ /// </param>
+ /// <param name="field">The field within the document to retrieve
+ /// </param>
+ /// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
+ /// </returns>
+ /// <throws> IOException if there is an error reading the term vector files </throws>
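+ /// <example>
+ /// A minimal usage sketch; the reader instance, docNum and the "contents"
+ /// field name are assumptions for illustration:
+ /// <code>
+ /// ITermFreqVector vector = reader.Get(docNum, "contents");
+ /// if (vector == null)
+ /// {
+ ///     // the field is either not vectorized or absent from this document
+ /// }
+ /// </code>
+ /// </example>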
+ public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field)
+ {
+ // Check if no term vectors are available for this segment at all
+ ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+ Get(docNum, field, mapper);
+
+ return mapper.MaterializeVector();
+ }
+
+ // Reads the String[] fields; you have to pre-seek tvd to
+ // the right point
+ private System.String[] ReadFields(int fieldCount)
+ {
+ int number = 0;
+ System.String[] fields = new System.String[fieldCount];
+
+ for (int i = 0; i < fieldCount; i++)
+ {
+ if (format >= FORMAT_VERSION)
+ number = tvd.ReadVInt();
+ else
+ number += tvd.ReadVInt();
+
+ fields[i] = fieldInfos.FieldName(number);
+ }
+
+ return fields;
+ }
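+
+ // Illustrative note, not part of the original source: for formats older than
+ // FORMAT_VERSION the field numbers are delta-encoded, so VInts 2 and 3 decode
+ // to field numbers 2 and 5; newer formats write each field number directly.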
+
+ // Reads the long[] offsets into TVF; you have to pre-seek
+ // tvx/tvd to the right point
+ private long[] ReadTvfPointers(int fieldCount)
+ {
+ // Compute position in the tvf file
+ long position;
+ if (format >= FORMAT_VERSION2)
+ position = tvx.ReadLong();
+ else
+ position = tvd.ReadVLong();
+
+ long[] tvfPointers = new long[fieldCount];
+ tvfPointers[0] = position;
+
+ for (int i = 1; i < fieldCount; i++)
+ {
+ position += tvd.ReadVLong();
+ tvfPointers[i] = position;
+ }
+
+ return tvfPointers;
+ }
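+
+ // Illustrative note, not part of the original source: only the first tvf
+ // pointer is absolute (read from tvx in FORMAT_VERSION2 and later, otherwise
+ // as a VLong from tvd); the remaining fields store VLong deltas. For example,
+ // an absolute position of 100 followed by deltas 40 and 25 yields the
+ // pointers {100, 140, 165}.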
+
+ /// <summary> Return all term vectors stored for this document or null if they could not be read in.
+ ///
+ /// </summary>
+ /// <param name="docNum">The document number to retrieve the vector for
+ /// </param>
+ /// <returns> All term frequency vectors
+ /// </returns>
+ /// <throws> IOException if there is an error reading the term vector files </throws>
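+ /// <example>
+ /// A usage sketch (reader and docNum are assumed):
+ /// <code>
+ /// ITermFreqVector[] vectors = reader.Get(docNum);
+ /// if (vectors != null)
+ /// {
+ ///     // one entry per field that stored a term vector for this document
+ /// }
+ /// </code>
+ /// </example>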
+ public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
+ {
+ ITermFreqVector[] result = null;
+ if (tvx != null)
+ {
+ //We need to offset by the FORMAT_SIZE when seeking in the tvx; SeekTvx accounts for this
+ SeekTvx(docNum);
+ long tvdPosition = tvx.ReadLong();
+
+ tvd.Seek(tvdPosition);
+ int fieldCount = tvd.ReadVInt();
+
+ // No fields are vectorized for this document
+ if (fieldCount != 0)
+ {
+ System.String[] fields = ReadFields(fieldCount);
+ long[] tvfPointers = ReadTvfPointers(fieldCount);
+ result = ReadTermVectors(docNum, fields, tvfPointers);
+ }
+ }
+ else
+ {
+ //System.out.println("No tvx file");
+ }
+ return result;
+ }
+
+ public virtual void Get(int docNumber, TermVectorMapper mapper)
+ {
+ // Check if no term vectors are available for this segment at all
+ if (tvx != null)
+ {
+ //We need to offset by the FORMAT_SIZE when seeking in the tvx; SeekTvx accounts for this
+
+ SeekTvx(docNumber);
+ long tvdPosition = tvx.ReadLong();
+
+ tvd.Seek(tvdPosition);
+ int fieldCount = tvd.ReadVInt();
+
+ // No fields are vectorized for this document
+ if (fieldCount != 0)
+ {
+ System.String[] fields = ReadFields(fieldCount);
+ long[] tvfPointers = ReadTvfPointers(fieldCount);
+ mapper.SetDocumentNumber(docNumber);
+ ReadTermVectors(fields, tvfPointers, mapper);
+ }
+ }
+ else
+ {
+ //System.out.println("No tvx file");
+ }
+ }
+
+
+ private SegmentTermVector[] ReadTermVectors(int docNum, System.String[] fields, long[] tvfPointers)
+ {
+ SegmentTermVector[] res = new SegmentTermVector[fields.Length];
+ for (int i = 0; i < fields.Length; i++)
+ {
+ var mapper = new ParallelArrayTermVectorMapper();
+ mapper.SetDocumentNumber(docNum);
+ ReadTermVector(fields[i], tvfPointers[i], mapper);
+ res[i] = (SegmentTermVector) mapper.MaterializeVector();
+ }
+ return res;
+ }
+
+ private void ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
+ {
+ for (int i = 0; i < fields.Length; i++)
+ {
+ ReadTermVector(fields[i], tvfPointers[i], mapper);
+ }
+ }
+
+
+ /// <summary>Reads the term vector for the given field from the tvf file and feeds it to the mapper.</summary>
+ /// <param name="field">The field to read in
+ /// </param>
+ /// <param name="tvfPointer">The pointer within the tvf file where we should start reading
+ /// </param>
+ /// <param name="mapper">The mapper used to map the TermVector
+ /// </param>
+ /// <throws> IOException </throws>
+ private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
+ {
+
+ // Now read the data from specified position
+ //We don't need to offset by the FORMAT here since the pointer already includes the offset
+ tvf.Seek(tvfPointer);
+
+ int numTerms = tvf.ReadVInt();
+ //System.out.println("Num Terms: " + numTerms);
+ // If there are no terms, there is nothing to map; however, this should never occur!
+ if (numTerms == 0)
+ return ;
+
+ bool storePositions;
+ bool storeOffsets;
+
+ if (format >= FORMAT_VERSION)
+ {
+ byte bits = tvf.ReadByte();
+ storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+ storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
+ }
+ else
+ {
+ tvf.ReadVInt();
+ storePositions = false;
+ storeOffsets = false;
+ }
+ mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
+ int start = 0;
+ int deltaLength = 0;
+ int totalLength = 0;
+ byte[] byteBuffer;
+ char[] charBuffer;
+ bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
+
+ // init the buffers
+ if (preUTF8)
+ {
+ charBuffer = new char[10];
+ byteBuffer = null;
+ }
+ else
+ {
+ charBuffer = null;
+ byteBuffer = new byte[20];
+ }
+
+ for (int i = 0; i < numTerms; i++)
+ {
+ start = tvf.ReadVInt();
+ deltaLength = tvf.ReadVInt();
+ totalLength = start + deltaLength;
+
+ System.String term;
+
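+ // Illustrative note, not part of the original source: terms are stored with
+ // simple prefix compression. 'start' is the length of the prefix shared with
+ // the previous term and 'deltaLength' is the number of new units read, so
+ // after "term" the term "terms" could be stored as start = 4 plus the suffix
+ // "s", giving totalLength = 5.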
+ if (preUTF8)
+ {
+ // Term stored as java chars
+ if (charBuffer.Length < totalLength)
+ {
+ char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
+ Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
+ charBuffer = newCharBuffer;
+ }
+ tvf.ReadChars(charBuffer, start, deltaLength);
+ term = new System.String(charBuffer, 0, totalLength);
+ }
+ else
+ {
+ // Term stored as utf8 bytes
+ if (byteBuffer.Length < totalLength)
+ {
+ byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
+ Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
+ byteBuffer = newByteBuffer;
+ }
+ tvf.ReadBytes(byteBuffer, start, deltaLength);
+ term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
+ }
+ int freq = tvf.ReadVInt();
+ int[] positions = null;
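+ // Illustrative note, not part of the original source: positions and offsets
+ // are delta-encoded VInts. Position deltas 3, 2, 7 decode to absolute
+ // positions 3, 5, 12; offsets are stored as start-delta/length pairs, so
+ // 0, 5 followed by 1, 5 decodes to the spans (0, 5) and (6, 11).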
+ if (storePositions)
+ {
+ //read in the positions
+ //does the mapper even care about positions?
+ if (mapper.IsIgnoringPositions == false)
+ {
+ positions = new int[freq];
+ int prevPosition = 0;
+ for (int j = 0; j < freq; j++)
+ {
+ positions[j] = prevPosition + tvf.ReadVInt();
+ prevPosition = positions[j];
+ }
+ }
+ else
+ {
+ //we need to skip over the positions. Since these are VInts, there is no way
+ //to know for sure how far to skip without reading them
+ for (int j = 0; j < freq; j++)
+ {
+ tvf.ReadVInt();
+ }
+ }
+ }
+ TermVectorOffsetInfo[] offsets = null;
+ if (storeOffsets)
+ {
+ //does the mapper even care about offsets?
+ if (mapper.IsIgnoringOffsets == false)
+ {
+ offsets = new TermVectorOffsetInfo[freq];
+ int prevOffset = 0;
+ for (int j = 0; j < freq; j++)
+ {
+ int startOffset = prevOffset + tvf.ReadVInt();
+ int endOffset = startOffset + tvf.ReadVInt();
+ offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
+ prevOffset = endOffset;
+ }
+ }
+ else
+ {
+ for (int j = 0; j < freq; j++)
+ {
+ tvf.ReadVInt();
+ tvf.ReadVInt();
+ }
+ }
+ }
+ mapper.Map(term, freq, offsets, positions);
+ }
+ }
+
+ public virtual System.Object Clone()
+ {
+
+ TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone();
+
+ // These are null when a TermVectorsReader was created
+ // on a segment that did not have term vectors saved
+ if (tvx != null && tvd != null && tvf != null)
+ {
+ clone.tvx = (IndexInput) tvx.Clone();
+ clone.tvd = (IndexInput) tvd.Clone();
+ clone.tvf = (IndexInput) tvf.Clone();
+ }
+
+ return clone;
+ }
+ }
+
+
+ /// <summary> Models the existing parallel array structure</summary>
+ class ParallelArrayTermVectorMapper:TermVectorMapper
+ {
+
+ private System.String[] terms;
+ private int[] termFreqs;
+ private int[][] positions;
+ private TermVectorOffsetInfo[][] offsets;
+ private int currentPosition;
+ private bool storingOffsets;
+ private bool storingPositions;
+ private System.String field;
+
+ public override void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions)
+ {
+ this.field = field;
+ terms = new System.String[numTerms];
+ termFreqs = new int[numTerms];
+ this.storingOffsets = storeOffsets;
+ this.storingPositions = storePositions;
+ if (storePositions)
+ this.positions = new int[numTerms][];
+ if (storeOffsets)
+ this.offsets = new TermVectorOffsetInfo[numTerms][];
+ }
+
+ public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
+ {
+ terms[currentPosition] = term;
+ termFreqs[currentPosition] = frequency;
+ if (storingOffsets)
+ {
+ this.offsets[currentPosition] = offsets;
+ }
+ if (storingPositions)
+ {
+ this.positions[currentPosition] = positions;
+ }
+ currentPosition++;
+ }
+
+ /// <summary> Construct the vector</summary>
+ /// <returns> The <see cref="ITermFreqVector" /> based on the mappings.
+ /// </returns>
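+ /// <example>
+ /// A sketch of how TermVectorsReader uses this class internally (mirrors
+ /// Get(int, String)):
+ /// <code>
+ /// var mapper = new ParallelArrayTermVectorMapper();
+ /// reader.Get(docNum, field, mapper);
+ /// ITermFreqVector tv = mapper.MaterializeVector();
+ /// </code>
+ /// </example>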
+ public virtual ITermFreqVector MaterializeVector()
+ {
+ SegmentTermVector tv = null;
+ if (field != null && terms != null)
+ {
+ if (storingPositions || storingOffsets)
+ {
+ tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+ }
+ else
+ {
+ tv = new SegmentTermVector(field, terms, termFreqs);
+ }
+ }
+ return tv;
+ }
+ }
+}
\ No newline at end of file