diff options
Diffstat (limited to 'src/core/Index/TermVectorsReader.cs')
-rw-r--r-- | src/core/Index/TermVectorsReader.cs | 731 |
1 files changed, 731 insertions, 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Reads term vectors for a segment from its three term vector files: the
    /// index file (tvx), the documents file (tvd) and the fields file (tvf).
    /// When the index file does not exist the reader is still constructed, but
    /// every stream is <c>null</c> and the lookup methods report no data.
    /// </summary>
    class TermVectorsReader : System.ICloneable, IDisposable
    {
        // NOTE: if you make a new format, it must be larger than
        // the current format
        internal const int FORMAT_VERSION = 2;

        // Changes to speed up bulk merging of term vectors:
        internal const int FORMAT_VERSION2 = 3;

        // Changed strings to UTF8 with length-in-bytes not length-in-chars
        internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

        // NOTE: always change this if you switch to a new format!
        internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

        // The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
        internal const int FORMAT_SIZE = 4;

        // Bit flags read from the per-field flag byte in the tvf file.
        internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1);
        internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2);

        private FieldInfos fieldInfos;

        private IndexInput tvx;
        private IndexInput tvd;
        private IndexInput tvf;
        private int size;
        private int numTotalDocs;

        // The docID offset where our docs begin in the index
        // file.  This will be 0 if we have our own private file.
        private int docStoreOffset;

        private int format;
        private bool isDisposed;

        /// <summary>Opens the reader with the default read buffer size.</summary>
        internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos)
            : this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
        {
        }

        /// <summary>
        /// Opens the reader with the given read buffer size, treating the files as
        /// private to this segment (doc store offset of -1).
        /// </summary>
        internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize)
            : this(d, segment, fieldInfos, readBufferSize, -1, 0)
        {
        }

        /// <summary>
        /// Opens the tvx, tvd and tvf files of <paramref name="segment"/> when the
        /// tvx file exists, and validates the format header of each.
        /// </summary>
        /// <param name="d">Directory holding the segment files.</param>
        /// <param name="segment">Segment name used as the file name prefix.</param>
        /// <param name="fieldInfos">Field metadata used to map field names and numbers.</param>
        /// <param name="readBufferSize">Buffer size passed to <c>OpenInput</c>.</param>
        /// <param name="docStoreOffset">First document of this reader within the files,
        /// or -1 when the files belong to this segment alone.</param>
        /// <param name="size">Number of documents; only used when
        /// <paramref name="docStoreOffset"/> is not -1.</param>
        internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
        {
            bool success = false;

            try
            {
                if (d.FileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION))
                {
                    tvx = d.OpenInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
                    format = CheckValidFormat(tvx);
                    tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
                    int tvdFormat = CheckValidFormat(tvd);
                    tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
                    int tvfFormat = CheckValidFormat(tvf);

                    System.Diagnostics.Debug.Assert(format == tvdFormat);
                    System.Diagnostics.Debug.Assert(format == tvfFormat);

                    // From FORMAT_VERSION2 on, each tvx entry is 16 bytes (two longs);
                    // older formats use 8 bytes (one long).
                    if (format >= FORMAT_VERSION2)
                    {
                        System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 16 == 0);
                        numTotalDocs = (int) (tvx.Length() >> 4);
                    }
                    else
                    {
                        System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 8 == 0);
                        numTotalDocs = (int) (tvx.Length() >> 3);
                    }

                    if (-1 == docStoreOffset)
                    {
                        this.docStoreOffset = 0;
                        this.size = numTotalDocs;
                        // 'size' here is the constructor argument, not the field just assigned.
                        System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
                    }
                    else
                    {
                        this.docStoreOffset = docStoreOffset;
                        this.size = size;
                        // Verify the file is long enough to hold all of our
                        // docs
                        System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
                    }
                }
                else
                {
                    // If all documents flushed in a segment had hit
                    // non-aborting exceptions, it's possible that
                    // FieldInfos.hasVectors returns true yet the term
                    // vector files don't exist.
                    format = 0;
                }

                this.fieldInfos = fieldInfos;
                success = true;
            }
            finally
            {
                // With lock-less commits, it's entirely possible (and
                // fine) to hit a FileNotFound exception above. In
                // this case, we want to explicitly close any subset
                // of things that were opened so that we don't have to
                // wait for a GC to do so.
                if (!success)
                {
                    Dispose();
                }
            }
        }

        // Used for bulk copy when merging
        internal virtual IndexInput GetTvdStream()
        {
            return tvd;
        }

        // Used for bulk copy when merging
        internal virtual IndexInput GetTvfStream()
        {
            return tvf;
        }

        /// <summary>
        /// Positions the tvx stream at the entry of <paramref name="docNum"/>,
        /// skipping the format header and applying the doc store offset.
        /// </summary>
        private void SeekTvx(int docNum)
        {
            if (format < FORMAT_VERSION2)
                tvx.Seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
            else
                tvx.Seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
        }

        /// <summary>Returns true when the files use the UTF8 length-in-bytes format or newer.</summary>
        internal virtual bool CanReadRawDocs()
        {
            return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
        }

        /// <summary>Retrieve the length (in bytes) of the tvd and tvf
        /// entries for the next numDocs starting with
        /// startDocID.  This is used for bulk copying when
        /// merging segments, if the field numbers are
        /// congruent.  Once this returns, the tvf and tvd streams
        /// are seeked to the startDocID.
        /// </summary>
        /// <param name="tvdLengths">Receives the tvd entry length of each document.</param>
        /// <param name="tvfLengths">Receives the tvf entry length of each document.</param>
        /// <param name="startDocID">First document to measure.</param>
        /// <param name="numDocs">Number of consecutive documents to measure.</param>
        /// <exception cref="System.InvalidOperationException">The files use a format older
        /// than <see cref="FORMAT_VERSION2"/>.</exception>
        internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
        {
            if (tvx == null)
            {
                // No term vector files: every document has zero-length entries.
                Array.Clear(tvdLengths, 0, tvdLengths.Length);
                Array.Clear(tvfLengths, 0, tvfLengths.Length);
                return;
            }

            // SegmentMerger calls canReadRawDocs() first and should
            // not call us if that returns false.
            if (format < FORMAT_VERSION2)
                throw new System.InvalidOperationException("cannot read raw docs with older term vector formats");

            SeekTvx(startDocID);

            long tvdPosition = tvx.ReadLong();
            tvd.Seek(tvdPosition);

            long tvfPosition = tvx.ReadLong();
            tvf.Seek(tvfPosition);

            long lastTvdPosition = tvdPosition;
            long lastTvfPosition = tvfPosition;

            int count = 0;
            while (count < numDocs)
            {
                // An entry's length is the distance to the next document's pointers,
                // or to the end of the file for the last document.
                int docID = docStoreOffset + startDocID + count + 1;
                System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
                if (docID < numTotalDocs)
                {
                    tvdPosition = tvx.ReadLong();
                    tvfPosition = tvx.ReadLong();
                }
                else
                {
                    tvdPosition = tvd.Length();
                    tvfPosition = tvf.Length();
                    System.Diagnostics.Debug.Assert(count == numDocs - 1);
                }
                tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
                tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
                count++;
                lastTvdPosition = tvdPosition;
                lastTvfPosition = tvfPosition;
            }
        }

        /// <summary>
        /// Reads the format header from <paramref name="in_Renamed"/> and rejects
        /// versions newer than <see cref="FORMAT_CURRENT"/>.
        /// </summary>
        /// <returns>The format version read from the stream.</returns>
        private int CheckValidFormat(IndexInput in_Renamed)
        {
            int fileFormat = in_Renamed.ReadInt();
            if (fileFormat > FORMAT_CURRENT)
            {
                throw new CorruptIndexException("Incompatible format version: " + fileFormat + " expected " + FORMAT_CURRENT + " or less");
            }
            return fileFormat;
        }

        /// <summary>Closes the tvx, tvd and tvf streams.</summary>
        /// <exception cref="System.IO.IOException">At least one stream failed to close; the
        /// first failure is available as the inner exception.</exception>
        public void Dispose()
        {
            Dispose(true);
        }

        /// <summary>
        /// Closes every open stream when <paramref name="disposing"/> is true. All three
        /// streams are attempted even if an earlier one fails.
        /// </summary>
        protected virtual void Dispose(bool disposing)
        {
            if (isDisposed) return;

            // Mark the reader as disposed before closing anything, so a failing
            // Close() cannot lead to the streams being closed a second time by a
            // later Dispose() call.
            isDisposed = true;

            if (!disposing) return;

            // make all effort to close up. Keep the first exception
            // and throw it as a new one.
            System.IO.IOException keep = null;
            keep = CloseInput(tvx, keep);
            keep = CloseInput(tvd, keep);
            keep = CloseInput(tvf, keep);

            if (keep != null)
            {
                // Preserve the original message and chain the original exception
                // so the root cause is not lost.
                throw new System.IO.IOException(keep.Message, keep);
            }
        }

        /// <summary>
        /// Closes <paramref name="input"/> if it is not null and returns the first
        /// IOException seen so far (either <paramref name="firstFailure"/> or the one
        /// raised by this close).
        /// </summary>
        private static System.IO.IOException CloseInput(IndexInput input, System.IO.IOException firstFailure)
        {
            if (input == null)
                return firstFailure;

            try
            {
                input.Close();
            }
            catch (System.IO.IOException e)
            {
                if (firstFailure == null)
                    return e;
            }
            return firstFailure;
        }

        /// <summary> </summary>
        /// <returns> The number of documents in the reader
        /// </returns>
        internal virtual int Size()
        {
            return size;
        }

        /// <summary>
        /// Feeds the term vector of one field of one document to <paramref name="mapper"/>.
        /// Nothing is reported when the segment has no term vector files or the
        /// document has no vector for the field.
        /// </summary>
        /// <param name="docNum">The document number to retrieve the vector for.</param>
        /// <param name="field">The field within the document to retrieve.</param>
        /// <param name="mapper">Receives the document number and the term data.</param>
        public virtual void Get(int docNum, System.String field, TermVectorMapper mapper)
        {
            // No tvx file: this segment has no term vectors at all.
            if (tvx == null)
                return;

            int fieldNumber = fieldInfos.FieldNumber(field);
            // We need to account for the FORMAT_SIZE when seeking in the tvx.
            // We don't need to do this in other seeks because we already have the
            // file pointer that was written in another file.
            SeekTvx(docNum);
            long tvdPosition = tvx.ReadLong();

            tvd.Seek(tvdPosition);
            int fieldCount = tvd.ReadVInt();
            // There are only a few fields per document. We opt for a full scan
            // rather than requiring that they be ordered. We need to read through
            // all of the fields anyway to get to the tvf pointers.
            int number = 0;
            int found = -1;
            for (int i = 0; i < fieldCount; i++)
            {
                // Older formats store field numbers as deltas.
                if (format >= FORMAT_VERSION)
                    number = tvd.ReadVInt();
                else
                    number += tvd.ReadVInt();

                if (number == fieldNumber)
                    found = i;
            }

            // This field, although valid in the segment, was not found in this
            // document
            if (found == -1)
                return;

            // Compute position in the tvf file
            long position;
            if (format >= FORMAT_VERSION2)
                position = tvx.ReadLong();
            else
                position = tvd.ReadVLong();
            for (int i = 1; i <= found; i++)
                position += tvd.ReadVLong();

            mapper.SetDocumentNumber(docNum);
            ReadTermVector(field, position, mapper);
        }

        /// <summary> Retrieve the term vector for the given document and field</summary>
        /// <param name="docNum">The document number to retrieve the vector for
        /// </param>
        /// <param name="field">The field within the document to retrieve
        /// </param>
        /// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
        /// </returns>
        /// <exception cref="System.IO.IOException">If there is an error reading the term vector files.</exception>
        public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field)
        {
            // The mapper stays empty (and materializes to null) when no term
            // vectors are available for this segment or field.
            ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
            Get(docNum, field, mapper);

            return mapper.MaterializeVector();
        }

        // Reads the String[] fields; you have to pre-seek tvd to
        // the right point
        private System.String[] ReadFields(int fieldCount)
        {
            int number = 0;
            System.String[] fields = new System.String[fieldCount];

            for (int i = 0; i < fieldCount; i++)
            {
                if (format >= FORMAT_VERSION)
                    number = tvd.ReadVInt();
                else
                    number += tvd.ReadVInt();

                fields[i] = fieldInfos.FieldName(number);
            }

            return fields;
        }

        // Reads the long[] offsets into TVF; you have to pre-seek
        // tvx/tvd to the right point
        private long[] ReadTvfPointers(int fieldCount)
        {
            // Compute position in the tvf file
            long position;
            if (format >= FORMAT_VERSION2)
                position = tvx.ReadLong();
            else
                position = tvd.ReadVLong();

            long[] tvfPointers = new long[fieldCount];
            tvfPointers[0] = position;

            // Pointers after the first are stored as deltas in the tvd file.
            for (int i = 1; i < fieldCount; i++)
            {
                position += tvd.ReadVLong();
                tvfPointers[i] = position;
            }

            return tvfPointers;
        }

        /// <summary> Return all term vectors stored for this document, or null if the
        /// segment has no term vector files or the document has no vectorized fields.
        /// </summary>
        /// <param name="docNum">The document number to retrieve the vector for
        /// </param>
        /// <returns> All term frequency vectors
        /// </returns>
        /// <exception cref="System.IO.IOException">If there is an error reading the term vector files.</exception>
        public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
        {
            ITermFreqVector[] result = null;
            if (tvx != null)
            {
                SeekTvx(docNum);
                long tvdPosition = tvx.ReadLong();

                tvd.Seek(tvdPosition);
                int fieldCount = tvd.ReadVInt();

                // A count of zero means no fields are vectorized for this document
                if (fieldCount != 0)
                {
                    System.String[] fields = ReadFields(fieldCount);
                    long[] tvfPointers = ReadTvfPointers(fieldCount);
                    result = ReadTermVectors(docNum, fields, tvfPointers);
                }
            }
            return result;
        }

        /// <summary>
        /// Feeds the term vectors of every vectorized field of a document to
        /// <paramref name="mapper"/>. Nothing is reported when the segment has no
        /// term vector files or the document has no vectorized fields.
        /// </summary>
        /// <param name="docNumber">The document number to retrieve the vectors for.</param>
        /// <param name="mapper">Receives the document number and the term data.</param>
        public virtual void Get(int docNumber, TermVectorMapper mapper)
        {
            // Check if no term vectors are available for this segment at all
            if (tvx == null)
                return;

            SeekTvx(docNumber);
            long tvdPosition = tvx.ReadLong();

            tvd.Seek(tvdPosition);
            int fieldCount = tvd.ReadVInt();

            // A count of zero means no fields are vectorized for this document
            if (fieldCount != 0)
            {
                System.String[] fields = ReadFields(fieldCount);
                long[] tvfPointers = ReadTvfPointers(fieldCount);
                mapper.SetDocumentNumber(docNumber);
                ReadTermVectors(fields, tvfPointers, mapper);
            }
        }

        // Materializes one vector per field, using a fresh mapper for each.
        private SegmentTermVector[] ReadTermVectors(int docNum, System.String[] fields, long[] tvfPointers)
        {
            SegmentTermVector[] res = new SegmentTermVector[fields.Length];
            for (int i = 0; i < fields.Length; i++)
            {
                var mapper = new ParallelArrayTermVectorMapper();
                mapper.SetDocumentNumber(docNum);
                ReadTermVector(fields[i], tvfPointers[i], mapper);
                res[i] = (SegmentTermVector) mapper.MaterializeVector();
            }
            return res;
        }

        // Streams every field's vector into the single caller-supplied mapper.
        private void ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
        {
            for (int i = 0; i < fields.Length; i++)
            {
                ReadTermVector(fields[i], tvfPointers[i], mapper);
            }
        }

        /// <summary>Reads one field's term vector from the tvf file and reports it to the mapper.</summary>
        /// <param name="field">The field to read in
        /// </param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading
        /// </param>
        /// <param name="mapper">The mapper used to map the TermVector
        /// </param>
        /// <exception cref="System.IO.IOException">If reading the tvf file fails.</exception>
        private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
        {
            // Now read the data from specified position
            // We don't need to offset by the FORMAT here since the pointer already includes the offset
            tvf.Seek(tvfPointer);

            int numTerms = tvf.ReadVInt();
            // If no terms - nothing is reported to the mapper. However, this should never occur!
            if (numTerms == 0)
                return;

            bool storePositions;
            bool storeOffsets;

            if (format >= FORMAT_VERSION)
            {
                byte bits = tvf.ReadByte();
                storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }
            else
            {
                // Oldest format: one VInt is read and discarded; no positions or offsets.
                tvf.ReadVInt();
                storePositions = false;
                storeOffsets = false;
            }
            mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);

            byte[] byteBuffer;
            char[] charBuffer;
            bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

            // init the buffers
            if (preUTF8)
            {
                charBuffer = new char[10];
                byteBuffer = null;
            }
            else
            {
                charBuffer = null;
                byteBuffer = new byte[20];
            }

            for (int i = 0; i < numTerms; i++)
            {
                // Terms are prefix-compressed: 'start' units are shared with the
                // previous term (still in the buffer), 'deltaLength' units follow.
                int start = tvf.ReadVInt();
                int deltaLength = tvf.ReadVInt();
                int totalLength = start + deltaLength;

                System.String term;

                if (preUTF8)
                {
                    // Term stored as java chars
                    if (charBuffer.Length < totalLength)
                    {
                        char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
                        Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
                        charBuffer = newCharBuffer;
                    }
                    tvf.ReadChars(charBuffer, start, deltaLength);
                    term = new System.String(charBuffer, 0, totalLength);
                }
                else
                {
                    // Term stored as utf8 bytes
                    if (byteBuffer.Length < totalLength)
                    {
                        byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
                        Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
                        byteBuffer = newByteBuffer;
                    }
                    tvf.ReadBytes(byteBuffer, start, deltaLength);
                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
                }

                int freq = tvf.ReadVInt();
                int[] positions = null;
                if (storePositions)
                {
                    // read in the positions
                    // does the mapper even care about positions?
                    if (mapper.IsIgnoringPositions == false)
                    {
                        positions = new int[freq];
                        int prevPosition = 0;
                        for (int j = 0; j < freq; j++)
                        {
                            // Positions are delta-encoded.
                            positions[j] = prevPosition + tvf.ReadVInt();
                            prevPosition = positions[j];
                        }
                    }
                    else
                    {
                        // We need to skip over the positions. Since these are VInts,
                        // there is no way to know how far to skip without reading them.
                        for (int j = 0; j < freq; j++)
                        {
                            tvf.ReadVInt();
                        }
                    }
                }

                TermVectorOffsetInfo[] offsets = null;
                if (storeOffsets)
                {
                    // does the mapper even care about offsets?
                    if (mapper.IsIgnoringOffsets == false)
                    {
                        offsets = new TermVectorOffsetInfo[freq];
                        int prevOffset = 0;
                        for (int j = 0; j < freq; j++)
                        {
                            // Start is a delta from the previous end; end is a delta from start.
                            int startOffset = prevOffset + tvf.ReadVInt();
                            int endOffset = startOffset + tvf.ReadVInt();
                            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                            prevOffset = endOffset;
                        }
                    }
                    else
                    {
                        for (int j = 0; j < freq; j++)
                        {
                            tvf.ReadVInt();
                            tvf.ReadVInt();
                        }
                    }
                }
                mapper.Map(term, freq, offsets, positions);
            }
        }

        /// <summary>
        /// Returns a member-wise copy of this reader. When all three streams are open,
        /// the copy receives its own clones of them.
        /// </summary>
        public virtual System.Object Clone()
        {
            TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone();

            // These are null when a TermVectorsReader was created
            // on a segment that did not have term vectors saved
            if (tvx != null && tvd != null && tvf != null)
            {
                clone.tvx = (IndexInput) tvx.Clone();
                clone.tvd = (IndexInput) tvd.Clone();
                clone.tvf = (IndexInput) tvf.Clone();
            }

            return clone;
        }
    }


    /// <summary> Models the existing parallel array structure</summary>
    class ParallelArrayTermVectorMapper : TermVectorMapper
    {
        private System.String[] terms;
        private int[] termFreqs;
        private int[][] positions;
        private TermVectorOffsetInfo[][] offsets;
        private int currentPosition;
        private bool storingOffsets;
        private bool storingPositions;
        private System.String field;

        /// <summary>
        /// Prepares the parallel arrays for a field with <paramref name="numTerms"/> terms.
        /// Any state left over from a previously mapped field is discarded.
        /// </summary>
        public override void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions)
        {
            this.field = field;
            terms = new System.String[numTerms];
            termFreqs = new int[numTerms];
            this.storingOffsets = storeOffsets;
            this.storingPositions = storePositions;
            // Always reassign both arrays so that data from an earlier field can
            // never leak into the vector built for this one.
            this.positions = storePositions ? new int[numTerms][] : null;
            this.offsets = storeOffsets ? new TermVectorOffsetInfo[numTerms][] : null;
            // Restart the write cursor; otherwise a reused mapper would write
            // past the end of the freshly sized arrays.
            currentPosition = 0;
        }

        /// <summary>Records the next term of the current field in the parallel arrays.</summary>
        public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
        {
            terms[currentPosition] = term;
            termFreqs[currentPosition] = frequency;
            if (storingOffsets)
            {
                this.offsets[currentPosition] = offsets;
            }
            if (storingPositions)
            {
                this.positions[currentPosition] = positions;
            }
            currentPosition++;
        }

        /// <summary> Construct the vector</summary>
        /// <returns> The <see cref="ITermFreqVector" /> based on the mappings, or null
        /// when <c>SetExpectations</c> has not supplied a field.
        /// </returns>
        public virtual ITermFreqVector MaterializeVector()
        {
            SegmentTermVector tv = null;
            if (field != null && terms != null)
            {
                if (storingPositions || storingOffsets)
                {
                    tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
                }
                else
                {
                    tv = new SegmentTermVector(field, terms, termFreqs);
                }
            }
            return tv;
        }
    }
}
\ No newline at end of file |