Diffstat (limited to 'src/core/Index/SegmentMerger.cs')
-rw-r--r-- | src/core/Index/SegmentMerger.cs | 934
1 file changed, 934 insertions, 0 deletions
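For orientation before the diff itself: the class added below is driven through a small Add/Merge lifecycle. A minimal sketch of that flow follows; the directory path, segment name, and the reader1/reader2 variables are hypothetical stand-ins, not part of the commit.

    // Hypothetical driver for the SegmentMerger added below (sketch only).
    // The "index" path, "_m" segment name, and reader1/reader2 are assumed inputs.
    Directory dir = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo("index"));
    SegmentMerger merger = new SegmentMerger(dir, "_m"); // test-code ctor from the diff
    merger.Add(reader1);                                 // queue segments to combine
    merger.Add(reader2);
    int mergedDocs = merger.Merge();                     // merges fields, terms, norms, vectors
    merger.CloseReaders();                               // dispose the source readers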
diff --git a/src/core/Index/SegmentMerger.cs b/src/core/Index/SegmentMerger.cs
new file mode 100644
index 0000000..0ab159d
--- /dev/null
+++ b/src/core/Index/SegmentMerger.cs
@@ -0,0 +1,934 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using Document = Lucene.Net.Documents.Document;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
+using FieldSelectorResult = Lucene.Net.Documents.FieldSelectorResult;
+using FieldOption = Lucene.Net.Index.IndexReader.FieldOption;
+using MergeAbortedException = Lucene.Net.Index.MergePolicy.MergeAbortedException;
+using Directory = Lucene.Net.Store.Directory;
+using IndexInput = Lucene.Net.Store.IndexInput;
+using IndexOutput = Lucene.Net.Store.IndexOutput;
+
+namespace Lucene.Net.Index
+{
+
+    /// <summary> The SegmentMerger class combines two or more Segments, represented by an IndexReader (<see cref="Add" />),
+    /// into a single Segment.  After adding the appropriate readers, call the merge method to combine the
+    /// segments.
+    /// <p/>
+    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
+    ///
+    ///
+    /// </summary>
+    /// <seealso cref="Merge()">
+    /// </seealso>
+    /// <seealso cref="Add">
+    /// </seealso>
+    public sealed class SegmentMerger
+    {
+        private class AnonymousClassCheckAbort : CheckAbort
+        {
+            private void InitBlock(SegmentMerger enclosingInstance)
+            {
+                this.enclosingInstance = enclosingInstance;
+            }
+            private SegmentMerger enclosingInstance;
+            public SegmentMerger Enclosing_Instance
+            {
+                get
+                {
+                    return enclosingInstance;
+                }
+            }
+            internal AnonymousClassCheckAbort(SegmentMerger enclosingInstance, Lucene.Net.Index.MergePolicy.OneMerge Param1, Lucene.Net.Store.Directory Param2) : base(Param1, Param2)
+            {
+                InitBlock(enclosingInstance);
+            }
+            public override void Work(double units)
+            {
+                // do nothing
+            }
+        }
+        private class AnonymousClassCheckAbort1 : CheckAbort
+        {
+            private void InitBlock(SegmentMerger enclosingInstance)
+            {
+                this.enclosingInstance = enclosingInstance;
+            }
+            private SegmentMerger enclosingInstance;
+            public SegmentMerger Enclosing_Instance
+            {
+                get
+                {
+                    return enclosingInstance;
+                }
+            }
+            internal AnonymousClassCheckAbort1(SegmentMerger enclosingInstance, Lucene.Net.Index.MergePolicy.OneMerge Param1, Lucene.Net.Store.Directory Param2) : base(Param1, Param2)
+            {
+                InitBlock(enclosingInstance);
+            }
+            public override void Work(double units)
+            {
+                // do nothing
+            }
+        }
+
+        private void InitBlock()
+        {
+            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
+        }
+
+        /// <summary>norms header placeholder </summary>
+        internal static readonly byte[] NORMS_HEADER = new byte[]{(byte) 'N', (byte) 'R', (byte) 'M', unchecked((byte) - 1)};
+
+        private Directory directory;
+        private System.String segment;
+        private int termIndexInterval;
+
+        private IList<IndexReader> readers = new List<IndexReader>();
+        private FieldInfos fieldInfos;
+
+        private int mergedDocs;
+
+        private CheckAbort checkAbort;
+
+        // Whether we should merge doc stores (stored fields and
+        // vectors files).  When all segments we are merging
+        // already share the same doc store files, we don't need
+        // to merge the doc stores.
+        private bool mergeDocStores;
+
+        /// <summary>Maximum number of contiguous documents to bulk-copy
+        /// when merging stored fields
+        /// </summary>
+        private const int MAX_RAW_MERGE_DOCS = 4192;
+
+        /// <summary>This ctor used only by test code.
+        ///
+        /// </summary>
+        /// <param name="dir">The Directory to merge the other segments into
+        /// </param>
+        /// <param name="name">The name of the new segment
+        /// </param>
+        public /*internal*/ SegmentMerger(Directory dir, System.String name)
+        {
+            InitBlock();
+            directory = dir;
+            segment = name;
+            checkAbort = new AnonymousClassCheckAbort(this, null, null);
+        }
+
+        internal SegmentMerger(IndexWriter writer, System.String name, MergePolicy.OneMerge merge)
+        {
+            InitBlock();
+            directory = writer.Directory;
+            segment = name;
+            if (merge != null)
+            {
+                checkAbort = new CheckAbort(merge, directory);
+            }
+            else
+            {
+                checkAbort = new AnonymousClassCheckAbort1(this, null, null);
+            }
+            termIndexInterval = writer.TermIndexInterval;
+        }
+
+        internal bool HasProx()
+        {
+            return fieldInfos.HasProx();
+        }
+
+        /// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
+        /// <param name="reader">
+        /// </param>
+        public /*internal*/ void Add(IndexReader reader)
+        {
+            readers.Add(reader);
+        }
+
+        /// <summary> </summary>
+        /// <param name="i">The index of the reader to return
+        /// </param>
+        /// <returns> The ith reader to be merged
+        /// </returns>
+        internal IndexReader SegmentReader(int i)
+        {
+            return readers[i];
+        }
+
+        /// <summary> Merges the readers specified by the <see cref="Add" /> method into the directory passed to the constructor</summary>
+        /// <returns> The number of documents that were merged
+        /// </returns>
+        /// <throws> CorruptIndexException if the index is corrupt </throws>
+        /// <throws> IOException if there is a low-level IO error </throws>
+        public /*internal*/ int Merge()
+        {
+            return Merge(true);
+        }
+
+        /// <summary> Merges the readers specified by the <see cref="Add" /> method
+        /// into the directory passed to the constructor.
+        /// </summary>
+        /// <param name="mergeDocStores">if false, we will not merge the
+        /// stored fields nor vectors files
+        /// </param>
+        /// <returns> The number of documents that were merged
+        /// </returns>
+        /// <throws> CorruptIndexException if the index is corrupt </throws>
+        /// <throws> IOException if there is a low-level IO error </throws>
+        internal int Merge(bool mergeDocStores)
+        {
+
+            this.mergeDocStores = mergeDocStores;
+
+            // NOTE: it's important to add calls to
+            // checkAbort.work(...) if you make any changes to this
+            // method that will spend a lot of time.  The frequency
+            // of this check impacts how long
+            // IndexWriter.close(false) takes to actually stop the
+            // threads.
+
+            mergedDocs = MergeFields();
+            MergeTerms();
+            MergeNorms();
+
+            if (mergeDocStores && fieldInfos.HasVectors())
+                MergeVectors();
+
+            return mergedDocs;
+        }
+
+        /// <summary> close all IndexReaders that have been added.
+        /// Should not be called before merge().
+        /// </summary>
+        /// <throws> IOException </throws>
+        internal void CloseReaders()
+        {
+            foreach (IndexReader reader in readers)
+            {
+                reader.Dispose();
+            }
+        }
+
+        internal ICollection<string> GetMergedFiles()
+        {
+            ISet<string> fileSet = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>();
+
+            // Basic files
+            for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.Length; i++)
+            {
+                System.String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
+
+                if (ext.Equals(IndexFileNames.PROX_EXTENSION) && !HasProx())
+                    continue;
+
+                if (mergeDocStores || (!ext.Equals(IndexFileNames.FIELDS_EXTENSION) && !ext.Equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
+                    fileSet.Add(segment + "." + ext);
+            }
+
+            // Fieldable norm files
+            for (int i = 0; i < fieldInfos.Size(); i++)
+            {
+                FieldInfo fi = fieldInfos.FieldInfo(i);
+                if (fi.isIndexed && !fi.omitNorms)
+                {
+                    fileSet.Add(segment + "." + IndexFileNames.NORMS_EXTENSION);
+                    break;
+                }
+            }
+
+            // Vector files
+            if (fieldInfos.HasVectors() && mergeDocStores)
+            {
+                for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.Length; i++)
+                {
+                    fileSet.Add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
+                }
+            }
+
+            return fileSet;
+        }
+
+        public /*internal*/ ICollection<string> CreateCompoundFile(System.String fileName)
+        {
+            ICollection<string> files = GetMergedFiles();
+            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort);
+
+            // Now merge all added files
+            foreach (var file in files)
+            {
+                cfsWriter.AddFile(file);
+            }
+
+            // Perform the merge
+            cfsWriter.Close();
+
+            return files;
+        }
+
+        private void AddIndexed(IndexReader reader, FieldInfos fInfos, ICollection<string> names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool storePayloads, bool omitTFAndPositions)
+        {
+            foreach (var field in names)
+            {
+                fInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.HasNorms(field), storePayloads, omitTFAndPositions);
+            }
+        }
+
+        private SegmentReader[] matchingSegmentReaders;
+        private int[] rawDocLengths;
+        private int[] rawDocLengths2;
+
+        private void SetMatchingSegmentReaders()
+        {
+            // If the i'th reader is a SegmentReader and has
+            // identical fieldName -> number mapping, then this
+            // array will be non-null at position i:
+            int numReaders = readers.Count;
+            matchingSegmentReaders = new SegmentReader[numReaders];
+
+            // If this reader is a SegmentReader, and all of its
+            // field name -> number mappings match the "merged"
+            // FieldInfos, then we can do a bulk copy of the
+            // stored fields:
+            for (int i = 0; i < numReaders; i++)
+            {
+                IndexReader reader = readers[i];
+                if (reader is SegmentReader)
+                {
+                    SegmentReader segmentReader = (SegmentReader) reader;
+                    bool same = true;
+                    FieldInfos segmentFieldInfos = segmentReader.FieldInfos();
+                    int numFieldInfos = segmentFieldInfos.Size();
+                    for (int j = 0; same && j < numFieldInfos; j++)
+                    {
+                        same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j));
+                    }
+                    if (same)
+                    {
+                        matchingSegmentReaders[i] = segmentReader;
+                    }
+                }
+            }
+
+            // Used for bulk-reading raw bytes for stored fields
+            rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+            rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
+        }
+
+        /// <summary> </summary>
+        /// <returns> The number of documents in all of the readers
+        /// </returns>
+        /// <throws> CorruptIndexException if the index is corrupt </throws>
+        /// <throws> IOException if there is a low-level IO error </throws>
+        private int MergeFields()
+        {
+
+            if (!mergeDocStores)
+            {
+                // When we are not merging by doc stores, their field
+                // name -> number mappings are the same.  So, we start
+                // with the fieldInfos of the last segment in this
+                // case, to keep that numbering.
+                SegmentReader sr = (SegmentReader) readers[readers.Count - 1];
+                fieldInfos = (FieldInfos) sr.core.fieldInfos.Clone();
+            }
+            else
+            {
+                fieldInfos = new FieldInfos(); // merge field names
+            }
+
+            foreach (IndexReader reader in readers)
+            {
+                if (reader is SegmentReader)
+                {
+                    SegmentReader segmentReader = (SegmentReader) reader;
+                    FieldInfos readerFieldInfos = segmentReader.FieldInfos();
+                    int numReaderFieldInfos = readerFieldInfos.Size();
+                    for (int j = 0; j < numReaderFieldInfos; j++)
+                    {
+                        FieldInfo fi = readerFieldInfos.FieldInfo(j);
+                        fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads, fi.omitTermFreqAndPositions);
+                    }
+                }
+                else
+                {
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false);
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true);
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false);
+                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.INDEXED), false, false, false, false, false);
+                    fieldInfos.Add(reader.GetFieldNames(FieldOption.UNINDEXED), false);
+                }
+            }
+            fieldInfos.Write(directory, segment + ".fnm");
+
+            int docCount = 0;
+
+            SetMatchingSegmentReaders();
+
+            if (mergeDocStores)
+            {
+                // merge field values
+                FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
+
+                try
+                {
+                    int idx = 0;
+                    foreach (IndexReader reader in readers)
+                    {
+                        SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
+                        FieldsReader matchingFieldsReader = null;
+                        if (matchingSegmentReader != null)
+                        {
+                            FieldsReader fieldsReader = matchingSegmentReader.GetFieldsReader();
+                            if (fieldsReader != null && fieldsReader.CanReadRawDocs())
+                            {
+                                matchingFieldsReader = fieldsReader;
+                            }
+                        }
+                        if (reader.HasDeletions)
+                        {
+                            docCount += CopyFieldsWithDeletions(fieldsWriter, reader, matchingFieldsReader);
+                        }
+                        else
+                        {
+                            docCount += CopyFieldsNoDeletions(fieldsWriter, reader, matchingFieldsReader);
+                        }
+                    }
+                }
+                finally
+                {
+                    fieldsWriter.Dispose();
+                }
+
+                System.String fileName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
+                long fdxFileLength = directory.FileLength(fileName);
+
+                if (4 + ((long) docCount) * 8 != fdxFileLength)
+                    // This is most likely a bug in Sun JRE 1.6.0_04/_05;
+                    // we detect that the bug has struck, here, and
+                    // throw an exception to prevent the corruption from
+                    // entering the index.  See LUCENE-1282 for
+                    // details.
+                    throw new System.SystemException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.FileExists(fileName) + "; now aborting this merge to prevent index corruption");
+            }
+            // If we are skipping the doc stores, that means there
+            // are no deletions in any of these segments, so we
+            // just sum numDocs() of each segment to get total docCount
+            else
+            {
+                foreach (IndexReader reader in readers)
+                {
+                    docCount += reader.NumDocs();
+                }
+            }
+
+            return docCount;
+        }
+
+        private int CopyFieldsWithDeletions(FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader)
+        {
+            int docCount = 0;
+            int maxDoc = reader.MaxDoc;
+            if (matchingFieldsReader != null)
+            {
+                // We can bulk-copy because the fieldInfos are "congruent"
+                for (int j = 0; j < maxDoc; )
+                {
+                    if (reader.IsDeleted(j))
+                    {
+                        // skip deleted docs
+                        ++j;
+                        continue;
+                    }
+                    // We can optimize this case (doing a bulk byte copy) since the field
+                    // numbers are identical
+                    int start = j, numDocs = 0;
+                    do
+                    {
+                        j++;
+                        numDocs++;
+                        if (j >= maxDoc)
+                            break;
+                        if (reader.IsDeleted(j))
+                        {
+                            j++;
+                            break;
+                        }
+                    }
+                    while (numDocs < MAX_RAW_MERGE_DOCS);
+
+                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
+                    fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs);
+                    docCount += numDocs;
+                    checkAbort.Work(300 * numDocs);
+                }
+            }
+            else
+            {
+                for (int j = 0; j < maxDoc; j++)
+                {
+                    if (reader.IsDeleted(j))
+                    {
+                        // skip deleted docs
+                        continue;
+                    }
+                    // NOTE: it's very important to first assign to doc then pass it to
+                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+                    Document doc = reader.Document(j);
+                    fieldsWriter.AddDocument(doc);
+                    docCount++;
+                    checkAbort.Work(300);
+                }
+            }
+            return docCount;
+        }
+
+        private int CopyFieldsNoDeletions(FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader)
+        {
+            int maxDoc = reader.MaxDoc;
+            int docCount = 0;
+            if (matchingFieldsReader != null)
+            {
+                // We can bulk-copy because the fieldInfos are "congruent"
+                while (docCount < maxDoc)
+                {
+                    int len = System.Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, docCount, len);
+                    fieldsWriter.AddRawDocuments(stream, rawDocLengths, len);
+                    docCount += len;
+                    checkAbort.Work(300 * len);
+                }
+            }
+            else
+            {
+                for (; docCount < maxDoc; docCount++)
+                {
+                    // NOTE: it's very important to first assign to doc then pass it to
+                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+                    Document doc = reader.Document(docCount);
+                    fieldsWriter.AddDocument(doc);
+                    checkAbort.Work(300);
+                }
+            }
+            return docCount;
+        }
+
+        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
+        /// <throws> IOException </throws>
+        private void MergeVectors()
+        {
+            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);
+
+            try
+            {
+                int idx = 0;
+                foreach (IndexReader reader in readers)
+                {
+                    SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
+                    TermVectorsReader matchingVectorsReader = null;
+                    if (matchingSegmentReader != null)
+                    {
+                        TermVectorsReader vectorsReader = matchingSegmentReader.GetTermVectorsReaderOrig();
+
+                        // If the TV* files are an older format then they cannot read raw docs:
+                        if (vectorsReader != null && vectorsReader.CanReadRawDocs())
+                        {
+                            matchingVectorsReader = vectorsReader;
+                        }
+                    }
+                    if (reader.HasDeletions)
+                    {
+                        CopyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader);
+                    }
+                    else
+                    {
+                        CopyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader);
+                    }
+                }
+            }
+            finally
+            {
+                termVectorsWriter.Dispose();
+            }
+
+            System.String fileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
+            long tvxSize = directory.FileLength(fileName);
+
+            if (4 + ((long) mergedDocs) * 16 != tvxSize)
+                // This is most likely a bug in Sun JRE 1.6.0_04/_05;
+                // we detect that the bug has struck, here, and
+                // throw an exception to prevent the corruption from
+                // entering the index.  See LUCENE-1282 for
+                // details.
+                throw new System.SystemException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + " file=" + fileName + " file exists?=" + directory.FileExists(fileName) + "; now aborting this merge to prevent index corruption");
+        }
+
+        private void CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
+        {
+            int maxDoc = reader.MaxDoc;
+            if (matchingVectorsReader != null)
+            {
+                // We can bulk-copy because the fieldInfos are "congruent"
+                for (int docNum = 0; docNum < maxDoc; )
+                {
+                    if (reader.IsDeleted(docNum))
+                    {
+                        // skip deleted docs
+                        ++docNum;
+                        continue;
+                    }
+                    // We can optimize this case (doing a bulk byte copy) since the field
+                    // numbers are identical
+                    int start = docNum, numDocs = 0;
+                    do
+                    {
+                        docNum++;
+                        numDocs++;
+                        if (docNum >= maxDoc)
+                            break;
+                        if (reader.IsDeleted(docNum))
+                        {
+                            docNum++;
+                            break;
+                        }
+                    }
+                    while (numDocs < MAX_RAW_MERGE_DOCS);
+
+                    matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+                    termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+                    checkAbort.Work(300 * numDocs);
+                }
+            }
+            else
+            {
+                for (int docNum = 0; docNum < maxDoc; docNum++)
+                {
+                    if (reader.IsDeleted(docNum))
+                    {
+                        // skip deleted docs
+                        continue;
+                    }
+
+                    // NOTE: it's very important to first assign to vectors then pass it to
+                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+                    ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
+                    termVectorsWriter.AddAllDocVectors(vectors);
+                    checkAbort.Work(300);
+                }
+            }
+        }
+
+        private void CopyVectorsNoDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
+        {
+            int maxDoc = reader.MaxDoc;
+            if (matchingVectorsReader != null)
+            {
+                // We can bulk-copy because the fieldInfos are "congruent"
+                int docCount = 0;
+                while (docCount < maxDoc)
+                {
+                    int len = System.Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+                    matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, docCount, len);
+                    termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
+                    docCount += len;
+                    checkAbort.Work(300 * len);
+                }
+            }
+            else
+            {
+                for (int docNum = 0; docNum < maxDoc; docNum++)
+                {
+                    // NOTE: it's very important to first assign to vectors then pass it to
+                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+                    ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
+                    termVectorsWriter.AddAllDocVectors(vectors);
+                    checkAbort.Work(300);
+                }
+            }
+        }
+
+        private SegmentMergeQueue queue = null;
+
+        private void MergeTerms()
+        {
+
+            SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval);
+
+            FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
+
+            try
+            {
+                queue = new SegmentMergeQueue(readers.Count);
+
+                MergeTermInfos(consumer);
+            }
+            finally
+            {
+                consumer.Finish();
+                if (queue != null)
+                    queue.Dispose();
+            }
+        }
+
+        internal bool omitTermFreqAndPositions;
+
+        private void MergeTermInfos(FormatPostingsFieldsConsumer consumer)
+        {
+            int base_Renamed = 0;
+            int readerCount = readers.Count;
+            for (int i = 0; i < readerCount; i++)
+            {
+                IndexReader reader = readers[i];
+                TermEnum termEnum = reader.Terms();
+                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
+                int[] docMap = smi.GetDocMap();
+                if (docMap != null)
+                {
+                    if (docMaps == null)
+                    {
+                        docMaps = new int[readerCount][];
+                        delCounts = new int[readerCount];
+                    }
+                    docMaps[i] = docMap;
+                    delCounts[i] = smi.reader.MaxDoc - smi.reader.NumDocs();
+                }
+
+                base_Renamed += reader.NumDocs();
+
+                System.Diagnostics.Debug.Assert(reader.NumDocs() == reader.MaxDoc - smi.delCount);
+
+                if (smi.Next())
+                    queue.Add(smi); // initialize queue
+                else
+                    smi.Dispose();
+            }
+
+            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];
+
+            System.String currentField = null;
+            FormatPostingsTermsConsumer termsConsumer = null;
+
+            while (queue.Size() > 0)
+            {
+                int matchSize = 0; // pop matching terms
+                match[matchSize++] = queue.Pop();
+                Term term = match[0].term;
+                SegmentMergeInfo top = queue.Top();
+
+                while (top != null && term.CompareTo(top.term) == 0)
+                {
+                    match[matchSize++] = queue.Pop();
+                    top = queue.Top();
+                }
+
+                if ((System.Object) currentField != (System.Object) term.Field)
+                {
+                    currentField = term.Field;
+                    if (termsConsumer != null)
+                        termsConsumer.Finish();
+                    FieldInfo fieldInfo = fieldInfos.FieldInfo(currentField);
+                    termsConsumer = consumer.AddField(fieldInfo);
+                    omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
+                }
+
+                int df = AppendPostings(termsConsumer, match, matchSize); // add new TermInfo
+
+                checkAbort.Work(df / 3.0);
+
+                while (matchSize > 0)
+                {
+                    SegmentMergeInfo smi = match[--matchSize];
+                    if (smi.Next())
+                        queue.Add(smi); // restore queue
+                    else
+                        smi.Dispose(); // done with a segment
+                }
+            }
+        }
+
+        private byte[] payloadBuffer;
+        private int[][] docMaps;
+        internal int[][] GetDocMaps()
+        {
+            return docMaps;
+        }
+        private int[] delCounts;
+        internal int[] GetDelCounts()
+        {
+            return delCounts;
+        }
+
+        /// <summary>Process postings from multiple segments all positioned on the
+        /// same term.  Writes out merged entries into freqOutput and
+        /// the proxOutput streams.
+        ///
+        /// </summary>
+        /// <param name="smis">array of segments
+        /// </param>
+        /// <param name="n">number of cells in the array actually occupied
+        /// </param>
+        /// <returns> number of documents across all segments where this term was found
+        /// </returns>
+        /// <throws> CorruptIndexException if the index is corrupt </throws>
+        /// <throws> IOException if there is a low-level IO error </throws>
+        private int AppendPostings(FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
+        {
+
+            FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(smis[0].term.Text);
+            int df = 0;
+            for (int i = 0; i < n; i++)
+            {
+                SegmentMergeInfo smi = smis[i];
+                TermPositions postings = smi.GetPositions();
+                System.Diagnostics.Debug.Assert(postings != null);
+                int base_Renamed = smi.base_Renamed;
+                int[] docMap = smi.GetDocMap();
+                postings.Seek(smi.termEnum);
+
+                while (postings.Next())
+                {
+                    df++;
+                    int doc = postings.Doc;
+                    if (docMap != null)
+                        doc = docMap[doc]; // map around deletions
+                    doc += base_Renamed; // convert to merged space
+
+                    int freq = postings.Freq;
+                    FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(doc, freq);
+
+                    if (!omitTermFreqAndPositions)
+                    {
+                        for (int j = 0; j < freq; j++)
+                        {
+                            int position = postings.NextPosition();
+                            int payloadLength = postings.PayloadLength;
+                            if (payloadLength > 0)
+                            {
+                                if (payloadBuffer == null || payloadBuffer.Length < payloadLength)
+                                    payloadBuffer = new byte[payloadLength];
+                                postings.GetPayload(payloadBuffer, 0);
+                            }
+                            posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength);
+                        }
+                        posConsumer.Finish();
+                    }
+                }
+            }
+            docConsumer.Finish();
+
+            return df;
+        }
+
+        private void MergeNorms()
+        {
+            byte[] normBuffer = null;
+            IndexOutput output = null;
+            try
+            {
+                int numFieldInfos = fieldInfos.Size();
+                for (int i = 0; i < numFieldInfos; i++)
+                {
+                    FieldInfo fi = fieldInfos.FieldInfo(i);
+                    if (fi.isIndexed && !fi.omitNorms)
+                    {
+                        if (output == null)
+                        {
+                            output = directory.CreateOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
+                            output.WriteBytes(NORMS_HEADER, NORMS_HEADER.Length);
+                        }
+                        foreach (IndexReader reader in readers)
+                        {
+                            int maxDoc = reader.MaxDoc;
+                            if (normBuffer == null || normBuffer.Length < maxDoc)
+                            {
+                                // the buffer is too small for the current segment
+                                normBuffer = new byte[maxDoc];
+                            }
+                            reader.Norms(fi.name, normBuffer, 0);
+                            if (!reader.HasDeletions)
+                            {
+                                // optimized case for segments without deleted docs
+                                output.WriteBytes(normBuffer, maxDoc);
+                            }
+                            else
+                            {
+                                // this segment has deleted docs, so we have to
+                                // check for every doc if it is deleted or not
+                                for (int k = 0; k < maxDoc; k++)
+                                {
+                                    if (!reader.IsDeleted(k))
+                                    {
+                                        output.WriteByte(normBuffer[k]);
+                                    }
+                                }
+                            }
+                            checkAbort.Work(maxDoc);
+                        }
+                    }
+                }
+            }
+            finally
+            {
+                if (output != null)
+                {
+                    output.Close();
+                }
+            }
+        }
+
+        internal class CheckAbort
+        {
+            private double workCount;
+            private MergePolicy.OneMerge merge;
+            private Directory dir;
+            public CheckAbort(MergePolicy.OneMerge merge, Directory dir)
+            {
+                this.merge = merge;
+                this.dir = dir;
+            }
+
+            /// <summary> Records the fact that roughly units amount of work
+            /// has been done since this method was last called.
+            /// When adding time-consuming code into SegmentMerger,
+            /// you should test different values for units to ensure
+            /// that the time in between calls to merge.checkAborted
+            /// is up to ~ 1 second.
+            /// </summary>
+            public virtual void Work(double units)
+            {
+                workCount += units;
+                if (workCount >= 10000.0)
+                {
+                    merge.CheckAborted(dir);
+                    workCount = 0;
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
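A note on the doc-ID arithmetic in MergeTermInfos/AppendPostings above: each segment contributes a base offset (the running NumDocs total of earlier segments), and a segment with deletions also supplies a docMap that compacts IDs around deleted docs. A toy illustration with made-up values, not taken from the commit:

    // Toy values: a segment had 4 docs and doc 1 was deleted.
    int[] docMap = { 0, -1, 1, 2 };        // old doc ID -> compacted ID (-1 = deleted)
    int baseDoc = 7;                       // docs already merged from earlier segments
    int oldDoc = 3;
    int newDoc = docMap[oldDoc] + baseDoc; // 2 + 7 = 9 in the merged ID space
    System.Console.WriteLine(newDoc);      // prints 9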