1 files changed, 290 insertions, 0 deletions
diff --git a/src/core/Index/TermVectorsTermsWriterPerField.cs b/src/core/Index/TermVectorsTermsWriterPerField.cs
new file mode 100644
index 0000000..e6bb827
--- /dev/null
+++ b/src/core/Index/TermVectorsTermsWriterPerField.cs
@@ -0,0 +1,290 @@
+/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Documents;
+using IndexOutput = Lucene.Net.Store.IndexOutput;
+using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
+
+namespace Lucene.Net.Index
+{
+	
+	sealed class TermVectorsTermsWriterPerField:TermsHashConsumerPerField
+	{
+		
+		internal TermVectorsTermsWriterPerThread perThread;	// constructor argument; source of doc, utf8Results, vectorSliceReader and termsHashPerThread
+		internal TermsHashPerField termsHashPerField;	// constructor argument; owns the postings that Finish() sorts and writes
+		internal TermVectorsTermsWriter termsWriter;	// copied from perThread.termsWriter in the constructor
+		internal FieldInfo fieldInfo;	// constructor argument
+		internal DocumentsWriter.DocState docState;	// copied from termsHashPerField.docState in the constructor
+		internal FieldInvertState fieldState;	// copied from termsHashPerField.fieldState in the constructor
+		// Flags below are reset and recomputed on every call to Start(IFieldable[], int):
+		internal bool doVectors;	// true if any inspected field instance is indexed and stores term vectors
+		internal bool doVectorPositions;	// true if any such instance stores positions with its term vector
+		internal bool doVectorOffsets;	// true if any such instance stores offsets with its term vector
+		// Bookkeeping kept across calls:
+		internal int maxNumPostings;	// largest numPostings seen by Finish(); read and zeroed by ShrinkHash()
+		internal IOffsetAttribute offsetAttribute = null;	// assigned by Start(IFieldable); null whenever doVectorOffsets is false there
+		
+		public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo)
+		{
+			this.fieldInfo = fieldInfo;	// the three arguments are stored unchanged
+			this.perThread = perThread;
+			this.termsHashPerField = termsHashPerField;
+			this.termsWriter = perThread.termsWriter;	// remaining fields are cached from the arguments
+			this.docState = termsHashPerField.docState;
+			this.fieldState = termsHashPerField.fieldState;
+		}
+		
+		internal override int GetStreamCount()	// reports how many streams this consumer uses
+		{
+			return 2;	// NewTerm/AddTerm in this class write positions to stream 0 and offsets to stream 1
+		}
+		
+		/// <summary>Inspects the first <c>count</c> entries of <c>fields</c> and
+		/// decides whether term vectors are wanted for this field in the current
+		/// document; if so, makes sure perThread.doc is set up for that document.</summary>
+		/// <returns>true when at least one entry is indexed with term vectors stored.</returns>
+		internal override bool Start(IFieldable[] fields, int count)
+		{
+			doVectors = doVectorPositions = doVectorOffsets = false;
+
+			// Positions/offsets are enabled if ANY vectored entry asks for them.
+			for (int idx = 0; idx < count; idx++)
+			{
+				IFieldable candidate = fields[idx];
+				if (!candidate.IsIndexed || !candidate.IsTermVectorStored)
+					continue;
+				doVectors = true;
+				doVectorPositions |= candidate.IsStorePositionWithTermVector;
+				doVectorOffsets |= candidate.IsStoreOffsetWithTermVector;
+			}
+
+			// TODO (carried over): only if needed for performance: perThread.postingsCount = 0;
+			if (!doVectors)
+				return false;
+
+			if (perThread.doc == null)
+			{
+				// No per-document state yet: obtain one and stamp it with the current docID.
+				perThread.doc = termsWriter.GetPerDoc();
+				perThread.doc.docID = docState.docID;
+				System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
+				System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
+				System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
+			}
+			System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);
+
+			// Leftover postings are only expected if the previous doc hit a
+			// non-aborting exception while writing vectors in this field.
+			if (termsHashPerField.numPostings != 0)
+			{
+				termsHashPerField.Reset();
+				perThread.termsHashPerThread.Reset(false);
+			}
+
+			return true;
+		}
+		
+		public void  Abort()	// no-op: this class takes no action on abort
+		{
+		}
+		
+		/// <summary>Called once per field per document if term vectors
+		/// are enabled: writes the field's sorted terms to the per-document
+		/// output perThread.doc.perDocTvf (described by the original author as a
+		/// RAMOutputStream that is later flushed to the real term vectors files).
+		/// </summary>
+		internal override void  Finish()
+		{
+			System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));
+
+			int termCount = termsHashPerField.numPostings;
+			System.Diagnostics.Debug.Assert(termCount >= 0);
+
+			// Nothing to write unless vectors are on and at least one term was hashed.
+			if (!doVectors || termCount == 0)
+				return ;
+
+			// Track the largest posting count seen; ShrinkHash() consumes it.
+			maxNumPostings = System.Math.Max(maxNumPostings, termCount);
+
+			IndexOutput tvf = perThread.doc.perDocTvf;
+
+			// This is called once, after inverting all occurrences of a given
+			// field in the doc.  At this point we flush our hash into the DocWriter.
+			System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
+			System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));
+
+			perThread.doc.AddField(termsHashPerField.fieldInfo.number);
+
+			RawPostingList[] sorted = termsHashPerField.SortPostings();
+
+			// Field header: number of terms, then one byte of flags.
+			tvf.WriteVInt(termCount);
+			byte flags = (byte) (0x0);
+			if (doVectorPositions)
+				flags |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
+			if (doVectorOffsets)
+				flags |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
+			tvf.WriteByte(flags);
+
+			int slot = 0;	// index of the UTF-8 scratch result used for the current term
+			int prevLength = 0;	// byte length of the term written just before
+
+			ByteSliceReader sliceReader = perThread.vectorSliceReader;
+			char[][] charBlocks = perThread.termsHashPerThread.charPool.buffers;
+			for (int t = 0; t < termCount; t++)
+			{
+				TermVectorsTermsWriter.PostingList entry = (TermVectorsTermsWriter.PostingList) sorted[t];
+				int freq = entry.freq;
+
+				// textStart selects a char block with its high bits and an offset in it with its low bits.
+				char[] block = charBlocks[entry.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+				int blockOffset = entry.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+				// We swap between two encoders to save copying
+				// the last term's byte array
+				UnicodeUtil.UTF8Result current = perThread.utf8Results[slot];
+
+				// TODO: we could do this incrementally
+				UnicodeUtil.UTF16toUTF8(block, blockOffset, current);
+				int length = current.length;
+
+				// TODO: UTF16toUTF8 could tell us this prefix
+				// Count the leading bytes shared with the previous term.
+				int shared = 0;
+				if (t > 0)
+				{
+					byte[] prevBytes = perThread.utf8Results[1 - slot].result;
+					byte[] curBytes = perThread.utf8Results[slot].result;
+					int limit = System.Math.Min(prevLength, length);
+					while (shared < limit && prevBytes[shared] == curBytes[shared])
+						shared++;
+				}
+
+				// Flip scratch slots so this term becomes the "previous" one next time round.
+				slot = 1 - slot;
+				prevLength = length;
+
+				// Term record: shared-prefix length, suffix length, suffix bytes, frequency.
+				int tail = length - shared;
+				tvf.WriteVInt(shared);
+				tvf.WriteVInt(tail);
+				tvf.WriteBytes(current.result, shared, tail);
+				tvf.WriteVInt(freq);
+
+				if (doVectorPositions)
+				{
+					// Stream 0 holds what NewTerm/AddTerm wrote for positions.
+					termsHashPerField.InitReader(sliceReader, entry, 0);
+					sliceReader.WriteTo(tvf);
+				}
+
+				if (doVectorOffsets)
+				{
+					// Stream 1 holds what NewTerm/AddTerm wrote for offsets.
+					termsHashPerField.InitReader(sliceReader, entry, 1);
+					sliceReader.WriteTo(tvf);
+				}
+			}
+
+			termsHashPerField.Reset();
+
+			// NOTE: we clear, per-field, at the thread level,
+			// because term vectors fully write themselves on each
+			// field; this saves RAM (eg if large doc has two large
+			// fields w/ term vectors on) because we recycle/reuse
+			// all RAM after each field:
+			perThread.termsHashPerThread.Reset(false);
+		}
+		
+		internal void  ShrinkHash()	// forwards the recorded high-water mark to termsHashPerField.ShrinkHash
+		{
+			termsHashPerField.ShrinkHash(maxNumPostings);	// maxNumPostings = largest numPostings seen by Finish() since the last call
+			maxNumPostings = 0;	// restart tracking; only reached if the call above returns normally
+		}
+		
+		/// <summary>Per-instance hook (the argument <c>f</c> is not read here):
+		/// caches the offset attribute from fieldState.attributeSource when offsets
+		/// are being stored, and clears the cached reference otherwise.</summary>
+		internal override void  Start(IFieldable f)
+		{
+			// AddAttribute is only invoked when offsets are wanted, so the
+			// attribute source is left untouched for fields without offsets.
+			offsetAttribute = doVectorOffsets
+				? fieldState.attributeSource.AddAttribute<IOffsetAttribute>()
+				: null;
+		}
+		
+		/// <summary>Records a term's first posting: sets its frequency to 1 and,
+		/// depending on the flags, writes the start offset and token length to
+		/// stream 1 and the current position to stream 0 (no deltas yet).</summary>
+		internal override void  NewTerm(RawPostingList p0)
+		{
+			System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));
+			TermVectorsTermsWriter.PostingList entry = (TermVectorsTermsWriter.PostingList) p0;
+			entry.freq = 1;
+
+			if (doVectorOffsets)
+			{
+				// Token offsets are shifted by fieldState.offset before being stored.
+				int tokenStart = fieldState.offset + offsetAttribute.StartOffset;
+				int tokenEnd = fieldState.offset + offsetAttribute.EndOffset;
+				int tokenLength = tokenEnd - tokenStart;
+				termsHashPerField.WriteVInt(1, tokenStart);
+				termsHashPerField.WriteVInt(1, tokenLength);
+				entry.lastOffset = tokenEnd;
+			}
+			if (doVectorPositions)
+			{
+				termsHashPerField.WriteVInt(0, fieldState.position);
+				entry.lastPosition = fieldState.position;
+			}
+		}
+		
+		/// <summary>Records a further posting for an existing entry: bumps its
+		/// frequency and appends delta-encoded data, i.e. start offset relative to the
+		/// previous end offset (stream 1) and position relative to the previous one (stream 0).</summary>
+		internal override void  AddTerm(RawPostingList p0)
+		{
+			System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));
+			TermVectorsTermsWriter.PostingList entry = (TermVectorsTermsWriter.PostingList) p0;
+			entry.freq++;
+
+			if (doVectorOffsets)
+			{
+				int tokenStart = fieldState.offset + offsetAttribute.StartOffset;
+				int tokenEnd = fieldState.offset + offsetAttribute.EndOffset;
+				termsHashPerField.WriteVInt(1, tokenStart - entry.lastOffset);
+				termsHashPerField.WriteVInt(1, tokenEnd - tokenStart);
+				entry.lastOffset = tokenEnd;
+			}
+
+			if (doVectorPositions)
+			{
+				termsHashPerField.WriteVInt(0, fieldState.position - entry.lastPosition);
+				entry.lastPosition = fieldState.position;
+			}
+		}
+		
+		internal override void  SkippingLongTerm()	// no-op override; presumably called when an over-long term is skipped (inferred from the name, confirm in base class)
+		{
+		}
+	}
+}
+\ No newline at end of file