diff options
Diffstat (limited to 'src/core/Util/BitVector.cs')
-rw-r--r-- | src/core/Util/BitVector.cs | 315 |
1 files changed, 315 insertions, 0 deletions
diff --git a/src/core/Util/BitVector.cs b/src/core/Util/BitVector.cs new file mode 100644 index 0000000..17b1212 --- /dev/null +++ b/src/core/Util/BitVector.cs @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using Lucene.Net.Support; +using Directory = Lucene.Net.Store.Directory; +using IndexInput = Lucene.Net.Store.IndexInput; +using IndexOutput = Lucene.Net.Store.IndexOutput; + +namespace Lucene.Net.Util +{ + + /// <summary>Optimized implementation of a vector of bits. This is more-or-less like + /// java.util.BitSet, but also includes the following: + /// <list type="bullet"> + /// <item>a count() method, which efficiently computes the number of one bits;</item> + /// <item>optimized read from and write to disk;</item> + /// <item>inlinable get() method;</item> + /// <item>store and load, as bit set or d-gaps, depending on sparseness;</item> + /// </list> + /// </summary> + public sealed class BitVector : ICloneable + { + + private byte[] bits; + private int size; + private int count; + + /// <summary>Constructs a vector capable of holding <c>n</c> bits. </summary> + public BitVector(int n) + { + size = n; + bits = new byte[(size >> 3) + 1]; + count = 0; + } + + internal BitVector(byte[] bits, int size) + { + this.bits = bits; + this.size = size; + count = -1; + } + + public System.Object Clone() + { + byte[] copyBits = new byte[bits.Length]; + Array.Copy(bits, 0, copyBits, 0, bits.Length); + BitVector clone = new BitVector(copyBits, size); + clone.count = count; + return clone; + } + + /// <summary>Sets the value of <c>bit</c> to one. </summary> + public void Set(int bit) + { + if (bit >= size) + { + throw new System. IndexOutOfRangeException("Index of bound " + bit); + } + bits[bit >> 3] |= (byte) (1 << (bit & 7)); + count = - 1; + } + + /// <summary>Sets the value of <c>bit</c> to true, and + /// returns true if bit was already set + /// </summary> + public bool GetAndSet(int bit) + { + if (bit >= size) + { + throw new System. IndexOutOfRangeException("Index of bound " + bit); + } + int pos = bit >> 3; + int v = bits[pos]; + int flag = 1 << (bit & 7); + if ((flag & v) != 0) + return true; + else + { + bits[pos] = (byte) (v | flag); + if (count != - 1) + count++; + return false; + } + } + + /// <summary>Sets the value of <c>bit</c> to zero. </summary> + public void Clear(int bit) + { + if (bit >= size) + { + throw new System.IndexOutOfRangeException("Index of bound " + bit); + } + bits[bit >> 3] &= (byte) (~ (1 << (bit & 7))); + count = - 1; + } + + /// <summary>Returns <c>true</c> if <c>bit</c> is one and + /// <c>false</c> if it is zero. + /// </summary> + public bool Get(int bit) + { + System.Diagnostics.Debug.Assert(bit >= 0 && bit < size, "bit " + bit + " is out of bounds 0.." +(size - 1)); + return (bits[bit >> 3] & (1 << (bit & 7))) != 0; + } + + /// <summary>Returns the number of bits in this vector. This is also one greater than + /// the number of the largest valid bit number. + /// </summary> + public int Size() + { + return size; + } + + /// <summary>Returns the total number of one bits in this vector. This is efficiently + /// computed and cached, so that, if the vector is not changed, no + /// recomputation is done for repeated calls. + /// </summary> + public int Count() + { + // if the vector has been modified + if (count == - 1) + { + int c = 0; + int end = bits.Length; + for (int i = 0; i < end; i++) + c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte + count = c; + } + return count; + } + + /// <summary> + /// For testing + /// </summary> + [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1024:UsePropertiesWhereAppropriate")] + public int GetRecomputedCount() + { + int c = 0; + int end = bits.Length; + for (int i = 0; i < end; i++) + c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte + return c; + } + + private static readonly byte[] BYTE_COUNTS = new byte[]{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + + + /// <summary>Writes this vector to the file <c>name</c> in Directory + /// <c>d</c>, in a format that can be read by the constructor + /// <see cref="BitVector(Directory, String)" />. + /// </summary> + public void Write(Directory d, System.String name) + { + IndexOutput output = d.CreateOutput(name); + try + { + if (IsSparse()) + { + WriteDgaps(output); // sparse bit-set more efficiently saved as d-gaps. + } + else + { + WriteBits(output); + } + } + finally + { + output.Close(); + } + } + + /// <summary>Write as a bit set </summary> + private void WriteBits(IndexOutput output) + { + output.WriteInt(Size()); // write size + output.WriteInt(Count()); // write count + output.WriteBytes(bits, bits.Length); + } + + /// <summary>Write as a d-gaps list </summary> + private void WriteDgaps(IndexOutput output) + { + output.WriteInt(- 1); // mark using d-gaps + output.WriteInt(Size()); // write size + output.WriteInt(Count()); // write count + int last = 0; + int n = Count(); + int m = bits.Length; + for (int i = 0; i < m && n > 0; i++) + { + if (bits[i] != 0) + { + output.WriteVInt(i - last); + output.WriteByte(bits[i]); + last = i; + n -= BYTE_COUNTS[bits[i] & 0xFF]; + } + } + } + + /// <summary>Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. </summary> + private bool IsSparse() + { + // note: order of comparisons below set to favor smaller values (no binary range search.) + // note: adding 4 because we start with ((int) -1) to indicate d-gaps format. + // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore + // multiplying count by (8+8) or (8+16) or (8+24) etc.: + // - first 8 for writing bits[i] (1 byte vs. 1 bit), and + // - second part for writing the byte-number d-gap as vint. + // note: factor is for read/write of byte-arrays being faster than vints. + int factor = 10; + if (bits.Length < (1 << 7)) + return factor * (4 + (8 + 8) * Count()) < Size(); + if (bits.Length < (1 << 14)) + return factor * (4 + (8 + 16) * Count()) < Size(); + if (bits.Length < (1 << 21)) + return factor * (4 + (8 + 24) * Count()) < Size(); + if (bits.Length < (1 << 28)) + return factor * (4 + (8 + 32) * Count()) < Size(); + return factor * (4 + (8 + 40) * Count()) < Size(); + } + + /// <summary>Constructs a bit vector from the file <c>name</c> in Directory + /// <c>d</c>, as written by the <see cref="Write" /> method. + /// </summary> + public BitVector(Directory d, System.String name) + { + IndexInput input = d.OpenInput(name); + try + { + size = input.ReadInt(); // read size + if (size == - 1) + { + ReadDgaps(input); + } + else + { + ReadBits(input); + } + } + finally + { + input.Close(); + } + } + + /// <summary>Read as a bit set </summary> + private void ReadBits(IndexInput input) + { + count = input.ReadInt(); // read count + bits = new byte[(size >> 3) + 1]; // allocate bits + input.ReadBytes(bits, 0, bits.Length); + } + + /// <summary>read as a d-gaps list </summary> + private void ReadDgaps(IndexInput input) + { + size = input.ReadInt(); // (re)read size + count = input.ReadInt(); // read count + bits = new byte[(size >> 3) + 1]; // allocate bits + int last = 0; + int n = Count(); + while (n > 0) + { + last += input.ReadVInt(); + bits[last] = input.ReadByte(); + n -= BYTE_COUNTS[bits[last] & 0xFF]; + } + } + + /// <summary> Retrieve a subset of this BitVector. + /// + /// </summary> + /// <param name="start">starting index, inclusive + /// </param> + /// <param name="end">ending index, exclusive + /// </param> + /// <returns> subset + /// </returns> + public BitVector Subset(int start, int end) + { + if (start < 0 || end > Size() || end < start) + throw new System.IndexOutOfRangeException(); + // Special case -- return empty vector is start == end + if (end == start) + return new BitVector(0); + byte[] bits = new byte[(Number.URShift((end - start - 1), 3)) + 1]; + int s = Number.URShift(start, 3); + for (int i = 0; i < bits.Length; i++) + { + int cur = 0xFF & this.bits[i + s]; + int next = i + s + 1 >= this.bits.Length?0:0xFF & this.bits[i + s + 1]; + bits[i] = (byte) ((Number.URShift(cur, (start & 7))) | ((next << (8 - (start & 7))))); + } + int bitsToClear = (bits.Length * 8 - (end - start)) % 8; + bits[bits.Length - 1] &= (byte) (~ (0xFF << (8 - bitsToClear))); + return new BitVector(bits, end - start); + } + } +}
\ No newline at end of file |