diff options
Diffstat (limited to 'src/core/Analysis/CharArraySet.cs')
-rw-r--r-- | src/core/Analysis/CharArraySet.cs | 517 |
1 files changed, 517 insertions, 0 deletions
diff --git a/src/core/Analysis/CharArraySet.cs b/src/core/Analysis/CharArraySet.cs new file mode 100644 index 0000000..e7df0ba --- /dev/null +++ b/src/core/Analysis/CharArraySet.cs @@ -0,0 +1,517 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections; +using System.Linq; +using System.Collections.Generic; + +namespace Lucene.Net.Analysis +{ + /// <summary> A simple class that stores Strings as char[]'s in a + /// hash table. Note that this is not a general purpose + /// class. For example, it cannot remove items from the + /// set, nor does it resize its hash table to be smaller, + /// etc. It is designed to be quick to test if a char[] + /// is in the set without the necessity of converting it + /// to a String first. + /// <p/> + /// <em>Please note:</em> This class implements <see cref="System.Collections.Generic.ISet{T}"/> but + /// does not behave like it should in all cases. The generic type is + /// <see cref="System.Collections.Generic.ICollection{T}"/>, because you can add any object to it, + /// that has a string representation. The add methods will use + /// <see cref="object.ToString()"/> and store the result using a <see cref="char"/> + /// buffer. The same behaviour have the <see cref="Contains(object)"/> methods. + /// The <see cref="GetEnumerator"/> method returns an <see cref="string"/> IEnumerable. + /// For type safety also {@link #stringIterator()} is provided. + /// </summary> + // TODO: java uses wildcards, .net doesn't have this, easiest way is to + // make the entire class generic. Ultimately, though, since this + // works with strings, I can't think of a reason not to just declare + // this as an ISet<string>. + public class CharArraySet : ISet<string> + { + bool _ReadOnly = false; + const int INIT_SIZE = 8; + char[][] _Entries; + int _Count; + bool _IgnoreCase; + public static CharArraySet EMPTY_SET = UnmodifiableSet(new CharArraySet(0, false)); + + private void Init(int startSize, bool ignoreCase) + { + this._IgnoreCase = ignoreCase; + int size = INIT_SIZE; + while (startSize + (startSize >> 2) > size) + size <<= 1; + _Entries = new char[size][]; + } + + /// <summary>Create set with enough capacity to hold startSize + /// terms + /// </summary> + public CharArraySet(int startSize, bool ignoreCase) + { + Init(startSize, ignoreCase); + } + + public CharArraySet(IEnumerable<string> c, bool ignoreCase) + { + Init(c.Count(), ignoreCase); + AddItems(c); + } + + /// <summary>Create set from a Collection of char[] or String </summary> + public CharArraySet(IEnumerable<object> c, bool ignoreCase) + { + Init(c.Count(), ignoreCase); + AddItems(c); + } + + private void AddItems<T>(IEnumerable<T> items) + { + foreach(var item in items) + { + Add(item.ToString()); + } + } + + /// <summary>Create set from entries </summary> + private CharArraySet(char[][] entries, bool ignoreCase, int count) + { + this._Entries = entries; + this._IgnoreCase = ignoreCase; + this._Count = count; + } + + /// <summary>true if the <c>len</c> chars of <c>text</c> starting at <c>off</c> + /// are in the set + /// </summary> + public virtual bool Contains(char[] text, int off, int len) + { + return _Entries[GetSlot(text, off, len)] != null; + } + + public virtual bool Contains(string text) + { + return _Entries[GetSlot(text)] != null; + } + + + private int GetSlot(char[] text, int off, int len) + { + int code = GetHashCode(text, off, len); + int pos = code & (_Entries.Length - 1); + char[] text2 = _Entries[pos]; + if (text2 != null && !Equals(text, off, len, text2)) + { + int inc = ((code >> 8) + code) | 1; + do + { + code += inc; + pos = code & (_Entries.Length - 1); + text2 = _Entries[pos]; + } + while (text2 != null && !Equals(text, off, len, text2)); + } + return pos; + } + + /// <summary>Returns true if the String is in the set </summary> + private int GetSlot(string text) + { + int code = GetHashCode(text); + int pos = code & (_Entries.Length - 1); + char[] text2 = _Entries[pos]; + if (text2 != null && !Equals(text, text2)) + { + int inc = ((code >> 8) + code) | 1; + do + { + code += inc; + pos = code & (_Entries.Length - 1); + text2 = _Entries[pos]; + } + while (text2 != null && !Equals(text, text2)); + } + return pos; + } + + public bool Add(string text) + { + if (_ReadOnly) throw new NotSupportedException(); + return Add(text.ToCharArray()); + } + + /// <summary>Add this char[] directly to the set. + /// If ignoreCase is true for this Set, the text array will be directly modified. + /// The user should never modify this text array after calling this method. + /// </summary> + public bool Add(char[] text) + { + if (_ReadOnly) throw new NotSupportedException(); + + if (_IgnoreCase) + for (int i = 0; i < text.Length; i++) + text[i] = Char.ToLower(text[i]); + int slot = GetSlot(text, 0, text.Length); + if (_Entries[slot] != null) + return false; + _Entries[slot] = text; + _Count++; + + if (_Count + (_Count >> 2) > _Entries.Length) + { + Rehash(); + } + + return true; + } + + private bool Equals(char[] text1, int off, int len, char[] text2) + { + if (len != text2.Length) + return false; + if (_IgnoreCase) + { + for (int i = 0; i < len; i++) + { + if (char.ToLower(text1[off + i]) != text2[i]) + return false; + } + } + else + { + for (int i = 0; i < len; i++) + { + if (text1[off + i] != text2[i]) + return false; + } + } + return true; + } + + private bool Equals(string text1, char[] text2) + { + int len = text1.Length; + if (len != text2.Length) + return false; + if (_IgnoreCase) + { + for (int i = 0; i < len; i++) + { + if (char.ToLower(text1[i]) != text2[i]) + return false; + } + } + else + { + for (int i = 0; i < len; i++) + { + if (text1[i] != text2[i]) + return false; + } + } + return true; + } + + private void Rehash() + { + int newSize = 2 * _Entries.Length; + char[][] oldEntries = _Entries; + _Entries = new char[newSize][]; + + for (int i = 0; i < oldEntries.Length; i++) + { + char[] text = oldEntries[i]; + if (text != null) + { + // todo: could be faster... no need to compare strings on collision + _Entries[GetSlot(text, 0, text.Length)] = text; + } + } + } + + private int GetHashCode(char[] text, int offset, int len) + { + int code = 0; + int stop = offset + len; + if (_IgnoreCase) + { + for (int i = offset; i < stop; i++) + { + code = code * 31 + char.ToLower(text[i]); + } + } + else + { + for (int i = offset; i < stop; i++) + { + code = code * 31 + text[i]; + } + } + return code; + } + + private int GetHashCode(string text) + { + int code = 0; + int len = text.Length; + if (_IgnoreCase) + { + for (int i = 0; i < len; i++) + { + code = code * 31 + char.ToLower(text[i]); + } + } + else + { + for (int i = 0; i < len; i++) + { + code = code * 31 + text[i]; + } + } + return code; + } + + public int Count + { + get { return _Count; } + } + + public bool IsEmpty + { + get { return _Count == 0; } + } + + public bool Contains(object item) + { + var text = item as char[]; + return text != null ? Contains(text, 0, text.Length) : Contains(item.ToString()); + } + + public bool Add(object item) + { + return Add(item.ToString()); + } + + void ICollection<string>.Add(string item) + { + this.Add(item); + } + + /// <summary> + /// Returns an unmodifiable <see cref="CharArraySet"/>. This allows to provide + /// unmodifiable views of internal sets for "read-only" use + /// </summary> + /// <param name="set">A Set for which the unmodifiable set it returns.</param> + /// <returns>A new unmodifiable <see cref="CharArraySet"/></returns> + /// <throws>ArgumentNullException of the given set is <c>null</c></throws> + public static CharArraySet UnmodifiableSet(CharArraySet set) + { + if(set == null) + throw new ArgumentNullException("Given set is null"); + if (set == EMPTY_SET) + return EMPTY_SET; + if (set._ReadOnly) + return set; + + var newSet = new CharArraySet(set._Entries, set._IgnoreCase, set.Count) {IsReadOnly = true}; + return newSet; + } + + /// <summary> + /// returns a copy of the given set as a <see cref="CharArraySet"/>. If the given set + /// is a <see cref="CharArraySet"/> the ignoreCase property will be preserved. + /// </summary> + /// <param name="set">A set to copy</param> + /// <returns>a copy of the given set as a <see cref="CharArraySet"/>. If the given set + /// is a <see cref="CharArraySet"/> the ignoreCase property will be preserved.</returns> + public static CharArraySet Copy<T>(ISet<T> set) + { + if (set == null) + throw new ArgumentNullException("set", "Given set is null!"); + if (set == EMPTY_SET) + return EMPTY_SET; + bool ignoreCase = set is CharArraySet && ((CharArraySet)set)._IgnoreCase; + var arrSet = new CharArraySet(set.Count, ignoreCase); + arrSet.AddItems(set); + return arrSet; + } + + public void Clear() + { + throw new NotSupportedException("Remove not supported!"); + } + + public bool IsReadOnly + { + get { return _ReadOnly; } + private set { _ReadOnly = value; } + } + + /// <summary>Adds all of the elements in the specified collection to this collection </summary> + public void UnionWith(IEnumerable<string> other) + { + if (_ReadOnly) throw new NotSupportedException(); + + foreach (string s in other) + { + Add(s.ToCharArray()); + } + } + + /// <summary>Wrapper that calls UnionWith</summary> + public void AddAll(IEnumerable<string> coll) + { + UnionWith(coll); + } + + #region Unneeded methods + public void RemoveAll(ICollection<string> c) + { + throw new NotSupportedException(); + } + + public void RetainAll(ICollection<string> c) + { + throw new NotSupportedException(); + } + + void ICollection<string>.CopyTo(string[] array, int arrayIndex) + { + throw new NotSupportedException(); + } + + void ISet<string>.IntersectWith(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + void ISet<string>.ExceptWith(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + void ISet<string>.SymmetricExceptWith(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ISet<string>.IsSubsetOf(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ISet<string>.IsSupersetOf(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ISet<string>.IsProperSupersetOf(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ISet<string>.IsProperSubsetOf(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ISet<string>.Overlaps(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ISet<string>.SetEquals(IEnumerable<string> other) + { + throw new NotSupportedException(); + } + + bool ICollection<string>.Remove(string item) + { + throw new NotSupportedException(); + } + #endregion + + /// <summary> + /// The IEnumerator<String> for this set. Strings are constructed on the fly, + /// so use <c>nextCharArray</c> for more efficient access + /// </summary> + public class CharArraySetEnumerator : IEnumerator<string> + { + readonly CharArraySet _Creator; + int pos = -1; + char[] cur; + + protected internal CharArraySetEnumerator(CharArraySet creator) + { + _Creator = creator; + } + + public bool MoveNext() + { + cur = null; + pos++; + while (pos < _Creator._Entries.Length && (cur = _Creator._Entries[pos]) == null) + pos++; + return cur != null; + } + + /// <summary>do not modify the returned char[] </summary> + public char[] NextCharArray() + { + return cur; + } + + public string Current + { + get { return new string(NextCharArray()); } + } + + public void Dispose() + { + } + + object IEnumerator.Current + { + get { return new string(NextCharArray()); } + } + + public void Reset() + { + throw new NotImplementedException(); + } + } + + public IEnumerator<string> StringEnumerator() + { + return new CharArraySetEnumerator(this); + } + + public IEnumerator<string> GetEnumerator() + { + return new CharArraySetEnumerator(this); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } + +}
\ No newline at end of file |