diff options
Diffstat (limited to 'python/vocab.py')
-rwxr-xr-x | python/vocab.py | 33 |
1 files changed, 33 insertions, 0 deletions
# python/vocab.py
import heapq
import operator
import sys


class Vocab(object):
    """Mapping between words and dense integer indices.

    Attributes are plain containers that callers may read directly:

    * ``words``      -- list mapping index -> word.
    * ``word_index`` -- dict mapping word -> index.
    * ``unk``        -- index assigned to the unknown-word token when the
      vocabulary was created (it is inserted first, so this is 0).
    """

    def __init__(self, words=(), unk="<unk>"):
        """Create a vocabulary containing ``unk`` followed by ``words``.

        Args:
            words: Iterable of words inserted, in order, after ``unk``.
            unk: Token used to represent out-of-vocabulary words.
        """
        self.words = []
        self.word_index = {}

        # The unknown token goes in first so that lookups always have a
        # valid fallback index.
        self.insert_word(unk)
        self.unk = self.word_index[unk]
        for word in words:
            self.insert_word(word)

    def from_counts(self, counts, size, unk="<unk>"):
        """Add the most frequent words of ``counts`` until ``size`` is reached.

        Candidates are taken in order of decreasing count; ties are broken
        by ascending word order. Words already in the vocabulary are skipped
        but still consumed from the candidate queue.

        Args:
            counts: Mapping from word to its frequency.
            size: Target total vocabulary size (including entries that are
                already present, such as the unknown token).
            unk: Unused; kept so existing callers remain compatible.

        Returns:
            The number of candidate words taken from ``counts``. This
            includes words that were skipped because they were already in
            the vocabulary, so it can exceed the number of new entries.
        """
        # Negate the counts so the min-heap yields the most frequent first.
        # ``items()`` (rather than the Python 2-only ``iteritems()``) keeps
        # this working on both Python 2 and Python 3.
        q = [(-count, word) for (word, count) in counts.items()]
        heapq.heapify(q)
        inserted = 0
        while len(self.words) < size and q:
            _, word = heapq.heappop(q)
            inserted += 1
            if word not in self.word_index:
                self.insert_word(word)
        return inserted

    def insert_word(self, word):
        """Append ``word`` to the vocabulary and record its index.

        NOTE: no duplicate check is done here. Inserting a word that is
        already present appends it again and repoints ``word_index[word]``
        at the new position.
        """
        i = len(self.words)
        self.words.append(word)
        self.word_index[word] = i

    def lookup_word(self, word):
        """Return the index of ``word``, or ``self.unk`` if it is unknown."""
        return self.word_index.get(word, self.unk)