Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/nplm.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'python/vocab.py')
-rwxr-xr-x python/vocab.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/python/vocab.py b/python/vocab.py
new file mode 100755
index 0000000..658051f
--- /dev/null
+++ b/python/vocab.py
@@ -0,0 +1,68 @@
+import heapq
+import operator
+import sys
+
+class Vocab(object):
+    """Bidirectional mapping between words and dense integer ids.
+
+    ``words`` maps id -> word and ``word_index`` maps word -> id.  The
+    unknown-word token is inserted first, so it has id 0, and
+    ``lookup_word`` falls back to that id for words not in the vocabulary.
+    """
+
+    def __init__(self, words=(), unk="<unk>"):
+        """Create a vocabulary holding ``unk`` followed by ``words``.
+
+        words -- iterable of words to insert, in order, after ``unk``
+        unk   -- token whose id is returned for unknown words
+        """
+        self.words = []
+        self.word_index = {}
+
+        self.insert_word(unk)
+        self.unk = self.word_index[unk]
+        for word in words:
+            self.insert_word(word)
+
+    def from_counts(self, counts, size, unk="<unk>"):
+        """Grow the vocabulary with the most frequent words in ``counts``.
+
+        Words are taken in order of decreasing count (ties broken by the
+        words' own ordering) until the vocabulary holds ``size`` words or
+        ``counts`` is exhausted.  Words already present are skipped.
+
+        counts -- mapping from word to its count
+        size   -- target total vocabulary size, counting existing words
+        unk    -- unused; kept so existing callers keep working
+
+        Returns the number of words actually added.
+        """
+        # Negate the counts so the min-heap yields the largest count first.
+        # items() rather than iteritems() so this runs on Python 2 and 3.
+        q = [(-count, word) for (word, count) in counts.items()]
+        heapq.heapify(q)
+        inserted = 0
+        while len(self.words) < size and q:
+            _, word = heapq.heappop(q)
+            if word not in self.word_index:
+                self.insert_word(word)
+                # Count only real insertions, not every word popped.
+                inserted += 1
+        return inserted
+
+    def insert_word(self, word):
+        """Add ``word`` if it is new and return its id.
+
+        Re-inserting a known word is a no-op, so ``words`` and
+        ``word_index`` never disagree about a word's id.
+        """
+        i = self.word_index.get(word)
+        if i is None:
+            i = len(self.words)
+            self.words.append(word)
+            self.word_index[word] = i
+        return i
+
+    def lookup_word(self, word):
+        """Return the id of ``word``, or the ``unk`` id if it is unknown."""
+        return self.word_index.get(word, self.unk)