Fix loading YAML vocab

author: Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> 2017-07-04 11:37:33 +0300
committer: Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> 2017-07-04 11:37:33 +0300
commit: 73c84ac17d326aa27e4818415a3f058c8cf45411 (patch)
tree: 62d2aa4c75273666488e311048c23edcdf782aa9 /scripts
parent: f14b8f333937c6e0687a315c881115d56c9f5d81 (diff)
2 files changed, 9 insertions, 7 deletions
diff --git a/scripts/embeddings/export_embeddings.py b/scripts/embeddings/export_embeddings.py
index 1476e52c..1476e52c 100644..100755
--- a/scripts/embeddings/export_embeddings.py
+++ b/scripts/embeddings/export_embeddings.py
diff --git a/scripts/embeddings/process_word2vec.py b/scripts/embeddings/process_word2vec.py
index 865056de..5be9adf7 100755
--- a/scripts/embeddings/process_word2vec.py
+++ b/scripts/embeddings/process_word2vec.py
@@ -9,7 +9,6 @@ import argparse
 import subprocess
 import json
 
-
 WORD2VEC_OPTIONS = '-cbow 0 -window 5 -negative -hs 1 -sample 1e-3 -binary 0'
 
 UNK = '<unk>'
@@ -68,7 +67,8 @@ def main():
                 cout.write("{} {}".format(vocab[word], tail))
                 n += 1
             else:
-                print("  warning: no word '{}' in vocabulary, line {}".format(word, i+1))
+                print("  warning: no word '{}' in vocabulary, line {}".format(
+                    word, i + 1))
     print("  words: {}".format(n))
 
     print("Finished")
@@ -81,8 +81,9 @@ def replace_unks(l, voc):
 def load_yaml(lines):
     vocab = {}
     for line in lines:
-        word, idx = line.strip().split(': ')
-        vocab[word.strip('"')] = int(idx)
+        # values are always integers, so split on the rightmost ':' (the word itself may contain ':')
+        word, idx = line.strip().rsplit(':', 1)
+        vocab[word.strip('"')] = int(idx.strip())
     return vocab
 
 
@@ -93,10 +94,11 @@ embedding vectors with regard to the word vocabulary."""
   {0} -v vocab.yml -i corpus.txt -o output.txt -w path/to/word2vec
   {0} -v vocab.yml -i vectors.txt -o output.txt"""
     note = note.format(os.path.basename(__file__))
+
     parser = argparse.ArgumentParser(
-            formatter_class=argparse.RawDescriptionHelpFormatter,
-            description=desc,
-            epilog=note)
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=desc,
+        epilog=note)
     parser.add_argument("-i", "--input", help="embedding vectors or corpus for word2vec", required=True)
     parser.add_argument("-o", "--output", help="output embedding vectors", required=True)
     parser.add_argument("-v", "--vocab", help="path to vocabulary in JSON or YAML format", required=True)
author	Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk>	2017-07-04 11:37:33 +0300
committer	Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk>	2017-07-04 11:37:33 +0300
commit	73c84ac17d326aa27e4818415a3f058c8cf45411 (patch)
tree	62d2aa4c75273666488e311048c23edcdf782aa9 /scripts
parent	f14b8f333937c6e0687a315c881115d56c9f5d81 (diff)