diff options
author | Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> | 2017-07-04 11:37:33 +0300 |
---|---|---|
committer | Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> | 2017-07-04 11:37:33 +0300 |
commit | 73c84ac17d326aa27e4818415a3f058c8cf45411 (patch) | |
tree | 62d2aa4c75273666488e311048c23edcdf782aa9 /scripts | |
parent | f14b8f333937c6e0687a315c881115d56c9f5d81 (diff) |
Fix loading YAML vocab
Diffstat (limited to 'scripts')
-rwxr-xr-x[-rw-r--r--] | scripts/embeddings/export_embeddings.py | 0 | ||||
-rwxr-xr-x | scripts/embeddings/process_word2vec.py | 16 |
2 files changed, 9 insertions, 7 deletions
```diff
diff --git a/scripts/embeddings/export_embeddings.py b/scripts/embeddings/export_embeddings.py
index 1476e52c..1476e52c 100644..100755
--- a/scripts/embeddings/export_embeddings.py
+++ b/scripts/embeddings/export_embeddings.py
diff --git a/scripts/embeddings/process_word2vec.py b/scripts/embeddings/process_word2vec.py
index 865056de..5be9adf7 100755
--- a/scripts/embeddings/process_word2vec.py
+++ b/scripts/embeddings/process_word2vec.py
@@ -9,7 +9,6 @@ import argparse
 import subprocess
 import json
 
-
 WORD2VEC_OPTIONS = '-cbow 0 -window 5 -negative -hs 1 -sample 1e-3 -binary 0'
 UNK = '<unk>'
 
@@ -68,7 +67,8 @@ def main():
                 cout.write("{} {}".format(vocab[word], tail))
                 n += 1
             else:
-                print(" warning: no word '{}' in vocabulary, line {}".format(word, i+1))
+                print(" warning: no word '{}' in vocabulary, line {}".format(
+                    word, i + 1))
 
     print(" words: {}".format(n))
     print("Finished")
@@ -81,8 +81,9 @@ def replace_unks(l, voc):
 def load_yaml(lines):
     vocab = {}
     for line in lines:
-        word, idx = line.strip().split(': ')
-        vocab[word.strip('"')] = int(idx)
+        # all values are integers, so splitting by ':' from right should be safe
+        word, idx = line.strip().rsplit(':', 1)
+        vocab[word.strip('"')] = int(idx.strip())
     return vocab
 
 
@@ -93,10 +94,11 @@ embedding vectors with regard to the word vocabulary."""
 {0} -v vocab.yml -i corpus.txt -o output.txt -w path/to/word2vec
 {0} -v vocab.yml -i vectors.txt -o output.txt"""
     note = note.format(os.path.basename(__file__))
+
     parser = argparse.ArgumentParser(
-      formatter_class=argparse.RawDescriptionHelpFormatter,
-      description=desc,
-      epilog=note)
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=desc,
+        epilog=note)
     parser.add_argument("-i", "--input", help="embedding vectors or corpus for word2vec", required=True)
     parser.add_argument("-o", "--output", help="output embedding vectors", required=True)
     parser.add_argument("-v", "--vocab", help="path to vocabulary in JSON or YAML format", required=True)
```