Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> 2017-07-04 11:37:33 +0300
committer: Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> 2017-07-04 11:37:33 +0300
commit: 73c84ac17d326aa27e4818415a3f058c8cf45411 (patch)
tree: 62d2aa4c75273666488e311048c23edcdf782aa9 /scripts
parent: f14b8f333937c6e0687a315c881115d56c9f5d81 (diff)
Fix loading YAML vocab
Diffstat (limited to 'scripts')
-rwxr-xr-x [-rw-r--r--] scripts/embeddings/export_embeddings.py | 0
-rwxr-xr-x scripts/embeddings/process_word2vec.py | 16
2 files changed, 9 insertions, 7 deletions
diff --git a/scripts/embeddings/export_embeddings.py b/scripts/embeddings/export_embeddings.py
index 1476e52c..1476e52c 100644..100755
--- a/scripts/embeddings/export_embeddings.py
+++ b/scripts/embeddings/export_embeddings.py
diff --git a/scripts/embeddings/process_word2vec.py b/scripts/embeddings/process_word2vec.py
index 865056de..5be9adf7 100755
--- a/scripts/embeddings/process_word2vec.py
+++ b/scripts/embeddings/process_word2vec.py
@@ -9,7 +9,6 @@ import argparse
import subprocess
import json
-
WORD2VEC_OPTIONS = '-cbow 0 -window 5 -negative -hs 1 -sample 1e-3 -binary 0'
UNK = '<unk>'
@@ -68,7 +67,8 @@ def main():
cout.write("{} {}".format(vocab[word], tail))
n += 1
else:
- print(" warning: no word '{}' in vocabulary, line {}".format(word, i+1))
+ print(" warning: no word '{}' in vocabulary, line {}".format(
+ word, i + 1))
print(" words: {}".format(n))
print("Finished")
@@ -81,8 +81,9 @@ def replace_unks(l, voc):
def load_yaml(lines):
vocab = {}
for line in lines:
- word, idx = line.strip().split(': ')
- vocab[word.strip('"')] = int(idx)
+ # all values are integers, so splitting by ':' from right should be safe
+ word, idx = line.strip().rsplit(':', 1)
+ vocab[word.strip('"')] = int(idx.strip())
return vocab
@@ -93,10 +94,11 @@ embedding vectors with regard to the word vocabulary."""
{0} -v vocab.yml -i corpus.txt -o output.txt -w path/to/word2vec
{0} -v vocab.yml -i vectors.txt -o output.txt"""
note = note.format(os.path.basename(__file__))
+
parser = argparse.ArgumentParser(
- formatter_class=argparse.RawDescriptionHelpFormatter,
- description=desc,
- epilog=note)
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=desc,
+ epilog=note)
parser.add_argument("-i", "--input", help="embedding vectors or corpus for word2vec", required=True)
parser.add_argument("-o", "--output", help="output embedding vectors", required=True)
parser.add_argument("-v", "--vocab", help="path to vocabulary in JSON or YAML format", required=True)