github.com/moses-smt/mosesdecoder.git
author     Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 10:58:03 +0300
committer  Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 10:58:03 +0300
commit     0ffe79579eca183161d86ad38bb34ba8bab3c855 (patch)
tree       426ef93e43acef1ff9ffa1ad5e0c9efeb3142a8d /scripts
parent     f1ed14eb33c86611a9d9355caf6439a087d71d03 (diff)
Fix some python lint.
I used mainly pocketlint, a very good Python linter, but also Syntastic, a vim plugin. I didn't get anywhere near fixing all of Syntastic's complaints, though. Once I've cleaned up all (or at least most) of the Python lint, we can start running regular automated lint checks and keep the code clean.
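
[Editor's note, not part of the commit: an automated lint check like the one proposed here could be wired up in many ways. The sketch below is only an illustration, assuming flake8 is installed and the "scripts" path; it is not what this commit sets up.]

# Hypothetical lint gate (illustration only): run a PEP 8 checker over the
# scripts tree and exit non-zero if any lint is reported.
import subprocess
import sys


def run_lint_check(path="scripts"):
    # flake8 is an assumption; pocketlint or pylint would serve the same role.
    result = subprocess.run(
        ["flake8", path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True)
    if result.returncode != 0:
        sys.stderr.write(result.stdout)
    return result.returncode


if __name__ == "__main__":
    sys.exit(run_lint_check())
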
Diffstat (limited to 'scripts')
 -rw-r--r--  scripts/other/gacha_filter.py                           55
 -rwxr-xr-x  scripts/training/bilingual-lm/averageNullEmbedding.py   38
 -rwxr-xr-x  scripts/training/bilingual-lm/extract.py               185
 -rwxr-xr-x  scripts/training/bilingual-lm/extract_test.py          143
 -rwxr-xr-x  scripts/training/bilingual-lm/extract_training.py      289
 -rwxr-xr-x  scripts/training/bilingual-lm/reduce_ngrams.py          10
 -rwxr-xr-x  scripts/training/bilingual-lm/test_nplm.py             112
 -rwxr-xr-x  scripts/training/bilingual-lm/train_nplm.py            195
 -rwxr-xr-x  scripts/training/create_count_tables.py                 91
 -rwxr-xr-x  scripts/training/flexibility_score.py                   84
10 files changed, 675 insertions(+), 527 deletions(-)
diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py
index 1ec1f4616..0deb45761 100644
--- a/scripts/other/gacha_filter.py
+++ b/scripts/other/gacha_filter.py
@@ -2,12 +2,12 @@
"""
The Gacha filter cleans out sentence pairs that have global character mean
-lower than a certain threshold.
-
-Use this cleaner to produce low quantity of high quality sentence pairs.
-It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
-WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
+lower than a certain threshold.
+
+Use this cleaner to produce low quantity of high quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
This is inspired by the global character mean that is used in the Gale-Church
@@ -27,8 +27,8 @@ USAGE:
$ python3 gacha_filter.py train.en train.de
-Outputs to STDOUT a separated lines of the source and target sentence pairs.
-You can simply cut the file after that.
+Outputs to STDOUT a separated lines of the source and target sentence pairs.
+You can simply cut the file after that.
$ python3 gacha_filter.py train.en train.de > train.en-de
$ cut -f1 train.en-de > train.clean.en
@@ -37,21 +37,27 @@ You can simply cut the file after that.
You can also allow lower threshold to yield more lines:
$ python3 gacha_filter.py train.en train.de 0.05
-
+
Default threshold is set to 0.2.
"""
-import io, subprocess
+import io
+import subprocess
red = '\033[01;31m'
native = '\033[m'
+
def err_msg(txt):
- return red+txt+native
+ return red + txt + native
+
def num_char(filename):
- return float(subprocess.Popen(["wc", "-m", filename],
- stdout=subprocess.PIPE).stdout.read().split()[0])
+ return float(
+ subprocess.Popen(
+ ["wc", "-m", filename],
+ stdout=subprocess.PIPE).stdout.read().split()[0])
+
def gacha_mean(sourcefile, targetfile):
"""
@@ -60,35 +66,40 @@ def gacha_mean(sourcefile, targetfile):
"""
sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
c = num_char(sourcefile) / num_char(targetfile)
- sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
+ sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
sys.stderr.write(err_msg('Filtering starts ...\n'))
return c
+
+def io_open(path):
+ """Open text file at `path` as a read-only, with UTF-8 encoding."""
+ return io.open(path, 'r', encoding='utf8')
+
+
def main(sourcefile, targetfile, threshold=0.2):
# Calculates Gacha mean.
c = gacha_mean(sourcefile, targetfile)
# Calculates lower and upperbound for filtering
threshold = float(threshold)
- lowerbound = (1-threshold) * c
- upperbound = (1+threshold) * c
-
+ lowerbound = (1 - threshold) * c
+ upperbound = (1 + threshold) * c
+
# Start filtering sentences.
- with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
- io.open(targetfile, 'r', encoding='utf8') as trgfin:
+ with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
for s, t in zip(srcfin, trgfin):
if lowerbound < len(s) / float(len(t)) < upperbound:
- print(u"{}\t{}".format(s.strip(),t.strip()))
+ print(u"{}\t{}".format(s.strip(), t.strip()))
if __name__ == '__main__':
import sys
- if len(sys.argv) not in range(3,5):
+ if len(sys.argv) not in range(3, 5):
usage_msg = err_msg('Usage: python3 %s srcfile trgfile (threshold)\n'
% sys.argv[0])
-
+
example_msg = err_msg('Example: python3 %s ~/Europarl.de-en.de '
'~/Europarl.de-en.en 0.4\n' % sys.argv[0])
sys.stderr.write(usage_msg)
sys.stderr.write(example_msg)
sys.exit(1)
-
+
main(*sys.argv[1:])
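
[Editor's note, not part of the commit: the filtering rule reformatted above keeps a sentence pair only when its source/target character-length ratio stays within +/- threshold of the corpus-level "Gacha mean". A minimal sketch with made-up data:]

# Illustration only (toy data, hypothetical corpus ratio).
def keep_pair(src, trg, corpus_ratio, threshold=0.2):
    lowerbound = (1 - threshold) * corpus_ratio
    upperbound = (1 + threshold) * corpus_ratio
    return lowerbound < len(src) / float(len(trg)) < upperbound


# Suppose the corpus-level character ratio came out as 0.8.
pairs = [("the cat sat", "die Katze sass"), ("hi", "auf Wiedersehen")]
for s, t in pairs:
    print(s, "|||", t, "->", keep_pair(s, t, corpus_ratio=0.8))
# -> True for the first pair (ratio ~0.79), False for the second (ratio ~0.13).
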
diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py
index ced33e3df..aca03aaae 100755
--- a/scripts/training/bilingual-lm/averageNullEmbedding.py
+++ b/scripts/training/bilingual-lm/averageNullEmbedding.py
@@ -3,36 +3,50 @@ import sys
import numpy
import argparse
-parser = argparse.ArgumentParser(description='Set input embedding of <null> token to weighted average of all input embeddings')
-parser.add_argument("-p", "--nplm-python-path", type=str, dest="nplm_python_path", default='/mnt/gna0/rsennrich/tools/nplm/python')
-parser.add_argument("-i", "--input-model", type=str, dest="input_model", required=True)
-parser.add_argument("-o", "--output-model", type=str, dest="output_model", required=True)
-parser.add_argument("-n", "--null-token-index", type=int, dest="null_idx", default=-1)
-parser.add_argument("-t", "--training-ngrams", type=str, dest="training_ngrams", required=True)
+parser = argparse.ArgumentParser(
+ description=(
+ "Set input embedding of <null> token to weighted average "
+ "of all input embeddings"))
+parser.add_argument(
+ "-p", "--nplm-python-path", type=str, dest="nplm_python_path",
+ default='/mnt/gna0/rsennrich/tools/nplm/python')
+parser.add_argument(
+ "-i", "--input-model", type=str, dest="input_model", required=True)
+parser.add_argument(
+ "-o", "--output-model", type=str, dest="output_model", required=True)
+parser.add_argument(
+ "-n", "--null-token-index", type=int, dest="null_idx", default=-1)
+parser.add_argument(
+ "-t", "--training-ngrams", type=str, dest="training_ngrams",
+ required=True)
options = parser.parse_args()
sys.path.append(options.nplm_python_path)
import nplm
-from collections import defaultdict
+
def load_model(model_file):
return nplm.NeuralLM.from_file(model_file)
+
def get_weights(path, length):
- counter = [0]*length
+ counter = [0] * length
for line in open(path):
last_context = int(line.split()[-2])
counter[last_context] += 1
return counter
+
if __name__ == "__main__":
model = load_model(options.input_model)
if options.null_idx == -1:
- options.null_idx = model.word_to_index_input['<null>']
+ options.null_idx = model.word_to_index_input['<null>']
sys.stderr.write('index of <null>: {0}\n'.format(options.null_idx))
- weights = numpy.array(get_weights(options.training_ngrams, len(model.input_embeddings)))
- model.input_embeddings[options.null_idx] = numpy.average(numpy.array(model.input_embeddings), weights=weights, axis=0)
- model.to_file(open(options.output_model,'w'))
+ weights = numpy.array(
+ get_weights(options.training_ngrams, len(model.input_embeddings)))
+ model.input_embeddings[options.null_idx] = numpy.average(
+ numpy.array(model.input_embeddings), weights=weights, axis=0)
+ model.to_file(open(options.output_model, 'w'))
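
[Editor's note, not part of the commit: the core operation above is a frequency-weighted average over the input embedding matrix. The standalone sketch below uses toy numbers, not a real NPLM model file, to show what numpy.average does here.]

import numpy

# Toy 4-word vocabulary with 3-dimensional input embeddings.
input_embeddings = numpy.array([
    [0.1, 0.2, 0.3],
    [0.4, 0.4, 0.4],
    [0.9, 0.0, 0.1],
    [0.0, 0.0, 0.0],   # pretend this row is <null>
])
null_idx = 3
# How often each word occurred as the last context token in the training ngrams.
weights = numpy.array([10, 5, 1, 0])

# Replace the <null> embedding with the count-weighted mean of all rows.
input_embeddings[null_idx] = numpy.average(
    input_embeddings, weights=weights, axis=0)
print(input_embeddings[null_idx])
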
diff --git a/scripts/training/bilingual-lm/extract.py b/scripts/training/bilingual-lm/extract.py
index 727c15ac1..f620edb5d 100755
--- a/scripts/training/bilingual-lm/extract.py
+++ b/scripts/training/bilingual-lm/extract.py
@@ -1,9 +1,7 @@
#!/usr/bin/env python
from collections import Counter
-import heapq
import logging
-import optparse
import sys
LOG = logging.getLogger(__name__)
@@ -12,26 +10,28 @@ BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"
-def replace_tags(tokens,tags,vocab):
- for i,t in enumerate(tokens):
- if not t in vocab:
- if i < len(tags):
- tokens[i] = tags[i]
- else:
- print "Error: missing tags for index i:", i
- print ' '.join(tokens)
- print ' '.join(tags)
- tokens[i] = UNK
-def replace_unks(tokens,vocab):
- for i,t in enumerate(tokens):
- if not t in vocab:
- tokens[i] = UNK
+def replace_tags(tokens, tags, vocab):
+ for i, t in enumerate(tokens):
+ if t not in vocab:
+ if i < len(tags):
+ tokens[i] = tags[i]
+ else:
+ print "Error: missing tags for index i:", i
+ print ' '.join(tokens)
+ print ' '.join(tags)
+ tokens[i] = UNK
+
+
+def replace_unks(tokens, vocab):
+ for i, t in enumerate(tokens):
+ if t not in vocab:
+ tokens[i] = UNK
def numberize(line, m, n, svocab, tvocab):
line = line.split()
- source_words = line[:2*m + 1]
+ source_words = line[:2 * m + 1]
target_words = line[-n:]
line = ' '.join([str(svocab[item]) for item in source_words]) + ' '
@@ -40,7 +40,8 @@ def numberize(line, m, n, svocab, tvocab):
return line
-def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang, m, n, ofh):
+def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,
+ tlang, m, n, ofh):
"""
m - source context
n - target context
@@ -51,83 +52,87 @@ def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang
sfh = open(corpus_stem + "." + slang)
tfh = open(corpus_stem + "." + tlang)
afh = open(align_file)
- fhs = [sfh,tfh,afh]
+ fhs = [sfh, tfh, afh]
if tagged_stem:
- fhs.append(open(tagged_stem + "." + slang))
- fhs.append(open(tagged_stem + "." + tlang))
+ fhs.append(open(tagged_stem + "." + slang))
+ fhs.append(open(tagged_stem + "." + tlang))
- count = 0
+ count = 0
ngrams = 0
LOG.info("Extracting ngrams")
- for lines in zip(*fhs):
- stokens = lines[0][:-1].split()
- ttokens = lines[1][:-1].split()
- stokens.append(EOS)
- ttokens.append(EOS)
- if tagged_stem:
- stags = lines[3][:-1].split()
- ttags = lines[4][:-1].split()
- stags.append(EOS)
- ttags.append(EOS)
- tags.update(stags)
- tags.update(ttags)
- replace_tags(stokens,stags,svocab)
- replace_tags(ttokens,ttags,tvocab)
- else:
- replace_unks(stokens,svocab)
- replace_unks(ttokens,tvocab)
- # list aligns for each target
- # Note: align specifies source -> target
- target_aligns = [[] for t in range(len(ttokens))]
- for atoken in lines[2][:-1].split():
- spos,tpos = atoken.split("-")
- spos,tpos = int(spos), int(tpos)
- target_aligns[tpos].append(spos)
- #EOS alignment
- target_aligns[-1] = [len(stokens)-1]
-
- for tpos,spos_list in enumerate(target_aligns):
- # Affiliation heuristics - see Devlin t al. p1371
- if not spos_list:
- #tpos has no alignment, look right, then left, then right-right, then left-left etc
- rpos = tpos+1
- lpos = tpos-1
- while rpos < len(ttokens) or lpos >= 0:
- if rpos < len(ttokens) and target_aligns[rpos]:
- spos_list = target_aligns[rpos]
- break
- if lpos >= 0 and target_aligns[lpos]:
- spos_list = target_aligns[lpos]
- break
- rpos += 1
- lpos -= 1
-
- if not spos_list:
- raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
- midpos = (len(spos_list)-1) / 2
- spos = sorted(spos_list)[midpos]
-
-
- # source-context, target-context, predicted word
- for i in range(max(0,m-spos)):
- print>>ofh, BOS,
- #print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
- print>>ofh, " ".join([s for s in stokens[max(0,spos-m):spos+m+1]]),
- for i in range(max(0,spos+m+1-len(stokens))):
- print>>ofh, EOS,
- for i in range(max(0,n-(tpos+1))):
- print>>ofh, BOS,
- print>>ofh, " ".join([t for t in ttokens[max(0,tpos+1-n):tpos+1]]),
- print>>ofh
- ngrams += 1
-
-
- count += 1
- if count % 1000 == 0: sys.stderr.write(".")
- if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)
+ for lines in zip(*fhs):
+ stokens = lines[0][:-1].split()
+ ttokens = lines[1][:-1].split()
+ stokens.append(EOS)
+ ttokens.append(EOS)
+ if tagged_stem:
+ stags = lines[3][:-1].split()
+ ttags = lines[4][:-1].split()
+ stags.append(EOS)
+ ttags.append(EOS)
+ tags.update(stags)
+ tags.update(ttags)
+ replace_tags(stokens, stags, svocab)
+ replace_tags(ttokens, ttags, tvocab)
+ else:
+ replace_unks(stokens, svocab)
+ replace_unks(ttokens, tvocab)
+ # List aligns for each target.
+ # Note: align specifies source -> target
+ target_aligns = [[] for t in range(len(ttokens))]
+ for atoken in lines[2][:-1].split():
+ spos, tpos = atoken.split("-")
+ spos, tpos = int(spos), int(tpos)
+ target_aligns[tpos].append(spos)
+
+ # EOS alignment.
+ target_aligns[-1] = [len(stokens) - 1]
+
+ for tpos, spos_list in enumerate(target_aligns):
+ # Affiliation heuristics - see Devlin t al. p1371
+ if not spos_list:
+ # tpos has no alignment, look right, then left, then
+ # right-right, then left-left etc.
+ rpos = tpos + 1
+ lpos = tpos - 1
+ while rpos < len(ttokens) or lpos >= 0:
+ if rpos < len(ttokens) and target_aligns[rpos]:
+ spos_list = target_aligns[rpos]
+ break
+ if lpos >= 0 and target_aligns[lpos]:
+ spos_list = target_aligns[lpos]
+ break
+ rpos += 1
+ lpos -= 1
+
+ if not spos_list:
+ raise Exception(
+ "No alignments in sentence \nSRC: " +
+ lines[0][:-1] + "\nTGT: " + lines[1][:-1])
+ midpos = (len(spos_list) - 1) / 2
+ spos = sorted(spos_list)[midpos]
+
+ # source-context, target-context, predicted word
+ for i in range(max(0, m - spos)):
+ print>>ofh, BOS,
+ # print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
+ print>>ofh, " ".join(
+ [s for s in stokens[max(0, spos - m):spos + m + 1]]),
+ for i in range(max(0, spos + m + 1 - len(stokens))):
+ print>>ofh, EOS,
+ for i in range(max(0, n - (tpos + 1))):
+ print>>ofh, BOS,
+ print>>ofh, " ".join(
+ [t for t in ttokens[max(0, tpos + 1 - n):tpos + 1]]),
+ print>>ofh
+ ngrams += 1
+
+ count += 1
+ if count % 1000 == 0:
+ sys.stderr.write(".")
+ if count % 50000 == 0:
+ sys.stderr.write(" [%d]\n" % count)
ofh.close()
sys.stderr.write("\n")
LOG.info("Extracted %d ngrams" % ngrams)
return tags
-
-
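
[Editor's note, not part of the commit: the affiliation heuristic reindented above (look right, then left, then right-right, then left-left for the nearest aligned target word, and take the middle source link) is easier to read in isolation. A self-contained sketch with a toy alignment rather than real extractor state:]

def affiliated_source_positions(target_aligns, tpos):
    """Source links for target position tpos, borrowing from the nearest
    aligned neighbour (right first, then left) when tpos is unaligned."""
    spos_list = target_aligns[tpos]
    rpos, lpos = tpos + 1, tpos - 1
    while not spos_list and (rpos < len(target_aligns) or lpos >= 0):
        if rpos < len(target_aligns) and target_aligns[rpos]:
            spos_list = target_aligns[rpos]
            break
        if lpos >= 0 and target_aligns[lpos]:
            spos_list = target_aligns[lpos]
            break
        rpos += 1
        lpos -= 1
    return spos_list


# Toy example: target word 1 has no alignment, so it borrows from word 2.
target_aligns = [[0], [], [2, 3], [4]]
spos_list = affiliated_source_positions(target_aligns, 1)
midpos = (len(spos_list) - 1) // 2      # middle link, as in extract.py
print(sorted(spos_list)[midpos])        # -> 2
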
diff --git a/scripts/training/bilingual-lm/extract_test.py b/scripts/training/bilingual-lm/extract_test.py
index a22c2e429..3c9a03b85 100755
--- a/scripts/training/bilingual-lm/extract_test.py
+++ b/scripts/training/bilingual-lm/extract_test.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python
-#
-# Create a test corpus, using a previously pruned vocabulary.
-#
+"""Create a test corpus, using a previously pruned vocabulary."""
+
import logging
import optparse
@@ -12,72 +11,84 @@ import sys
import extract
+
def read_vocab(filename, offset=0):
- vocab = {}
- for i, line in enumerate(open(filename)):
- vocab[line.strip()] = i+offset
- return vocab, i+offset
+ vocab = {}
+ for i, line in enumerate(open(filename)):
+ vocab[line.strip()] = i + offset
+ return vocab, i + offset
-def main():
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
- parser = optparse.OptionParser("%prog [options]")
- parser.add_option("-e", "--target-language", type="string", dest="target_language")
- parser.add_option("-f", "--source-language", type="string", dest="source_language")
- parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
- parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
- parser.add_option("-a", "--align", type="string", dest="align_file")
- parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
-
-
- parser.set_defaults(
- target_language = "en",
- source_language = "de",
- corpus_stem = "test",
- align_file = "test.align",
- working_dir = "working",
- )
- options,args = parser.parse_args(sys.argv)
- if not os.path.exists(options.working_dir):
- LOG.error("Working directory '%s' not found" % working_dir)
- sys.exit(1)
-
- m,n = None,None
- for line in open(options.working_dir + "/info"):
- name,value = line[:-1].split()
- if name == "m": m = int(value)
- if name == "n": n = int(value)
- if m == None or n == None:
- LOG.error("info file is incomplete")
- sys.exit(1)
-
- tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
- svocab, offset = read_vocab(options.working_dir + "/vocab.source", offset+1)
-
- file_stem = os.path.basename(options.corpus_stem)
- ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
- extract.get_ngrams(options.corpus_stem,
- options.align_file,
- options.tagged_stem,
- svocab,
- tvocab,
- options.source_language,
- options.target_language,
- m,
- n,
- ofh)
-
- numberized_file = options.working_dir + "/" + file_stem + ".numberized"
- ngrams_file_handle = open(options.working_dir + "/" + file_stem + ".ngrams", 'r')
- numberized_file_handle = open(numberized_file, 'w')
-
- #Numberize the file
- for line in ngrams_file_handle:
- numberized_file_handle.write(extract.numberize(line, m, n, svocab, tvocab))
-
- numberized_file_handle.close()
- ngrams_file_handle.close()
+def main():
+ logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option(
+ "-e", "--target-language", type="string", dest="target_language")
+ parser.add_option(
+ "-f", "--source-language", type="string", dest="source_language")
+ parser.add_option(
+ "-c", "--corpus", type="string", dest="corpus_stem")
+ parser.add_option(
+ "-t", "--tagged-corpus", type="string", dest="tagged_stem")
+ parser.add_option(
+ "-a", "--align", type="string", dest="align_file")
+ parser.add_option(
+ "-w", "--working-dir", type="string", dest="working_dir")
+
+ parser.set_defaults(
+ target_language="en",
+ source_language="de",
+ corpus_stem="test",
+ align_file="test.align",
+ working_dir="working")
+ options, args = parser.parse_args(sys.argv)
+ if not os.path.exists(options.working_dir):
+ raise Exception(
+ "Working directory '%s' not found" % options.working_dir)
+
+ m, n = None, None
+ for line in open(options.working_dir + "/info"):
+ name, value = line[:-1].split()
+ if name == "m":
+ m = int(value)
+ if name == "n":
+ n = int(value)
+ if m is None or n is None:
+ raise Exception("Info file is incomplete.")
+
+ tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
+ svocab, offset = read_vocab(
+ options.working_dir + "/vocab.source", offset + 1)
+
+ file_stem = os.path.basename(options.corpus_stem)
+ ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
+ extract.get_ngrams(
+ options.corpus_stem,
+ options.align_file,
+ options.tagged_stem,
+ svocab,
+ tvocab,
+ options.source_language,
+ options.target_language,
+ m,
+ n,
+ ofh)
+
+ numberized_file = options.working_dir + "/" + file_stem + ".numberized"
+ ngrams_file_handle = open(
+ os.path.join(options.working_dir, file_stem + ".ngrams"), 'r')
+ numberized_file_handle = open(numberized_file, 'w')
+
+ # Numberize the file.
+ for line in ngrams_file_handle:
+ numberized_file_handle.write(extract.numberize(
+ line, m, n, svocab, tvocab))
+
+ numberized_file_handle.close()
+ ngrams_file_handle.close()
if __name__ == "__main__":
- main()
+ main()
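
[Editor's note, not part of the commit: in extract_test.py target words are numbered first and source words continue from the last target index plus one, so both vocabularies share one id space. A small sketch with in-memory word lists instead of vocab files:]

def number_words(words, offset=0):
    """Mimics read_vocab(): map each word to a running index starting at offset,
    and also return the last index assigned."""
    vocab = {word: i + offset for i, word in enumerate(words)}
    return vocab, offset + len(words) - 1


tvocab, last = number_words(["<unk>", "<null>", "the", "cat"])
svocab, _ = number_words(["<unk>", "die", "Katze"], last + 1)
print(tvocab)   # {'<unk>': 0, '<null>': 1, 'the': 2, 'cat': 3}
print(svocab)   # {'<unk>': 4, 'die': 5, 'Katze': 6}
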
diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py
index cd8755580..bd3538188 100755
--- a/scripts/training/bilingual-lm/extract_training.py
+++ b/scripts/training/bilingual-lm/extract_training.py
@@ -11,145 +11,160 @@ import extract
LOG = logging.getLogger(__name__)
-def get_pruned_vocab(corpus,prune):
- counts = Counter()
- LOG.info("Reading vocabulary from %s" % corpus)
- lines = 0
- for line in open(corpus):
- for token in line[:-1].split():
- counts[token] += 1
- lines += 1
- if lines % 1000 == 0: sys.stderr.write(".")
- if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
- sys.stderr.write("\n")
- counts[extract.BOS] += lines
- counts[extract.EOS] += lines
- LOG.info("Vocabulary size: %d" % len(counts))
- if prune:
- return Counter(dict(counts.most_common(prune)))
- else:
- return counts
+
+def get_pruned_vocab(corpus, prune):
+ counts = Counter()
+ LOG.info("Reading vocabulary from %s" % corpus)
+ lines = 0
+ for line in open(corpus):
+ for token in line[:-1].split():
+ counts[token] += 1
+ lines += 1
+ if lines % 1000 == 0:
+ sys.stderr.write(".")
+ if lines % 50000 == 0:
+ sys.stderr.write(" [%d]\n" % lines)
+ sys.stderr.write("\n")
+ counts[extract.BOS] += lines
+ counts[extract.EOS] += lines
+ LOG.info("Vocabulary size: %d" % len(counts))
+ if prune:
+ return Counter(dict(counts.most_common(prune)))
+ else:
+ return counts
+
def save_vocab(directory, filename, vocab):
- fh = open(directory + "/" + filename, "w")
- for word in vocab:
- print>>fh, word
-
+ fh = open(directory + "/" + filename, "w")
+ for word in vocab:
+ print>>fh, word
+
+
def main():
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
- parser = optparse.OptionParser("%prog [options]")
- parser.add_option("-e", "--target-language", type="string", dest="target_language")
- parser.add_option("-f", "--source-language", type="string", dest="source_language")
- parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
- parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
- parser.add_option("-a", "--align", type="string", dest="align_file")
- parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
- parser.add_option("-n", "--target-context", type="int", dest="n")
- parser.add_option("-m", "--source-context", type="int", dest="m")
- parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
- parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
-
-
- parser.set_defaults(
- target_language = "en",
- source_language = "de",
- corpus_stem = "train.10k",
- align_file = "train.10k.align",
- n = 5,
- m = 4,
- working_dir = "working",
- sprune=16000,
- tprune=16000
- )
- options,args = parser.parse_args(sys.argv)
-
- if not os.path.exists(options.working_dir):
- os.makedirs(options.working_dir)
- else:
- LOG.warn("Directory %s already exists, re-using" % options.working_dir)
-
- info_file = options.working_dir + "/info"
- if os.path.exists(info_file):
- for line in open(info_file):
- name,value = line[:-1].split()
- if name == "n" and int(value) != options.n or \
- name == "m" and int(value) != options.m:
- LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
- sys.exit(1)
- else:
- ifh = open(info_file,"w")
- print>>ifh,"m",options.m
- print>>ifh,"n",options.n
- ifh.close()
-
- scorpus = options.corpus_stem + "." + options.source_language
- tcorpus = options.corpus_stem + "." + options.target_language
-
- tvocab,svocab = None,None
- # Extract vocabulary, and prune, if required
- svocab = get_pruned_vocab(scorpus,options.sprune)
- tvocab = get_pruned_vocab(tcorpus,options.tprune)
-
-
- file_stem = os.path.basename(options.corpus_stem)
- ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
- ofh = open(ngram_file, "w")
-
- tags = extract.get_ngrams(options.corpus_stem,
- options.align_file,
- options.tagged_stem,
- svocab,
- tvocab,
- options.source_language,
- options.target_language,
- options.m,
- options.n,
- ofh)
-
- # Save vocabularies
- del svocab["<null>"]
- del tvocab["<null>"]
- del svocab["<unk>"]
- del tvocab["<unk>"]
- svocab_list = [item[0] for item in svocab.most_common()]
- tvocab_list = [item[0] for item in tvocab.most_common()]
-
- # UNK is always the first vocabulary element. Make sure
- # it appears in position 0
- # We need to use <null> token in the chart decoder in order
- # to correctly estimate the probabilities of incomplete subphrases
- # that are not sentence initial.
-
- tvocab_list.insert(0, "<null>")
- tvocab_list.insert(0, "<unk>")
- svocab_list.insert(0, "<unk>")
-
- #Get tags:
- tag_list = [item[0] for item in tags.most_common()]
- svocab_list = svocab_list + tag_list
- tvocab_list = tvocab_list + tag_list
-
- save_vocab(options.working_dir, "vocab.source", svocab_list)
- save_vocab(options.working_dir, "vocab.target", tvocab_list)
-
- #Create vocab dictionaries that map word to ID
- tvocab_idmap = {}
- for i in range(len(tvocab_list)):
- tvocab_idmap[tvocab_list[i]] = i
-
- svocab_idmap = {}
- for i in range(len(svocab_list)):
- svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
-
- numberized_file = options.working_dir + "/" + file_stem + ".numberized"
- ngrams_file_handle = open(ngram_file, 'r')
- numberized_file_handle = open(numberized_file, 'w')
-
- #Numberize the file
- for line in ngrams_file_handle:
- numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap))
- numberized_file_handle.close()
- ngrams_file_handle.close()
+ logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option(
+ "-e", "--target-language", type="string", dest="target_language")
+ parser.add_option(
+ "-f", "--source-language", type="string", dest="source_language")
+ parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
+ parser.add_option(
+ "-t", "--tagged-corpus", type="string", dest="tagged_stem")
+ parser.add_option("-a", "--align", type="string", dest="align_file")
+ parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
+ parser.add_option("-n", "--target-context", type="int", dest="n")
+ parser.add_option("-m", "--source-context", type="int", dest="m")
+ parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
+ parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
+
+ parser.set_defaults(
+ target_language="en",
+ source_language="de",
+ corpus_stem="train.10k",
+ align_file="train.10k.align",
+ n=5,
+ m=4,
+ working_dir="working",
+ sprune=16000,
+ tprune=16000
+ )
+ options, args = parser.parse_args(sys.argv)
+
+ if not os.path.exists(options.working_dir):
+ os.makedirs(options.working_dir)
+ else:
+ LOG.warn("Directory %s already exists, re-using" % options.working_dir)
+
+ info_file = options.working_dir + "/info"
+ if os.path.exists(info_file):
+ for line in open(info_file):
+ name, value = line[:-1].split()
+ n_mismatch = (name == 'n' and int(value) != options.n)
+ m_mismatch = (name == 'm' and int(value) != options.m)
+ if n_mismatch or m_mismatch:
+ LOG.error(
+ "info file exists, but parameters do not match. "
+ "Delete working directory and rerun.")
+ sys.exit(1)
+ else:
+ ifh = open(info_file, "w")
+ print>>ifh, "m", options.m
+ print>>ifh, "n", options.n
+ ifh.close()
+
+ scorpus = options.corpus_stem + "." + options.source_language
+ tcorpus = options.corpus_stem + "." + options.target_language
+
+ tvocab, svocab = None, None
+ # Extract vocabulary, and prune, if required.
+ svocab = get_pruned_vocab(scorpus, options.sprune)
+ tvocab = get_pruned_vocab(tcorpus, options.tprune)
+
+ file_stem = os.path.basename(options.corpus_stem)
+ ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
+ ofh = open(ngram_file, "w")
+
+ tags = extract.get_ngrams(
+ options.corpus_stem,
+ options.align_file,
+ options.tagged_stem,
+ svocab,
+ tvocab,
+ options.source_language,
+ options.target_language,
+ options.m,
+ options.n,
+ ofh)
+
+ # Save vocabularies.
+ del svocab["<null>"]
+ del tvocab["<null>"]
+ del svocab["<unk>"]
+ del tvocab["<unk>"]
+ svocab_list = [item[0] for item in svocab.most_common()]
+ tvocab_list = [item[0] for item in tvocab.most_common()]
+
+ # UNK is always the first vocabulary element. Make sure
+ # it appears in position 0
+ # We need to use <null> token in the chart decoder in order
+ # to correctly estimate the probabilities of incomplete subphrases
+ # that are not sentence initial.
+
+ tvocab_list.insert(0, "<null>")
+ tvocab_list.insert(0, "<unk>")
+ svocab_list.insert(0, "<unk>")
+
+ # Get tags:
+ tag_list = [item[0] for item in tags.most_common()]
+ svocab_list = svocab_list + tag_list
+ tvocab_list = tvocab_list + tag_list
+
+ save_vocab(options.working_dir, "vocab.source", svocab_list)
+ save_vocab(options.working_dir, "vocab.target", tvocab_list)
+
+ # Create vocab dictionaries that map word to ID.
+ tvocab_idmap = {}
+ for i in range(len(tvocab_list)):
+ tvocab_idmap[tvocab_list[i]] = i
+
+ svocab_idmap = {}
+ for i in range(len(svocab_list)):
+ svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
+
+ numberized_file = options.working_dir + "/" + file_stem + ".numberized"
+ ngrams_file_handle = open(ngram_file, 'r')
+ numberized_file_handle = open(numberized_file, 'w')
+
+ # Numberize the file.
+ for line in ngrams_file_handle:
+ numberized_file_handle.write(
+ extract.numberize(
+ line, options.m, options.n, svocab_idmap, tvocab_idmap))
+ numberized_file_handle.close()
+ ngrams_file_handle.close()
+
if __name__ == "__main__":
- main()
+ main()
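
[Editor's note, not part of the commit: the vocabulary pruning reindented above relies on collections.Counter — count every token, then keep only the `prune` most frequent entries. A standalone sketch on a toy corpus:]

from collections import Counter


def pruned_vocab(sentences, prune=None):
    counts = Counter()
    for sentence in sentences:
        counts.update(sentence.split())
    if prune:
        return Counter(dict(counts.most_common(prune)))
    return counts


corpus = ["the cat sat on the mat", "the dog sat"]
print(pruned_vocab(corpus, prune=3))
# Counter({'the': 3, 'sat': 2, 'cat': 1})  (which singleton survives may vary)
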
diff --git a/scripts/training/bilingual-lm/reduce_ngrams.py b/scripts/training/bilingual-lm/reduce_ngrams.py
index 65795a10c..3442fb302 100755
--- a/scripts/training/bilingual-lm/reduce_ngrams.py
+++ b/scripts/training/bilingual-lm/reduce_ngrams.py
@@ -1,6 +1,10 @@
#!/usr/bin/env python3
-"""Reduces an ngrams file for training nplm to a smaller version of it with less ngrams"""
+"""Reduces an ngrams file for training nplm to a smaller version of it.
+
+The smaller version will have fewer ngrams.
+"""
+
from sys import argv
if len(argv) != 5:
@@ -15,11 +19,11 @@ NGRAMS = int(argv[4])
for line in INFILE:
line = line.split()
- line = line[START_IDX:START_IDX+NGRAMS]
+ line = line[START_IDX:START_IDX + NGRAMS]
linetowrite = ""
for token in line:
linetowrite = linetowrite + token + " "
- #Strip final empty space and add newline
+ # Strip final empty space and add newline.
linetowrite = linetowrite[:-1]
linetowrite = linetowrite + '\n'
OUTFILE.write(linetowrite)
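
[Editor's note, not part of the commit: the slice above is the whole point of reduce_ngrams.py — from each line of the ngrams file it keeps NGRAMS tokens starting at START_IDX. For example:]

# Keep 3 tokens starting at index 2 from each line, as reduce_ngrams.py does.
START_IDX, NGRAMS = 2, 3
line = "w1 w2 w3 w4 w5 w6 w7"
tokens = line.split()[START_IDX:START_IDX + NGRAMS]
print(" ".join(tokens))   # -> "w3 w4 w5"
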
diff --git a/scripts/training/bilingual-lm/test_nplm.py b/scripts/training/bilingual-lm/test_nplm.py
index 51b8cebda..737266bc3 100755
--- a/scripts/training/bilingual-lm/test_nplm.py
+++ b/scripts/training/bilingual-lm/test_nplm.py
@@ -7,51 +7,71 @@ import sys
def main():
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
- parser = optparse.OptionParser("%prog [options]")
- parser.add_option("-w", "--working-dir", dest="working_dir")
- parser.add_option("-c", "--corpus", dest="corpus_stem")
- parser.add_option("-r", "--train-corpus", dest="train_stem")
- parser.add_option("-l", "--nplm-home", dest="nplm_home")
- parser.add_option("-e", "--epoch", dest="epoch", type="int")
- parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
- parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
- parser.add_option("-t", "--threads", dest="threads", type="int")
-
- parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "test"
- ,train_stem = "train.10k"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epoch=10
- ,ngram_size = 14
- ,minibatch_size=1000
- ,threads=8
- )
-
- options,args = parser.parse_args(sys.argv)
-
- model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
- model_file = model_prefix + "." + str(options.epoch)
- test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
- prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
- vocab_file = options.working_dir + "/vocab"
-
- #TODO: Get ngram size from info file.
- prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
- str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
- ret = subprocess.call(prep_args)
- if ret: raise Exception("Preparation failed")
-
- test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
- model_file , "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
- ret = subprocess.call(test_args)
- if ret: raise Exception("Testing failed")
-
-#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
-
-#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
+ logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-w", "--working-dir", dest="working_dir")
+ parser.add_option("-c", "--corpus", dest="corpus_stem")
+ parser.add_option("-r", "--train-corpus", dest="train_stem")
+ parser.add_option("-l", "--nplm-home", dest="nplm_home")
+ parser.add_option("-e", "--epoch", dest="epoch", type="int")
+ parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
+ parser.add_option(
+ "-b", "--minibatch-size", dest="minibatch_size", type="int")
+ parser.add_option("-t", "--threads", dest="threads", type="int")
+
+ parser.set_defaults(
+ working_dir="working",
+ corpus_stem="test",
+ train_stem="train.10k",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epoch=10,
+ ngram_size=14,
+ minibatch_size=1000,
+ threads=8)
+
+ options, _ = parser.parse_args(sys.argv)
+
+ model_prefix = (
+ options.working_dir + "/" + options.train_stem + ".model.nplm")
+ model_file = model_prefix + "." + str(options.epoch)
+ test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
+ prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
+ vocab_file = options.working_dir + "/vocab"
+
+ # TODO: Get ngram size from info file.
+ prep_args = [
+ options.nplm_home + "/src/prepareNeuralLM",
+ "--train_text", test_file,
+ "--ngram_size", str(options.ngram_size),
+ "--ngramize", "0",
+ "--words_file", vocab_file,
+ "--train_file", prep_file,
+ ]
+ ret = subprocess.call(prep_args)
+ if ret:
+ raise Exception("Preparation failed")
+
+ test_args = [
+ options.nplm_home + "/src/testNeuralNetwork",
+ "--test_file", prep_file,
+ "--model_file", model_file,
+ "--minibatch_size", str(options.minibatch_size),
+ "--num_threads", str(options.threads),
+ ]
+ ret = subprocess.call(test_args)
+ if ret:
+ raise Exception("Testing failed")
+
+# $ROOT/src/prepareNeuralLM --train_text $TESTFILE1 \
+# --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE \
+# --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
+
+# $ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams \
+# --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE \
+# --num_threads $THREADS || exit 1
+
if __name__ == "__main__":
- main()
-
+ main()
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index 356fd798d..7bc74429e 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -8,7 +8,9 @@ import subprocess
import sys
import os
-logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-c", "--corpus", dest="corpus_stem")
@@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
parser.add_argument("-s", "--noise", dest="noise", type=int)
parser.add_argument("-d", "--hidden", dest="hidden", type=int)
-parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
-parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
+parser.add_argument(
+ "-i", "--input-embedding", dest="input_embedding", type=int)
+parser.add_argument(
+ "-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument("-t", "--threads", dest="threads", type=int)
parser.add_argument("-m", "--output-model", dest="output_model")
parser.add_argument("-r", "--output-dir", dest="output_dir")
@@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "train.10k"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epochs = 10
- ,ngram_size = 14
- ,minibatch_size=1000
- ,noise=100
- ,hidden=750
- ,input_embedding=150
- ,output_embedding=150
- ,threads=1
- ,output_model = "train.10k"
- ,output_dir = None
- ,config_options_file = "config"
- ,log_file = "log"
- ,validation_file = None
- ,activation_fn = "rectifier"
- ,learning_rate = 1
- ,input_words_file = None
- ,output_words_file = None
- ,input_vocab_size = 0
- ,output_vocab_size = 0
+ working_dir="working",
+ corpus_stem="train.10k",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epochs=10,
+ ngram_size=14,
+ minibatch_size=1000,
+ noise=100,
+ hidden=750,
+ input_embedding=150,
+ output_embedding=150,
+ threads=1,
+ output_model="train.10k",
+ output_dir=None,
+ config_options_file="config",
+ log_file="log",
+ validation_file=None,
+ activation_fn="rectifier",
+ learning_rate=1,
+ input_words_file=None,
+ output_words_file=None,
+ input_vocab_size=0,
+ output_vocab_size=0
)
+
def main(options):
- vocab_command = []
- if options.input_words_file is not None:
- vocab_command += ['--input_words_file', options.input_words_file]
- if options.output_words_file is not None:
- vocab_command += ['--output_words_file', options.output_words_file]
- if options.input_vocab_size:
- vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
- if options.output_vocab_size:
- vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
-
- # Set up validation command variable to use with validation set.
- validations_command = []
- if options.validation_file is not None:
- validations_command =["--validation_file", (options.validation_file + ".numberized")]
-
- # In order to allow for different models to be trained after the same
- # preparation step, we should provide an option for multiple output directories
- # If we have not set output_dir, set it to the same thing as the working dir
-
- if options.output_dir is None:
- options.output_dir = options.working_dir
- else:
- # Create output dir if necessary
- if not os.path.exists(options.output_dir):
- os.makedirs(options.output_dir)
-
- config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
- log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
- log_file_write = open(log_file, 'w')
- config_file_write = open(config_file, 'w')
-
- config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
-
- in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
-
- model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
- train_args = [options.nplm_home + "/src/trainNeuralNetwork",
- "--train_file", in_file,
- "--num_epochs", str(options.epochs),
- "--model_prefix", model_prefix,
- "--learning_rate", str(options.learning_rate),
- "--minibatch_size", str(options.minibatch_size),
- "--num_noise_samples", str(options.noise),
- "--num_hidden", str(options.hidden),
- "--input_embedding_dimension", str(options.input_embedding),
- "--output_embedding_dimension", str(options.output_embedding),
- "--num_threads", str(options.threads),
- "--activation_function", options.activation_fn] + validations_command + vocab_command
- print("Train model command: ")
- print(', '.join(train_args))
-
- config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
- config_file_write.close()
-
- log_file_write.write("Training output:\n")
- ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
- if ret:
- raise Exception("Training failed")
-
- log_file_write.close()
+ vocab_command = []
+ if options.input_words_file is not None:
+ vocab_command += ['--input_words_file', options.input_words_file]
+ if options.output_words_file is not None:
+ vocab_command += ['--output_words_file', options.output_words_file]
+ if options.input_vocab_size:
+ vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
+ if options.output_vocab_size:
+ vocab_command += [
+ '--output_vocab_size', str(options.output_vocab_size)]
+
+ # Set up validation command variable to use with validation set.
+ validations_command = []
+ if options.validation_file is not None:
+ validations_command = [
+ "--validation_file", (options.validation_file + ".numberized")]
+
+ # In order to allow for different models to be trained after the same
+ # preparation step, we should provide an option for multiple output
+ # directories.
+ # If we have not set output_dir, set it to the same thing as the working
+ # dir.
+
+ if options.output_dir is None:
+ options.output_dir = options.working_dir
+ else:
+ # Create output dir if necessary
+ if not os.path.exists(options.output_dir):
+ os.makedirs(options.output_dir)
+
+ config_file = os.path.join(
+ options.output_dir,
+ options.config_options_file + '-' + options.output_model)
+ log_file = os.path.join(
+ options.output_dir, options.log_file + '-' + options.output_model)
+ log_file_write = open(log_file, 'w')
+ config_file_write = open(config_file, 'w')
+
+ config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
+
+ in_file = os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + ".numberized")
+
+ model_prefix = os.path.join(
+ options.output_dir, options.output_model + ".model.nplm")
+ train_args = [
+ options.nplm_home + "/src/trainNeuralNetwork",
+ "--train_file", in_file,
+ "--num_epochs", str(options.epochs),
+ "--model_prefix", model_prefix,
+ "--learning_rate", str(options.learning_rate),
+ "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise),
+ "--num_hidden", str(options.hidden),
+ "--input_embedding_dimension", str(options.input_embedding),
+ "--output_embedding_dimension", str(options.output_embedding),
+ "--num_threads", str(options.threads),
+ "--activation_function",
+ options.activation_fn,
+ ] + validations_command + vocab_command
+ print("Train model command: ")
+ print(', '.join(train_args))
+
+ config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
+ config_file_write.close()
+
+ log_file_write.write("Training output:\n")
+ ret = subprocess.call(
+ train_args, stdout=log_file_write, stderr=log_file_write)
+ if ret:
+ raise Exception("Training failed")
+
+ log_file_write.close()
-if __name__ == "__main__":
- options = parser.parse_args()
- main(options)
+if __name__ == "__main__":
+ options = parser.parse_args()
+ main(options)
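
[Editor's note, not part of the commit: the training call above follows a generic pattern — build an argument list, redirect both output streams to a log file, and fail loudly on a non-zero exit code. The sketch below uses a placeholder command ("echo") so it stays runnable; the real script calls $NPLM_HOME/src/trainNeuralNetwork with the flags shown in the diff.]

import subprocess

# Placeholder command and arguments, for illustration only.
train_args = ["echo", "--train_file", "train.numberized", "--num_epochs", "10"]
with open("train.log", "w") as log_file:
    ret = subprocess.call(train_args, stdout=log_file, stderr=log_file)
if ret:
    raise Exception("Training failed")
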
diff --git a/scripts/training/create_count_tables.py b/scripts/training/create_count_tables.py
index 064576969..2288c034a 100755
--- a/scripts/training/create_count_tables.py
+++ b/scripts/training/create_count_tables.py
@@ -2,15 +2,27 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>
-# This script creates tables that store phrase pair frequencies rather than probabilities.
-# These count tables can be used for a delayed, online computation of the original phrase translation features
-# The benefit is that models can be combined quickly, with the same results as if we trained a model on the concatenation of all data (excepting differences in word alignment).
-# Also, each model can be given a weight, which is applied to all frequencies of the model for the combination.
+# This script creates tables that store phrase pair frequencies rather than
+# probabilities.
+#
+# These count tables can be used for a delayed, online computation of the
+# original phrase translation features.
+#
+# The benefit is that models can be combined quickly, with the same results
+# as if we trained a model on the concatenation of all data (excepting
+# differences in word alignment).
+#
+# Also, each model can be given a weight, which is applied to all frequencies
+# of the model for the combination.
# Note: the input phrase table must have alignment information;
# it must be unsmoothed;
-# additionally, the phrase table type PhraseDictionaryMultiModelCounts requires the lexical counts files lex.counts.e2f and lex.counts.f2e (obtained by using the option --write-lexical-counts in train-model.perl)
-# The results may differ from training on the concatenation of all data due to differences in word alignment, and rounding errors.
+# additionally, the phrase table type PhraseDictionaryMultiModelCounts
+# requires the lexical counts files lex.counts.e2f and lex.counts.f2e
+# (obtained by using the option --write-lexical-counts in
+# train-model.perl)
+# The results may differ from training on the concatenation of all data due
+# to differences in word alignment, and rounding errors.
from __future__ import unicode_literals
@@ -21,11 +33,15 @@ from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
if len(sys.argv) < 3 or len(sys.argv) > 4:
- sys.stderr.write('Usage: ' + sys.argv[0] + ' in_file out_path [prune_count]\nThis script will create the files out_path/count-table.gz and out_path/count-table-target.gz\n')
+ sys.stderr.write(
+ 'Usage: ' +
+ sys.argv[0] + " in_file out_path [prune_count]\n"
+ "This script will create the files out_path/count-table.gz and "
+ "out_path/count-table-target.gz\n")
exit()
-def handle_file(filename,action,fileobj=None,mode='r'):
+def handle_file(filename, action, fileobj=None, mode='r'):
"""support reading either from stdin, plain file or gzipped file"""
if action == 'open':
@@ -33,21 +49,23 @@ def handle_file(filename,action,fileobj=None,mode='r'):
if mode == 'r':
mode = 'rb'
- if mode == 'rb' and not filename == '-' and not os.path.exists(filename):
- if os.path.exists(filename+'.gz'):
- filename = filename+'.gz'
+ if mode == 'rb' and filename != '-' and not os.path.exists(filename):
+ if os.path.exists(filename + '.gz'):
+ filename = filename + '.gz'
else:
- sys.stderr.write('Error: unable to open file. ' + filename + ' - aborting.\n')
+ sys.stderr.write(
+ "Error: unable to open file. " +
+ filename + " - aborting.\n")
exit()
if filename.endswith('.gz'):
- fileobj = gzip.open(filename,mode)
+ fileobj = gzip.open(filename, mode)
elif filename == '-':
fileobj = sys.stdin
else:
- fileobj = open(filename,mode)
+ fileobj = open(filename, mode)
return fileobj
@@ -59,10 +77,13 @@ def sort_and_uniq(infile, outfile):
cmd = ['sort', infile]
fobj = handle_file(outfile, 'open', mode='w')
- sys.stderr.write('Executing: LC_ALL=C ' + ' '.join(cmd) + ' | uniq | gzip -c > ' + outfile + '\n')
- p_sort = Popen(cmd, env={'LC_ALL':'C'}, stdout=PIPE)
- p_uniq = Popen(['uniq'], stdin = p_sort.stdout, stdout=PIPE)
- p_compress = Popen(['gzip', '-c'], stdin = p_uniq.stdout, stdout=fobj)
+ sys.stderr.write(
+ "Executing: LC_ALL=C " +
+ ' '.join(cmd) +
+ ' | uniq | gzip -c > ' + outfile + '\n')
+ p_sort = Popen(cmd, env={'LC_ALL': 'C'}, stdout=PIPE)
+ p_uniq = Popen(['uniq'], stdin=p_sort.stdout, stdout=PIPE)
+ p_compress = Popen(['gzip', '-c'], stdin=p_uniq.stdout, stdout=fobj)
p_compress.wait()
fobj.close()
@@ -89,9 +110,9 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
try:
fst = comments[2]
except IndexError:
- fst = str(int(round(float(scores[0])*float(ft)))).encode()
+ fst = str(int(round(float(scores[0]) * float(ft)))).encode()
- line[2] = b' '.join([fst,ft,fs])
+ line[2] = b' '.join([fst, ft, fs])
if prune:
if current_source != source:
@@ -106,8 +127,10 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
else:
countobj.write(b' ||| '.join(line))
- # target count file
- tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n' # if you use string formatting to make this look nicer, you may break Python 3 compatibility.
+ # Target count file.
+ # If you use string formatting to make this look nicer, you may break
+ # Python 3 compatibility.
+ tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n'
countobj_target.write(tline)
if prune:
@@ -119,7 +142,8 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
def write_batch(store_lines, outfile, prune):
top20 = sorted(store_lines, reverse=True)[:prune]
- for score, original_pos, store_line in sorted(top20, key = lambda x: x[1]): #write in original_order
+ # Write in original_order.
+ for score, original_pos, store_line in sorted(top20, key=lambda x: x[1]):
outfile.write(store_line)
@@ -130,21 +154,28 @@ if __name__ == '__main__':
else:
prune = 0
- fileobj = handle_file(sys.argv[1],'open')
+ fileobj = handle_file(sys.argv[1], 'open')
out_path = sys.argv[2]
- count_table_file = gzip.open(os.path.join(out_path,'count-table.gz'), 'w')
- count_table_target_file = os.path.join(out_path,'count-table-target.gz')
+ count_table_file = gzip.open(
+ os.path.join(out_path, 'count-table.gz'), 'w')
+ count_table_target_file = os.path.join(out_path, 'count-table-target.gz')
count_table_target_file_temp = NamedTemporaryFile(delete=False)
try:
- sys.stderr.write('Creating temporary file for unsorted target counts file: ' + count_table_target_file_temp.name + '\n')
+ sys.stderr.write(
+ "Creating temporary file for unsorted target counts file: " +
+ count_table_target_file_temp.name + '\n')
- create_count_lines(fileobj, count_table_file, count_table_target_file_temp, prune)
+ create_count_lines(
+ fileobj, count_table_file, count_table_target_file_temp, prune)
count_table_target_file_temp.close()
- sys.stderr.write('Finished writing, now re-sorting and compressing target count file\n')
+ sys.stderr.write(
+ "Finished writing, "
+ "now re-sorting and compressing target count file.\n")
- sort_and_uniq(count_table_target_file_temp.name, count_table_target_file)
+ sort_and_uniq(
+ count_table_target_file_temp.name, count_table_target_file)
os.remove(count_table_target_file_temp.name)
sys.stderr.write('Done\n')
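
[Editor's note, not part of the commit: sort_and_uniq() above is a small process pipeline, LC_ALL=C sort | uniq | gzip -c, wired together with subprocess.Popen. A self-contained sketch of the same chaining pattern on a throwaway input file:]

import gzip
import subprocess

# Write a small unsorted file with duplicates.
with open("counts.txt", "w") as f:
    f.write("b\na\nb\nc\na\n")

# Equivalent of: LC_ALL=C sort counts.txt | uniq | gzip -c > counts.sorted.gz
with open("counts.sorted.gz", "wb") as out:
    p_sort = subprocess.Popen(
        ["sort", "counts.txt"], env={"LC_ALL": "C"}, stdout=subprocess.PIPE)
    p_uniq = subprocess.Popen(
        ["uniq"], stdin=p_sort.stdout, stdout=subprocess.PIPE)
    p_compress = subprocess.Popen(
        ["gzip", "-c"], stdin=p_uniq.stdout, stdout=out)
    p_compress.wait()

print(gzip.open("counts.sorted.gz", "rt").read())   # -> "a\nb\nc\n"
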
diff --git a/scripts/training/flexibility_score.py b/scripts/training/flexibility_score.py
index 826574d7b..496184616 100755
--- a/scripts/training/flexibility_score.py
+++ b/scripts/training/flexibility_score.py
@@ -1,10 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# add flexibility scores to a phrase table half
-# you usually don't have to call this script directly; to add flexibility scores to your model, run train-model.perl with the option "--flexibility-score" (will only affect steps 5 and 6)
-# usage: python flexibility_score.py extract.context(.inv).sorted [--Inverse] [--Hierarchical] < phrasetable > output_file
+
# author: Rico Sennrich
+"""Add flexibility scores to a phrase table half.
+
+You usually don't have to call this script directly; to add flexibility
+scores to your model, run train-model.perl with the option
+"--flexibility-score" (will only affect steps 5 and 6).
+
+Usage:
+ python flexibility_score.py extract.context(.inv).sorted \
+ [--Inverse] [--Hierarchical] < phrasetable > output_file
+"""
+
from __future__ import division
from __future__ import unicode_literals
@@ -12,26 +21,28 @@ import sys
import gzip
from collections import defaultdict
+
class FlexScore:
def __init__(self, inverted, hierarchical):
self.inverted = inverted
self.hierarchical = hierarchical
+ def store_pt(self, obj):
+ """Store line in dictionary.
- def store_pt(self,obj):
- """store line in dictionary; if we work with inverted phrase table, swap the two phrases"""
- src,target = obj[0],obj[1]
+ If we work with inverted phrase table, swap the two phrases.
+ """
+ src, target = obj[0], obj[1]
if self.inverted:
src, target = target, src
self.phrase_pairs[src][target] = obj
-
def update_contextcounts(self, obj):
"""count the number of contexts a phrase pair occurs in"""
- src,target = obj[0],obj[1]
+ src, target = obj[0], obj[1]
self.context_counts[src][target] += 1
if obj[-1].startswith(b'<'):
self.context_counts_l[src][target] += 1
@@ -40,18 +51,21 @@ class FlexScore:
elif obj[-1].startswith(b'v'):
self.context_counts_d[src][target] += 1
else:
- sys.stderr.write(b'\nERROR in line: {0}\n'.format(b' ||| '.join(obj)))
- sys.stderr.write(b'ERROR: expecting one of \'<, >, v\' as context marker in context extract file\n')
+ sys.stderr.write(
+ b"\nERROR in line: {0}\n".format(b' ||| '.join(obj)))
+ sys.stderr.write(
+ b"ERROR: expecting one of '<, >, v' as context marker "
+ "in context extract file.\n")
raise ValueError
-
- def traverse_incrementally(self,phrasetable,flexfile):
- """traverse phrase table and phrase extract file (with context information) incrementally
- without storing all in memory."""
+ def traverse_incrementally(self, phrasetable, flexfile):
+ """Traverse phrase table and phrase extract file (with context
+ information) incrementally without storing all in memory.
+ """
increment = b''
old_increment = 1
- stack = ['']*2
+ stack = [''] * 2
# which phrase to use for sorting
sort_pt = 0
@@ -63,10 +77,10 @@ class FlexScore:
old_increment = increment
self.phrase_pairs = defaultdict(dict)
- self.context_counts = defaultdict(lambda:defaultdict(int))
- self.context_counts_l = defaultdict(lambda:defaultdict(int))
- self.context_counts_r = defaultdict(lambda:defaultdict(int))
- self.context_counts_d = defaultdict(lambda:defaultdict(int))
+ self.context_counts = defaultdict(lambda: defaultdict(int))
+ self.context_counts_l = defaultdict(lambda: defaultdict(int))
+ self.context_counts_r = defaultdict(lambda: defaultdict(int))
+ self.context_counts_d = defaultdict(lambda: defaultdict(int))
if stack[0]:
self.store_pt(stack[0])
@@ -96,30 +110,32 @@ class FlexScore:
yield 1
-
- def main(self,phrasetable,flexfile,output_object):
+ def main(self, phrasetable, flexfile, output_object):
i = 0
- sys.stderr.write('Incrementally loading phrase table and adding flexibility score...')
- for block in self.traverse_incrementally(phrasetable,flexfile):
+ sys.stderr.write(
+ "Incrementally loading phrase table "
+ "and adding flexibility score...")
+ for block in self.traverse_incrementally(phrasetable, flexfile):
self.flexprob_l = normalize(self.context_counts_l)
self.flexprob_r = normalize(self.context_counts_r)
self.flexprob_d = normalize(self.context_counts_d)
- for src in sorted(self.phrase_pairs, key = lambda x: x + b' |'):
- for target in sorted(self.phrase_pairs[src], key = lambda x: x + b' |'):
+ # TODO: Why this lambda? It doesn't affect sorting, does it?
+ sortkey = lambda x: x + b' |'
+ for src in sorted(self.phrase_pairs, key=sortkey):
+ for target in sorted(self.phrase_pairs[src], key=sortkey):
- if not i % 1000000:
+ if i % 1000000 == 0:
sys.stderr.write('.')
i += 1
- outline = self.write_phrase_table(src,target)
+ outline = self.write_phrase_table(src, target)
output_object.write(outline)
sys.stderr.write('done\n')
-
- def write_phrase_table(self,src,target):
+ def write_phrase_table(self, src, target):
line = self.phrase_pairs[src][target]
flexscore_l = b"{0:.6g}".format(self.flexprob_l[src][target])
@@ -136,7 +152,6 @@ class FlexScore:
return b' ||| '.join(line) + b'\n'
-
def normalize(d):
out_dict = defaultdict(dict)
@@ -145,7 +160,7 @@ def normalize(d):
total = sum(d[src].values())
for target in d[src]:
- out_dict[src][target] = d[src][target]/total
+ out_dict[src][target] = d[src][target] / total
return out_dict
@@ -153,7 +168,10 @@ def normalize(d):
if __name__ == '__main__':
if len(sys.argv) < 1:
- sys.stderr.write('Usage: python flexibility_score.py extract.context(.inv).sorted [--Inverse] [--Hierarchical] < phrasetable > output_file\n')
+ sys.stderr.write(
+ "Usage: "
+ "python flexibility_score.py extract.context(.inv).sorted "
+ "[--Inverse] [--Hierarchical] < phrasetable > output_file\n")
exit()
flexfile = sys.argv[1]
@@ -168,4 +186,4 @@ if __name__ == '__main__':
hierarchical = False
FS = FlexScore(inverted, hierarchical)
- FS.main(sys.stdin,gzip.open(flexfile,'r'),sys.stdout)
+ FS.main(sys.stdin, gzip.open(flexfile, 'r'), sys.stdout)
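
[Editor's note, not part of the commit: the normalize() helper touched at the end turns raw context counts into per-source conditional probabilities. A toy illustration of that function:]

from collections import defaultdict


def normalize(d):
    out_dict = defaultdict(dict)
    for src in d:
        total = sum(d[src].values())
        for target in d[src]:
            out_dict[src][target] = d[src][target] / total
    return out_dict


counts = {b"haus": {b"house": 3, b"home": 1}}
print(normalize(counts))
# defaultdict(<class 'dict'>, {b'haus': {b'house': 0.75, b'home': 0.25}})
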