github.com/moses-smt/mosesdecoder.git

author     Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 13:26:56 +0300
committer  Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 13:26:56 +0300
commit     61162dd24284baebdd407bb7dd4f28892b24fbfb (patch)
tree       a2a2807acecefa1906d648970d932409bf0e0099 /scripts
parent     c07ade81422488ba1b6a6ae5eb46132fa5ac5fec (diff)
Fix more Python lint.
Most of the complaints fixed here were from Pocketlint, but many were also from Syntastic, the vim plugin.
Diffstat (limited to 'scripts')
-rw-r--r--  scripts/ems/support/defaultconfig.py               |  85
-rwxr-xr-x  scripts/ems/support/mml-filter.py                  | 273
-rwxr-xr-x  scripts/generic/bsbleu.py                          | 199
-rw-r--r--  scripts/server/moses.py                            | 410
-rwxr-xr-x  scripts/server/sim-pe.py                           | 239
-rw-r--r--  scripts/tokenizer/pre_tokenize_cleaning.py         |  63
-rwxr-xr-x  scripts/training/filter-rule-table.py              |  44
-rwxr-xr-x  scripts/training/rdlm/average_null_embedding.py    |  24
-rwxr-xr-x  scripts/training/rdlm/extract_syntactic_ngrams.py  | 249
-rwxr-xr-x  scripts/training/rdlm/extract_vocab.py             |  76
-rwxr-xr-x  scripts/training/rdlm/train_rdlm.py                | 332
-rwxr-xr-x  scripts/training/wrappers/conll2mosesxml.py        | 113
-rwxr-xr-x  scripts/training/wrappers/mosesxml2brackets.py     |  16
13 files changed, 1217 insertions, 906 deletions
diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py
index e88b63e3d..a118e96b3 100644
--- a/scripts/ems/support/defaultconfig.py
+++ b/scripts/ems/support/defaultconfig.py
@@ -1,53 +1,48 @@
#!/usr/bin/env python2
-#
-# Version of ConfigParser which accepts default values
-#
+"""Version of ConfigParser which accepts default values."""
import ConfigParser
class Config:
- def __init__(self,filename):
- self.config = ConfigParser.SafeConfigParser()
- cfh = open(filename)
- self.config.readfp(cfh)
- cfh.close()
-
- def get(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.get(section,name)
- else:
- return default
-
- def getint(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.getint(section,name)
- else:
- return default
-
-
- def getboolean(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.getboolean(section,name)
- else:
- return default
-
-
- def getfloat(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.getfloat(section,name)
- else:
- return default
-
-
- def __str__(self):
- ret = ""
- for section in self.config.sections():
- for option in self.config.options(section):
- ret = ret + "%s:%s = %s\n" % (section,option,self.config.get(section,option))
- return ret
-
-
-
+ """Version of ConfigParser which accepts default values."""
+
+ def __init__(self, filename):
+ self.config = ConfigParser.SafeConfigParser()
+ cfh = open(filename)
+ self.config.readfp(cfh)
+ cfh.close()
+
+ def get(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.get(section, name)
+ else:
+ return default
+
+ def getint(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.getint(section, name)
+ else:
+ return default
+
+ def getboolean(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.getboolean(section, name)
+ else:
+ return default
+
+ def getfloat(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.getfloat(section, name)
+ else:
+ return default
+
+ def __str__(self):
+ ret = ""
+ for section in self.config.sections():
+ for option in self.config.options(section):
+ ret = ret + "%s:%s = %s\n" % (
+ section, option, self.config.get(section, option))
+ return ret
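For orientation, a small usage sketch (not part of the patch) of the Config wrapper above, in the same Python 2 style as the script. The file name "filter.cfg" and the "timeout" option are hypothetical; the sketch assumes the file defines a [general] section and sits next to defaultconfig.py:

    from defaultconfig import Config

    config = Config("filter.cfg")
    # Falls back to the supplied default when the option is missing;
    # without a default, the underlying ConfigParser raises as usual.
    strategy = config.get("general", "strategy", "Random")
    timeout = config.getint("general", "timeout", 30)
    print strategy, timeout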
diff --git a/scripts/ems/support/mml-filter.py b/scripts/ems/support/mml-filter.py
index 5fb43d71e..8e865c801 100755
--- a/scripts/ems/support/mml-filter.py
+++ b/scripts/ems/support/mml-filter.py
@@ -1,156 +1,171 @@
#!/usr/bin/env python2
-#
-# Filter a parallel corpus
-#
+"""Filter a parallel corpus."""
+
-import heapq
import logging
-import math
import optparse
import random
-import sys
from defaultconfig import Config
-logging.basicConfig(format = "%(asctime)-15s %(message)s")
+
+logging.basicConfig(format="%(asctime)-15s %(message)s")
log = logging.getLogger("filter")
log.setLevel(logging.DEBUG)
+
class FilterStrategy(object):
- def __init__(self,config):
- pass
+ def __init__(self, config):
+ pass
- def filter(self,source,target):
- return True
+ def filter(self, source, target):
+ return True
class RandomFilterStrategy(FilterStrategy):
- def __init__(self,config):
- self.threshold = config.getfloat("random", "threshold", 0.1)
- random.seed()
+ def __init__(self, config):
+ self.threshold = config.getfloat("random", "threshold", 0.1)
+ random.seed()
- def filter(self, source, target):
- return random.random() < self.threshold
+ def filter(self, source, target):
+ return random.random() < self.threshold
class ScoreFilterStrategy(FilterStrategy):
- """Filter strategy that is based on a file with sentence scores. There are three
- possible ways of specifying how to filter:
- i) threshold - filter all sentence pairs whose score is less than the threshold
- ii) proportion - filter all but a certain proportion (eg a tenth) of the sentences
+ """Filter strategy that is based on a file with sentence scores.
+
+ There are three possible ways of specifying how to filter:
+ i) threshold - filter all sentence pairs whose score is less than the
+ threshold.
+ ii) proportion - filter all but a certain proportion (eg a tenth) of the
+ sentences.
iii) count - filter all but a given count of the sentences.
"""
- def __init__(self,config):
- section = "score"
- self.score_file = config.get(section,"score_file")
- self.ignore_score = config.get(section, "ignore_score", "99999")
- option_names = ("threshold", "proportion", "count")
- options = [config.config.has_option(section,o) for o in option_names]
- if sum(options) != 1:
- raise RuntimeError("Must specify exactly one of %s for score filter" % str(option_names))
- if options[0]:
- # threshold
- self.threshold = config.getfloat(section,option_names[0])
- else:
- # proportion or count
- if options[2]:
- count = config.getint(section,option_names[2])
- else:
- # need to count entries
- count = 0
- ignore_count = 0
- for line in open(self.score_file):
- if line[:-1] != self.ignore_score:
- count = count + 1
- else:
- ignore_count = ignore_count + 1
- count = int(count * config.getfloat(section,option_names[1]))
- log.info("Retaining at least %d entries and ignoring %d" % (count, ignore_count))
- # Find the threshold
- self.threshold = sorted(\
- [float(line[:-1]) for line in open(self.score_file)], reverse=True)[ignore_count + count]
- #self.threshold = heapq.nlargest(count, \
- # [float(line[:-1]) for line in open(self.score_file)])[-1]
-
-
- self.sfh = open(self.score_file)
- log.info("Thresholding scores at " + str(self.threshold))
-
- def filter(self,source,target):
- score = self.sfh.readline()
- if not score:
- raise RuntimeError("score file truncated")
- return score[:-1] == self.ignore_score or float(score[:-1]) >= self.threshold
-
+
+ def __init__(self, config):
+ section = "score"
+ self.score_file = config.get(section, "score_file")
+ self.ignore_score = config.get(section, "ignore_score", "99999")
+ option_names = ("threshold", "proportion", "count")
+ options = [config.config.has_option(section, o) for o in option_names]
+ if sum(options) != 1:
+ raise RuntimeError(
+ "Must specify exactly one of %s for score filter"
+ % str(option_names))
+ if options[0]:
+ # Threshold.
+ self.threshold = config.getfloat(section, option_names[0])
+ else:
+ # proportion or count
+ if options[2]:
+ count = config.getint(section, option_names[2])
+ else:
+ # Need to count entries.
+ count = 0
+ ignore_count = 0
+ for line in open(self.score_file):
+ if line[:-1] != self.ignore_score:
+ count += 1
+ else:
+ ignore_count = ignore_count + 1
+ count = int(count * config.getfloat(section, option_names[1]))
+ log.info(
+ "Retaining at least %d entries and ignoring %d"
+ % (count, ignore_count))
+ # Find the threshold.
+ self.threshold = sorted([
+ float(line[:-1])
+ for line in open(self.score_file)],
+ reverse=True)[ignore_count + count]
+ # import heapq
+ # self.threshold = heapq.nlargest(
+ # count,
+ # [float(line[:-1]) for line in open(self.score_file)])[-1]
+
+ self.sfh = open(self.score_file)
+ log.info("Thresholding scores at " + str(self.threshold))
+
+ def filter(self, source, target):
+ score = self.sfh.readline()
+ if not score:
+ raise RuntimeError("score file truncated")
+ return (
+ score[:-1] == self.ignore_score or
+ float(score[:-1]) >= self.threshold
+ )
+
def main():
- parser = optparse.OptionParser(usage = "Usage: %prog [options] config-file")
- (options,args) = parser.parse_args()
- if len(args) < 1:
- parser.error("No configuration file specified")
-
- log.info("Loading configuration from " + args[0])
- config = Config(args[0])
- log.debug("Configuration:\n" + str(config))
-
- # Required general parameters
- source_lang = config.get("general", "source_language")
- target_lang = config.get("general", "target_language")
- input_stem = config.get("general", "input_stem")
- output_stem = config.get("general", "output_stem")
- strategy = config.get("general", "strategy", "")
-
- # Optional general parameters
- alignment_stem = config.get("general", "alignment_stem", "")
- alignment_type = config.get("general", "alignment_type", "grow-diag-final-and")
- domain_file_in = config.get("general", "domain_file", "")
- domain_file_out = config.get("general", "domain_file_out", "")
-
- strategy_class = globals()[strategy + "FilterStrategy"]
- strategy = strategy_class(config)
-
- source_input_fh = open(input_stem + "." + source_lang)
- target_input_fh = open(input_stem + "." + target_lang)
- source_output_fh = open(output_stem + "." + source_lang, "w")
- target_output_fh = open(output_stem + "." + target_lang, "w")
-
- alignment_input_fh = None
- alignment_output_fh = None
- if alignment_stem:
- alignment_input_fh = open(alignment_stem + "." + alignment_type)
- alignment_output_fh = open(output_stem + "." + alignment_type,"w")
-
- domain_boundaries = {}
- if domain_file_in:
- dfh = open(domain_file_in)
- for line in dfh:
- line_no,name = line[:-1].split()
- domain_boundaries[int(line_no)] = name
-
- domain_output_fh = None
- if domain_file_out:
- domain_output_fh = open(domain_file_out, "w")
-
- #log.info(str(domain_boundaries))
-
- retained = 0
- line_no = 0
- for source_line in source_input_fh:
- target_line = target_input_fh.readline()
- if alignment_input_fh:
- align_line = alignment_input_fh.readline()
- if strategy.filter(source_line,target_line):
- retained = retained + 1
- print>>source_output_fh, source_line,
- print>>target_output_fh, target_line,
- if alignment_input_fh:
- print>>alignment_output_fh, align_line,
- line_no = line_no + 1
- # check if this is a domain boundary
- if domain_boundaries and domain_boundaries.has_key(line_no):
- print>>domain_output_fh,"%d %s" % (retained,domain_boundaries[line_no])
- log.info("Lines retained: %d" % retained)
+ parser = optparse.OptionParser(usage="Usage: %prog [options] config-file")
+ (options, args) = parser.parse_args()
+ if len(args) < 1:
+ parser.error("No configuration file specified")
+
+ log.info("Loading configuration from " + args[0])
+ config = Config(args[0])
+ log.debug("Configuration:\n" + str(config))
+
+ # Required general parameters
+ source_lang = config.get("general", "source_language")
+ target_lang = config.get("general", "target_language")
+ input_stem = config.get("general", "input_stem")
+ output_stem = config.get("general", "output_stem")
+ strategy = config.get("general", "strategy", "")
+
+ # Optional general parameters
+ alignment_stem = config.get("general", "alignment_stem", "")
+ alignment_type = config.get(
+ "general", "alignment_type", "grow-diag-final-and")
+ domain_file_in = config.get("general", "domain_file", "")
+ domain_file_out = config.get("general", "domain_file_out", "")
+
+ strategy_class = globals()[strategy + "FilterStrategy"]
+ strategy = strategy_class(config)
+
+ source_input_fh = open(input_stem + "." + source_lang)
+ target_input_fh = open(input_stem + "." + target_lang)
+ source_output_fh = open(output_stem + "." + source_lang, "w")
+ target_output_fh = open(output_stem + "." + target_lang, "w")
+
+ alignment_input_fh = None
+ alignment_output_fh = None
+ if alignment_stem:
+ alignment_input_fh = open(alignment_stem + "." + alignment_type)
+ alignment_output_fh = open(output_stem + "." + alignment_type, "w")
+
+ domain_boundaries = {}
+ if domain_file_in:
+ dfh = open(domain_file_in)
+ for line in dfh:
+ line_no, name = line[:-1].split()
+ domain_boundaries[int(line_no)] = name
+
+ domain_output_fh = None
+ if domain_file_out:
+ domain_output_fh = open(domain_file_out, "w")
+
+ # log.info(str(domain_boundaries))
+
+ retained = 0
+ line_no = 0
+ for source_line in source_input_fh:
+ target_line = target_input_fh.readline()
+ if alignment_input_fh:
+ align_line = alignment_input_fh.readline()
+ if strategy.filter(source_line, target_line):
+ retained = retained + 1
+ print>>source_output_fh, source_line,
+ print>>target_output_fh, target_line,
+ if alignment_input_fh:
+ print>>alignment_output_fh, align_line,
+ line_no = line_no + 1
+ # Check if this is a domain boundary.
+ if domain_boundaries and line_no in domain_boundaries:
+ print >>domain_output_fh, (
+ "%d %s" % (retained, domain_boundaries[line_no]))
+ log.info("Lines retained: %d", retained)
+
if __name__ == "__main__":
- main()
+ main()
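The script picks its filter class by name from the configuration. A minimal self-contained sketch of that lookup follows; the make_strategy helper and the example sentences are illustrative and not part of mml-filter.py:

    import random


    class RandomFilterStrategy(object):
        def __init__(self, threshold=0.1):
            self.threshold = threshold
            random.seed()

        def filter(self, source, target):
            # Keep roughly `threshold` of all sentence pairs.
            return random.random() < self.threshold


    def make_strategy(name, **kwargs):
        # mml-filter.py does the equivalent with
        # globals()[strategy + "FilterStrategy"](config).
        return globals()[name + "FilterStrategy"](**kwargs)


    strategy = make_strategy("Random", threshold=0.1)
    print strategy.filter("ein satz .\n", "a sentence .\n")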
diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py
index ff86fed5e..12d2201de 100755
--- a/scripts/generic/bsbleu.py
+++ b/scripts/generic/bsbleu.py
@@ -2,73 +2,73 @@
# compute Bleu scores with confidence intervals via bootstrap resampling
# written by Ulrich Germann
-import math,sys,os
from argparse import ArgumentParser
-from operator import itemgetter
-from random import randint
-from operator import itemgetter
+import math
+import os
+from random import randint
+import sys
-def count_ngrams(snt,max_n):
+
+def count_ngrams(snt, max_n):
"""
- Return a dictionary of ngram counts (up to length /max_n/)
- for sentence (list of words) /snt/.
+ Return a dictionary of ngram counts (up to length /max_n/)
+ for sentence (list of words) /snt/.
"""
ret = {}
for i in xrange(len(snt)):
- for k in xrange(i+1,min(i+max_n+1,len(snt)+1)):
+ for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
key = tuple(snt[i:k])
- ret[key] = ret.get(key,0) + 1
- pass
- pass
+ ret[key] = ret.get(key, 0) + 1
return ret
-def max_counts(ng1,ng2):
+
+def max_counts(ng1, ng2):
"""
- Return a dicitonary of ngram counts such that
+    Return a dictionary of ngram counts such that
each count is the greater of the two individual counts
- for each ngram in the input ngram count dictionaries
+ for each ngram in the input ngram count dictionaries
/ng1/ and /ng2/.
"""
ret = ng1.copy()
- for k,v in ng2.items():
- ret[k] = max(ret.get(k,0),v)
- pass
+ for k, v in ng2.items():
+ ret[k] = max(ret.get(k, 0), v)
return ret
-def ng_hits(hyp,ref,max_n):
+
+def ng_hits(hyp, ref, max_n):
"""
- return a list of ngram counts such that each ngram count
- is the minimum of the counts in hyp and ref, up to ngram
- length /max_n/
+ Return a list of ngram counts such that each ngram count
+ is the minimum of the counts in hyp and ref, up to ngram
+ length /max_n/.
"""
ret = [0 for i in xrange(max_n)]
- for ng,cnt in hyp.items():
+ for ng, cnt in hyp.items():
k = ng
if len(k) <= max_n:
- ret[len(k)-1] += min(cnt,ref.get(ng,0))
- pass
- pass
+ ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
return ret
+
class BleuScore:
- def __init__(self,hyp,ref,max_n=4,bootstrap=1000):
- # print len(hyp.ngrams),len(ref.ngrams),"X"
- self.hits = [ng_hits(hyp.ngrams[i],ref.ngrams[i],max_n)
- for i in xrange(len(hyp.ngrams))]
- self.max_n = max_n
- self.hyp = hyp
- self.ref = ref
- self.lower = None
- self.upper = None
+ def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
+ # print len(hyp.ngrams), len(ref.ngrams), "X"
+ self.hits = [
+ ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
+ for i in xrange(len(hyp.ngrams))]
+ self.max_n = max_n
+ self.hyp = hyp
+ self.ref = ref
+ self.lower = None
+ self.upper = None
self.median = None
- self.bootstrap = [self.score([randint(0,len(hyp.snt)-1) for s in hyp.snt])
- for i in xrange(1000)]
+ self.bootstrap = [
+ self.score([randint(0, len(hyp.snt) - 1) for s in hyp.snt])
+ for i in xrange(1000)]
self.bootstrap.sort()
self.actual = self.score([i for i in xrange(len(hyp.snt))])
- return
-
- def score(self,sample):
- hits = [0 for i in xrange(self.max_n)]
+
+ def score(self, sample):
+ hits = [0 for i in xrange(self.max_n)]
self.hyplen = 0
self.reflen = 0
for i in sample:
@@ -76,94 +76,89 @@ class BleuScore:
self.reflen += len(self.ref.snt[i])
for n in xrange(self.max_n):
hits[n] += self.hits[i][n]
- pass
- pass
- self.prec = [float(hits[n])/(self.hyplen-n*len(sample))
+ self.prec = [float(hits[n]) / (self.hyplen - n * len(sample))
for n in xrange(self.max_n)]
- ret = sum([math.log(x) for x in self.prec])/self.max_n
- self.BP = min(1,math.exp(1.-float(self.reflen)/float(self.hyplen)))
+ ret = sum([math.log(x) for x in self.prec]) / self.max_n
+ self.BP = min(
+ 1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
ret += math.log(self.BP)
return math.exp(ret)
-
+
+
class Document:
- def __init__(self,fname=None):
+ def __init__(self, fname=None):
self.fname = fname
if fname:
self.snt = [line.strip().split() for line in open(fname)]
- self.ngrams = [count_ngrams(snt,4) for snt in self.snt]
+ self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
else:
self.snt = None
self.ngrams = None
- pass
- return
- def merge(self,R):
+ def merge(self, R):
self.fname = "multi-ref"
self.ngrams = [x for x in R[0].ngrams]
self.snt = [x for x in R[0].snt]
for i in xrange(len(R[0].ngrams)):
- for k in xrange(1,len(R)):
- self.ngrams[i] = max_counts(self.ngrams[i],R[k].ngrams[i])
- pass
- pass
- return
-
- def update(self,hyp,R):
- for i in xrange(len(hyp.snt)):
- clen = len(hyp.snt[i])
+ for k in xrange(1, len(R)):
+ self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
+
+ def update(self, hyp, R):
+ for i, hyp_snt in enumerate(hyp.snt):
+ clen = len(hyp_snt)
K = 0
- for k in xrange(1,len(R)):
- assert len(R[k].snt) == len(hyp.snt),\
- "Mismatch in numer of sentences " +\
- "between reference and candidate"
- if abs(len(R[k].snt[i]) - clen) == abs(len(R[K].snt[i]) - clen):
- if len(R[k].snt[i]) < len(R[K].snt[i]):
+ for k in xrange(1, len(R)):
+ k_snt = R[k].snt[i]
+ assert len(R[k].snt) == len(hyp.snt), (
+ "Mismatch in number of sentences " +
+ "between reference and candidate")
+ if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
+ if len(k_snt) < len(R[K].snt[i]):
K = k
- pass
- pass
- elif abs(len(R[k].snt[i]) - clen) < abs(len(R[K].snt[i]) - clen):
+ elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
K = k
- pass
- pass
self.snt[i] = R[K].snt[i]
- pass
- return
-
- pass
+
if __name__ == "__main__":
argparser = ArgumentParser()
- argparser.add_argument("-r","--ref",nargs='+',help="reference translation(s)")
- argparser.add_argument("-c","--cand",nargs='+',help="candidate translations")
- argparser.add_argument("-i","--individual",action='store_true',
- help="compute BLEU scores for individual references")
- argparser.add_argument("-b","--bootstrap",type=int,default=1000,
- help="sample size for bootstrap resampling")
- argparser.add_argument("-a","--alpha",help="1-alpha = confidence interval",type=float,default=.05)
+ argparser.add_argument(
+ "-r", "--ref", nargs='+', help="Reference translation(s).")
+ argparser.add_argument(
+ "-c", "--cand", nargs='+', help="Candidate translations.")
+ argparser.add_argument(
+ "-i", "--individual", action='store_true',
+ help="Compute BLEU scores for individual references.")
+ argparser.add_argument(
+ "-b", "--bootstrap", type=int, default=1000,
+ help="Sample size for bootstrap resampling.")
+ argparser.add_argument(
+ "-a", "--alpha", type=float, default=.05,
+ help="1-alpha = confidence interval.")
args = argparser.parse_args(sys.argv[1:])
- R = [ Document(fname) for fname in args.ref]
- C = [ Document(fname) for fname in args.cand]
- Rx = Document() # for multi-reference BLEU
+ R = [Document(fname) for fname in args.ref]
+ C = [Document(fname) for fname in args.cand]
+ Rx = Document() # for multi-reference BLEU
Rx.merge(R)
for c in C:
# compute multi-reference BLEU
- Rx.update(c,R)
- bleu = BleuScore(c,Rx,bootstrap=args.bootstrap)
- print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s"%\
- (100*bleu.actual,
- os.path.basename(Rx.fname),
- 100*bleu.bootstrap[int((args.alpha/2)*args.bootstrap)],
- 100*bleu.bootstrap[int((1-(args.alpha/2))*args.bootstrap)],
- 100*bleu.bootstrap[int(.5*args.bootstrap)],
- c.fname) # os.path.basename(c.fname))
+ Rx.update(c, R)
+ bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
+ print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
+ 100 * bleu.actual,
+ os.path.basename(Rx.fname),
+ 100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
+ 100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
+ 100 * bleu.bootstrap[int(.5 * args.bootstrap)],
+ c.fname) # os.path.basename(c.fname))
if args.individual:
for r in R:
- bleu = BleuScore(c,r,bootstrap=args.bootstrap)
- print " %5.2f %s"%(100*bleu.actual,os.path.basename(r.fname))
- # print bleu.prec,bleu.hyplen,bleu.reflen,bleu.BP
- pass
- pass
-
- # print [sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) for n in xrange(4)]
- pass
+ bleu = BleuScore(c, r, bootstrap=args.bootstrap)
+ print " %5.2f %s" % (
+ 100 * bleu.actual, os.path.basename(r.fname))
+ # print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
+
+ # print [
+ # sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
+ # for n in xrange(4)]
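A self-contained sketch of the n-gram counting that bsbleu.py builds its BLEU statistics from (same logic as count_ngrams above, shown here with a toy sentence):

    def count_ngrams(snt, max_n):
        # Dictionary mapping each n-gram (as a tuple of words) to its count.
        ret = {}
        for i in xrange(len(snt)):
            for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
                key = tuple(snt[i:k])
                ret[key] = ret.get(key, 0) + 1
        return ret


    hyp = "the cat sat on the mat".split()
    counts = count_ngrams(hyp, 2)
    print counts[("the",)]        # 2
    print counts[("the", "cat")]  # 1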
diff --git a/scripts/server/moses.py b/scripts/server/moses.py
index a176c473a..7cf152187 100644
--- a/scripts/server/moses.py
+++ b/scripts/server/moses.py
@@ -1,237 +1,225 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Python utilities for moses
-#
-# This package mostly wraps standard Moses utilities into pipes.
-#
-# Written by Ulrich Germann
-#
-# This package borrows from scripts written by Christian Buck
-#
-# The package assumes that there is a complete moses installation
-# (including scripts) under one root directory,
-# e.g., via
-# bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
-# By default, this root directory is "${HOME}/moses".
-
-import xmlrpclib,datetime,argparse,time,os,sys
-from subprocess import *
-from unicodedata import normalize
-
-moses_root = os.environ.get('MOSES_ROOT',os.environ.get('HOME')+"/moses")
+"""
+Python utilities for moses
+
+This package mostly wraps standard Moses utilities into pipes.
+
+Written by Ulrich Germann
+
+This package borrows from scripts written by Christian Buck
+
+The package assumes that there is a complete moses installation
+(including scripts) under one root directory,
+e.g., via ::
+ bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
+By default, this root directory is "${HOME}/moses".
+"""
+
+import os
+import sys
+import time
+import xmlrpclib
+from subprocess import (
+ PIPE,
+ Popen,
+ )
+
+
+moses_root = os.environ.get('MOSES_ROOT', os.environ.get('HOME') + "/moses")
+
class ProcessWrapper:
- def __init__(self,cmd=[]):
- self.process = None
- self.cmd = cmd
- return
+ def __init__(self, cmd=[]):
+ self.process = None
+ self.cmd = cmd
+
+ def start(self, stdin=PIPE, stdout=PIPE):
+ if self.process:
+ raise Exception("Process is already running")
+ self.process = Popen(self.cmd, stdin=stdin, stdout=stdout)
- def start(self, stdin=PIPE, stdout=PIPE):
- if self.process:
- raise Exception("Process is already running")
- self.process = Popen(self.cmd, stdin = stdin, stdout = stdout)
- return
+ def __del__(self):
+ if self.process:
+ self.process.terminate()
- def __del__(self):
- if self.process:
- self.process.terminate()
- pass
- return
- pass
class LineProcessor(ProcessWrapper):
- def __call__(self,input):
- if not self.process: self.start()
- self.process.stdin.write("%s\n"%input.strip())
- self.process.stdin.flush()
- return self.process.stdout.readline().strip()
- pass
+ def __call__(self, input):
+ if not self.process:
+ self.start()
+ self.process.stdin.write("%s\n" % input.strip())
+ self.process.stdin.flush()
+ return self.process.stdout.readline().strip()
+
class SentenceSplitter(ProcessWrapper):
- """
- Wrapper for standard Moses sentence splitter
- """
- def __init__(self,lang):
- ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl"
- self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang]
- self.process = None
- return
-
- def __call__(self,input):
- if not self.process:
- self.start()
- pass
- self.process.stdin.write(input.strip() + "\n<P>\n")
- self.process.stdin.flush()
- x = self.process.stdout.readline().strip()
- ret = []
- while x != '<P>' and x != '':
- ret.append(x)
- x = self.process.stdout.readline().strip()
- pass
- return ret
+ """Wrapper for standard Moses sentence splitter."""
+
+ def __init__(self, lang):
+ ssplit_cmd = moses_root + "/scripts/ems/support/split-sentences.perl"
+ self.cmd = [ssplit_cmd, "-b", "-q", "-l", lang]
+ self.process = None
+
+ def __call__(self, input):
+ if not self.process:
+ self.start()
+ self.process.stdin.write(input.strip() + "\n<P>\n")
+ self.process.stdin.flush()
+ x = self.process.stdout.readline().strip()
+ ret = []
+ while x != '<P>' and x != '':
+ ret.append(x)
+ x = self.process.stdout.readline().strip()
+ return ret
+
class Pretokenizer(LineProcessor):
- """
- Pretokenizer wrapper; the pretokenizer fixes known issues with the input.
- """
- def __init__(self,lang):
- pretok_cmd = moses_root+"/scripts/tokenizer/pre-tokenizer.perl"
- self.cmd = [pretok_cmd,"-b", "-q", "-l",lang]
- self.process = None
- return
- pass
+ """Pretokenizer wrapper.
+
+ The pretokenizer fixes known issues with the input.
+ """
+ def __init__(self, lang):
+ pretok_cmd = moses_root + "/scripts/tokenizer/pre-tokenizer.perl"
+ self.cmd = [pretok_cmd, "-b", "-q", "-l", lang]
+ self.process = None
+
class Tokenizer(LineProcessor):
- """
- Tokenizer wrapper; the pretokenizer fixes known issues with the input.
- """
- def __init__(self,lang,args=["-a","-no-escape"]):
- tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl"
- self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args
- self.process = None
- return
-
+ """Tokenizer wrapper.
+
+    Wraps the standard Moses tokenizer (tokenizer.perl).
+ """
+ def __init__(self, lang, args=["-a", "-no-escape"]):
+ tok_cmd = moses_root + "/scripts/tokenizer/tokenizer.perl"
+ self.cmd = [tok_cmd, "-b", "-q", "-l", lang] + args
+ self.process = None
+
+
class Truecaser(LineProcessor):
- """
- Truecaser wrapper.
- """
- def __init__(self,model):
- truecase_cmd = moses_root+"/scripts/recaser/truecase.perl"
- self.cmd = [truecase_cmd,"-b", "--model",model]
- self.process = None
- return
- pass
+ """Truecaser wrapper."""
+ def __init__(self, model):
+ truecase_cmd = moses_root + "/scripts/recaser/truecase.perl"
+ self.cmd = [truecase_cmd, "-b", "--model", model]
+ self.process = None
+
class LineProcessorPipeline:
- """
- Line processor: one line in, one line out
- """
- def __init__(self,parts=[]):
- self.chain = [LineProcessor(p.cmd) for p in parts]
- return
-
- def start(self):
- if len(self.chain) == 0:
- return
- if self.chain[0].process:
- return
- self.chain[0].start()
- for i in xrange(1,len(self.chain)):
- self.chain[i].start(stdin = self.chain[i-1].process.stdout)
- pass
- return
-
- def __call__(self,input):
- if len(self.chain) == 0:
- return input
- self.start()
- self.chain[0].process.stdin.write("%s\n"%input.strip())
- self.chain[0].process.stdin.flush()
- return self.chain[0].process.stdout.readline().strip()
-
- pass
+ """Line processor: one line in, one line out."""
+ def __init__(self, parts=[]):
+ self.chain = [LineProcessor(p.cmd) for p in parts]
+
+ def start(self):
+ if len(self.chain) == 0:
+ return
+ if self.chain[0].process:
+ return
+ self.chain[0].start()
+ for i in xrange(1, len(self.chain)):
+ self.chain[i].start(stdin=self.chain[i - 1].process.stdout)
+
+ def __call__(self, input):
+ if len(self.chain) == 0:
+ return input
+ self.start()
+ self.chain[0].process.stdin.write("%s\n" % input.strip())
+ self.chain[0].process.stdin.flush()
+ return self.chain[0].process.stdout.readline().strip()
-def find_free_port(p):
- """
- Find a free port, starting at /p/.
- Return the free port, or False if none found.
- """
- ret = p
- while ret - p < 20:
- devnull = open(os.devnull,"w")
- n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull)
- if n.communicate()[0].find(":%d "%ret) < 0:
- return p
- ret += 1
- pass
- return False
-class MosesServer(ProcessWrapper):
+def find_free_port(p):
+ """Find a free port, starting at /p/.
- def __init__(self,args=[]):
- self.process = None
- mserver_cmd = moses_root+"/bin/mosesserver"
- self.cmd = [mserver_cmd] + args
- self.url = None
- self.proxy = None
- return
-
- def start(self,config=None,args=[],port=7447,debug=False):
- self.cmd.extend(args)
- if config:
- if "-f" in args:
- raise Exception("Config file specified twice")
- else:
- self.cmd.extend(["-f",config])
- pass
- pass
- self.port = port # find_free_port(port)
- if not self.port:
- raise Excpetion("Cannot find free port for moses server!")
- self.cmd.extend(["--server-port", "%d"%self.port])
- if debug:
- print >>sys.stderr,self.cmd
- # self.stderr = open("mserver.%d.stderr"%self.port,'w')
- # self.stdout = open("mserver.%d.stdout"%self.port,'w')
- # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
- self.process = Popen(self.cmd)
- else:
- devnull = open(os.devnull,"w")
- self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
- pass
-
- if self.process.poll():
- raise Exception("FATAL ERROR: Could not launch moses server!")
- if debug:
- print >>sys.stderr,"MOSES port is %d."%self.port
- print >>sys.stderr,"Moses poll status is", self.process.poll()
- pass
-
- self.url = "http://localhost:%d/RPC2"%self.port
- self.connect(self.url)
-
- return True
-
- def connect(self,url):
- if url[:4] != "http": url = "http://%s"%url
- if url[-5:] != "/RPC2": url += "/RPC2"
- self.url = url
- self.proxy = xmlrpclib.ServerProxy(self.url)
- return
-
- def translate(self,input):
- attempts = 0
- while attempts < 100:
- try:
- if type(input) is unicode:
- # if the server does not expect unicode, provide a
- # properly encoded string!
- param = {'text': input.strip().encode('utf8')}
- return self.proxy.translate(param)['text'].decode('utf8')
-
- elif type(input) is str:
- param = {'text': input.strip()}
- return self.proxy.translate(param)['text']
-
- elif type(input) is list:
- return [self.translate(x) for x in input]
-
- elif type(input) is dict:
- return self.proxy.translate(input)
+ :return: The free port, or False if none found.
+ """
+ ret = p
+ while ret - p < 20:
+ devnull = open(os.devnull, "w")
+ n = Popen(["netstat", "-tnp"], stdout=PIPE, stderr=devnull)
+ if n.communicate()[0].find(":%d " % ret) < 0:
+ return p
+ ret += 1
+ return False
- else:
- raise Exception("Can't handle input of this type!")
- except:
- attempts += 1
- print >>sys.stderr, "WAITING", attempts
- time.sleep(1)
- pass
- pass
- raise Exception("Translation request failed")
- pass
+class MosesServer(ProcessWrapper):
+ def __init__(self, args=[]):
+ self.process = None
+ mserver_cmd = moses_root + "/bin/mosesserver"
+ self.cmd = [mserver_cmd] + args
+ self.url = None
+ self.proxy = None
+
+ def start(self, config=None, args=[], port=7447, debug=False):
+ self.cmd.extend(args)
+ if config:
+ if "-f" in args:
+ raise Exception("Config file specified twice")
+ else:
+ self.cmd.extend(["-f", config])
+ self.port = port # find_free_port(port)
+ if not self.port:
+ raise Exception("Cannot find free port for moses server!")
+ self.cmd.extend(["--server-port", "%d" % self.port])
+ if debug:
+ print >>sys.stderr, self.cmd
+ # self.stderr = open("mserver.%d.stderr"%self.port,'w')
+ # self.stdout = open("mserver.%d.stdout"%self.port,'w')
+ # self.process = Popen(
+ # self.cmd, stderr=self.stderr, stdout=self.stdout)
+ self.process = Popen(self.cmd)
+ else:
+ devnull = open(os.devnull, "w")
+ self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
+
+ if self.process.poll():
+ raise Exception("FATAL ERROR: Could not launch moses server!")
+ if debug:
+ print >>sys.stderr, "MOSES port is %d." % self.port
+ print >>sys.stderr, "Moses poll status is", self.process.poll()
+
+ self.url = "http://localhost:%d/RPC2" % self.port
+ self.connect(self.url)
+
+ return True
+
+ def connect(self, url):
+ if url[:4] != "http":
+ url = "http://%s" % url
+ if url[-5:] != "/RPC2":
+ url += "/RPC2"
+ self.url = url
+ self.proxy = xmlrpclib.ServerProxy(self.url)
+
+ def translate(self, input):
+ attempts = 0
+ while attempts < 100:
+ try:
+ if type(input) is unicode:
+ # If the server does not expect unicode, provide a
+ # properly encoded string!
+ param = {'text': input.strip().encode('utf8')}
+ return self.proxy.translate(param)['text'].decode('utf8')
+
+ elif type(input) is str:
+ param = {'text': input.strip()}
+ return self.proxy.translate(param)['text']
+
+ elif type(input) is list:
+ return [self.translate(x) for x in input]
+
+ elif type(input) is dict:
+ return self.proxy.translate(input)
+
+ else:
+ raise Exception("Can't handle input of this type!")
+
+ except:
+ attempts += 1
+ print >>sys.stderr, "WAITING", attempts
+ time.sleep(1)
+ raise Exception("Translation request failed")
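A hedged usage sketch for the wrappers in moses.py: the moses.ini path and the input sentences are placeholders, and a full Moses installation under MOSES_ROOT is assumed, as the module docstring requires:

    import moses

    # Launch a mosesserver instance and send it a tokenized sentence.
    server = moses.MosesServer()
    server.start(config="/path/to/moses.ini", port=7447, debug=True)
    print server.translate("das ist ein test .")

    # The line-processor wrappers pipe text through the standard scripts.
    tokenize = moses.Tokenizer("en")
    print tokenize("Hello, world!")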
diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py
index 52d1e314a..5f1407524 100755
--- a/scripts/server/sim-pe.py
+++ b/scripts/server/sim-pe.py
@@ -5,29 +5,39 @@
# This script simulates post-editing of MT output and incrementally
# updates the dynamic phrase tables in the moses server.
-import xmlrpclib,datetime,argparse,sys,os,time
+import argparse
+import os
+import sys
+import time
+import xmlrpclib
import moses
-from moses import MosesServer
-from subprocess import *
+from subprocess import (
+ PIPE,
+ Popen,
+ )
+
+
mserver = moses.MosesServer()
# We must perform some custom argument processing, as moses parameter
# specifications do not comply with the standards used in standard
# argument parsing packages; an isolated double dash separates script
# arguments from moses arguments
+
+
def split_args(all_args):
"""
Split argument list all_args into arguments specific to this script and
- arguments relating to the moses server. An isolated double dash acts as
- the separator between the two types of arguments.
+ arguments relating to the moses server. An isolated double dash acts as
+ the separator between the two types of arguments.
"""
my_args = []
mo_args = []
arglist = mo_args
i = 0
- # IMPORTANT: the code below must be coordinated with
+ # IMPORTANT: the code below must be coordinated with
# - the evolution of moses command line arguments
- # - mert-moses.pl
+ # - mert-moses.pl
while i < len(all_args):
# print i,"MY_ARGS", my_args
# print i,"MO_ARGS", mo_args
@@ -36,14 +46,16 @@ def split_args(all_args):
elif all_args[i] == "--]":
arglist = mo_args
elif all_args[i] == "-i" or all_args[i] == "-input-file":
- my_args.extend(["-i",all_args[i+1]])
+ my_args.extend(["-i", all_args[i + 1]])
i += 1
elif all_args[i] == "-inputtype":
- if all_args[i+1] != "0":
- # not yet supported! Therefore:
- errmsg = "FATAL ERROR: %s "%sys.argv[0]
- errmsg += "only supports plain text input at this point."
- raise Exception(errsmg)
+ if all_args[i + 1] != "0":
+ # Not yet supported! Therefore:
+ errmsg = (
+ "FATAL ERROR: "
+ "%s only supports plain text input at this point."
+ % sys.argv[0])
+ raise Exception(errmsg)
# my_args.extend(["--input-type",all_args[i+1]])
i += 1
elif all_args[i] == "-lattice-samples":
@@ -52,13 +64,14 @@ def split_args(all_args):
# mo_args[i:i+3] = []
# i += 2
# This is not yet supported! Therefore:
- errmsg = "FATAL ERROR: %s "%sys.argv[0]
- errmsg += "does not yet support lattice sampling."
- raise Exception(errsmg)
-
+ errmsg = (
+ "FATAL ERROR: %s does not yet support lattice sampling."
+ % sys.argv[0])
+ raise Exception(errmsg)
+
elif all_args[i] == "-n-best-list":
- my_args.extend(["--nbest",all_args[i+2]])
- my_args.extend(["--nbest-file",all_args[i+1]])
+ my_args.extend(["--nbest", all_args[i + 2]])
+ my_args.extend(["--nbest-file", all_args[i + 1]])
i += 2
elif all_args[i] == "-n-best-distinct":
@@ -70,128 +83,148 @@ def split_args(all_args):
i += 1
pass
- return my_args,mo_args
-
+ return my_args, mo_args
+
+
def interpret_args(my_args):
"""
Parse script-specific argument list.
"""
aparser = argparse.ArgumentParser()
- aparser.add_argument("-s","--server-cmd",default="mosesserver",
- dest="servercmd", help="path to moses server command")
- aparser.add_argument("--url",help="URL of external moses server.")
- aparser.add_argument("-p","--port", type=int, default=7447,
- help="port number to be used for server")
-
- # input / output
- aparser.add_argument("-i","--input",help="source file",default="-")
- aparser.add_argument("-r","--ref",help="reference translation",default=None)
- aparser.add_argument("-a","--aln",help="alignment",default=None)
- aparser.add_argument("-o","--output",default="-",help="output file")
- aparser.add_argument("-d","--debug",action="store_true",help="debug mode")
-
- # moses reporting options
- aparser.add_argument("-A","--with-alignment", dest="A",
- help="include alignment in output", action="store_true")
- aparser.add_argument("-G","--with-graph",type=bool, default=False, dest="G",
- help="include search graph info in output")
- aparser.add_argument("-T","--with-transopt",type=bool, default=False, dest = "T",
- help="include translation options info in output")
- aparser.add_argument("-F","--report-all-factors", action="store_true",dest="F",
- help="report all factors")
- aparser.add_argument("-n","--nbest",type=int,dest="nbest",default=0,
- help="size of nbest list")
- aparser.add_argument("-N","--nbest-file",dest="nbestFile",default=0,
- help="output file for nbest list")
- aparser.add_argument("-u","--nbest-distinct",type=bool,dest="U",default=False,
- help="report all factors")
+ aparser.add_argument(
+ "-s", "--server-cmd", default="mosesserver", dest="servercmd",
+ help="Path to moses server command.")
+ aparser.add_argument(
+ "--url", help="URL of external moses server.")
+ aparser.add_argument(
+ "-p", "--port", type=int, default=7447,
+ help="Port number to be used for server.")
+
+ # Input / output.
+ aparser.add_argument(
+ "-i", "--input", default='-', help="source file")
+ aparser.add_argument(
+ "-r", "--ref", default=None, help="Reference translation.")
+ aparser.add_argument(
+ "-a", "--aln", default=None, help="Alignment.")
+ aparser.add_argument(
+ "-o", "--output", default="-", help="Output file.")
+ aparser.add_argument(
+ "-d", "--debug", action='store_true', help="Debug mode.")
+
+ # Moses reporting options.
+ aparser.add_argument(
+ "-A", "--with-alignment", dest="A", action='store_true',
+ help="Include alignment in output.")
+ aparser.add_argument(
+ "-G", "--with-graph", type=bool, default=False, dest="G",
+ help="Include search graph info in output.")
+ aparser.add_argument(
+ "-T", "--with-transopt", type=bool, default=False, dest="T",
+ help="Include translation options info in output.")
+ aparser.add_argument(
+ "-F", "--report-all-factors", action="store_true", dest="F",
+ help="Report all factors.")
+ aparser.add_argument(
+ "-n", "--nbest", type=int, dest="nbest", default=0,
+ help="Size of nbest list.")
+ aparser.add_argument(
+ "-N", "--nbest-file", dest="nbestFile", default=0,
+ help="Output file for nbest list.")
+ aparser.add_argument(
+ "-u", "--nbest-distinct", type=bool, dest="U", default=False,
+ help="Report all factors.")
return aparser.parse_args(my_args)
-
+
+
def translate(proxy, args, line):
if type(line) is unicode:
- param = { 'text' : line.strip().encode('utf8') }
+ param = {'text': line.strip().encode('utf8')}
elif type(line) is str:
- param = { 'text' : line.strip() }
+ param = {'text': line.strip()}
else:
raise Exception("Can't handle input")
- if args.A: param['align'] = True
- if args.T: param['topt'] = True
- if args.F: param['report-all-factors'] = True
- if args.nbest:
+ if args.A:
+ param['align'] = True
+ if args.T:
+ param['topt'] = True
+ if args.F:
+ param['report-all-factors'] = True
+ if args.nbest:
param['nbest'] = int(args.nbest)
param['add-score-breakdown'] = True
pass
- if args.U:
+ if args.U:
param['nbest-distinct'] = True
pass
attempts = 0
while attempts < 20:
t1 = time.time()
try:
- return proxy.translate(param)
+ return proxy.translate(param)
# except xmlrpclib.Fault as e:
# except xmlrpclib.ProtocolError as e:
# except xmlrpclib.ResponseError as e:
except xmlrpclib.Error as e:
- time.sleep(2) # give all the stderr stuff a chance to be flushed
- print >>sys.stderr," XMLRPC error:",e
+ sys.stderr.flush()
+ print >>sys.stderr, " XMLRPC error:", e
print >>sys.stderr, "Input was"
print >>sys.stderr, param
sys.exit(1)
except IOError as e:
- print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
+ print >>sys.stderr, (
+ "I/O error({0}): {1}".format(e.errno, e.strerror))
time.sleep(5)
except:
serverstatus = mserver.process.poll()
- if serverstatus == None:
- print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
+ if serverstatus is None:
+ print >>sys.stderr, (
+ "Connection failed after %f seconds" % (time.time() - t1))
attempts += 1
if attempts > 10:
time.sleep(10)
else:
time.sleep(5)
- pass
else:
-
- print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
- %(serverstatus/256,serverstatus%256)
+ print >>sys.stderr, (
+ "Oopsidaisy, server exited with code %d (signal %d)"
+ % (serverstatus / 256, serverstatus % 256))
pass
pass
pass
raise Exception("Exception: could not reach translation server.")
-
+
def read_data(fname):
"""
Read and return data (source, target or alignment) from file fname.
"""
if fname[-3:] == ".gz":
- foo = Popen(["zcat",fname],stdout=PIPE)\
- .communicate()[0]\
- .strip().split('\n')
+ process = Popen(["zcat", fname], stdout=PIPE)
+ stdout, _ = process.communicate()
+ foo = stdout.strip().split('\n')
else:
foo = [x.strip() for x in open(fname).readlines()]
- pass
return foo
-def repack_result(idx,result):
+
+def repack_result(idx, result):
global args
if args.nbest:
for h in result['nbest']:
- fields = [idx,h['hyp'],h['fvals'],h['totalScore']]
+ fields = [idx, h['hyp'], h['fvals'], h['totalScore']]
for i in xrange(len(fields)):
if type(fields[i]) is unicode:
fields[i] = fields[i].encode('utf-8')
pass
pass
- # print fields
- print >>NBestFile,"%d ||| %s ||| %s ||| %f"%tuple(fields)
- pass
+ # Print fields.
+ print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields)
pass
if 'align' in result:
t = result['text'].split()
@@ -200,16 +233,14 @@ def repack_result(idx,result):
k = 0
for a in result['align']:
k = a['tgt-start']
- if k: print " ".join(t[i:k]).encode('utf8'),span,
+ if k:
+ print " ".join(t[i:k]).encode('utf8'), span,
i = k
- span = "|%d %d|"%(a['src-start'],a['src-end'])
- pass
- print " ".join(t[k:]).encode('utf8'),span
- pass
+ span = "|%d %d|" % (a['src-start'], a['src-end'])
+ print " ".join(t[k:]).encode('utf8'), span
else:
print result['text'].encode('utf8')
- pass
- return
+
if __name__ == "__main__":
my_args, mo_args = split_args(sys.argv[1:])
@@ -221,17 +252,17 @@ if __name__ == "__main__":
args = interpret_args(my_args)
if "-show-weights" in mo_args:
- # this is for use during tuning, where moses is called to get a list of
- # feature names
- devnull = open(os.devnull,"w")
- mo = Popen(mserver.cmd + mo_args,stdout=PIPE,stderr=devnull)
+ # This is for use during tuning, where moses is called to get a list
+ # of feature names.
+ devnull = open(os.devnull, "w")
+ mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull)
print mo.communicate()[0].strip()
sys.exit(0)
pass
if args.nbest:
if args.nbestFile:
- NBestFile = open(args.nbestFile,"w")
+ NBestFile = open(args.nbestFile, "w")
else:
NBestFile = sys.stdout
pass
@@ -239,8 +270,10 @@ if __name__ == "__main__":
ref = None
aln = None
- if args.ref: ref = read_data(args.ref)
- if args.aln: aln = read_data(args.aln)
+ if args.ref:
+ ref = read_data(args.ref)
+ if args.aln:
+ aln = read_data(args.aln)
if ref and aln:
try:
@@ -260,25 +293,21 @@ if __name__ == "__main__":
line = sys.stdin.readline()
idx = 0
while line:
- result = translate(mserver.proxy,args,line)
- repack_result(idx,result)
+ result = translate(mserver.proxy, args, line)
+ repack_result(idx, result)
line = sys.stdin.readline()
idx += 1
- pass
- pass
else:
src = read_data(args.input)
for i in xrange(len(src)):
- result = translate(mserver.proxy,args,src[i])
- repack_result(i,result)
+ result = translate(mserver.proxy, args, src[i])
+ repack_result(i, result)
if args.debug:
print >>sys.stderr, result['text'].encode('utf-8')
pass
- if ref and aln:
- result = mserver.proxy.updater({'source' : src[i],
- 'target' : ref[i],
- 'alignment' : aln[i]})
- pass
- pass
- pass
- pass
+ if ref and aln:
+ result = mserver.proxy.updater({
+ 'source': src[i],
+ 'target': ref[i],
+ 'alignment': aln[i],
+ })
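For reference, an illustrative sketch (not from the patch) of the two XML-RPC calls sim-pe.py relies on: translate for decoding and updater for feeding the simulated post-edit back into the dynamic phrase table. The URL, sentences, and alignment string are placeholders:

    import xmlrpclib

    proxy = xmlrpclib.ServerProxy("http://localhost:7447/RPC2")

    result = proxy.translate({'text': 'ein kleiner test .', 'align': True})
    print result['text']

    # Simulated post-editing: push source, reference and word alignment back.
    proxy.updater({
        'source': 'ein kleiner test .',
        'target': 'a small test .',
        'alignment': '0-0 1-1 2-2 3-3',
    })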
diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py
index 76736da5c..096a45dc4 100644
--- a/scripts/tokenizer/pre_tokenize_cleaning.py
+++ b/scripts/tokenizer/pre_tokenize_cleaning.py
@@ -2,12 +2,12 @@
"""
The Gacha filter cleans out sentence pairs that have global character mean
-lower than a certain threshold.
-
-Use this cleaner to produce low quantity of high quality sentence pairs.
+lower than a certain threshold.
-It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
-WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
+Use this cleaner to produce low quantity of high quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
This is inspired by the global character mean that is used in the Gale-Church
@@ -24,17 +24,24 @@ where:
(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
"""
-import io, subprocess
+import io
+import subprocess
+
red = '\033[01;31m'
native = '\033[m'
+
def err_msg(txt):
- return red+txt+native
+ return red + txt + native
+
def num_char(filename):
- return float(subprocess.Popen(["wc", "-m", filename],
- stdout=subprocess.PIPE).stdout.read().split()[0])
+ process = subprocess.Popen(
+ ["wc", "-m", filename], stdout=subprocess.PIPE)
+ # TODO: Was this meant to call communicate()?
+ return float(process.stdout.read().split()[0])
+
def gacha_mean(sourcefile, targetfile):
"""
@@ -43,36 +50,44 @@ def gacha_mean(sourcefile, targetfile):
"""
sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
c = num_char(sourcefile) / num_char(targetfile)
- sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
+ sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
sys.stderr.write(err_msg('Filtering starts ...\n'))
return c
+
+def io_open(path):
+ """Open file `path` for reading, as a UTF-8 text file."""
+ return io.open(path, 'r', encoding='utf8')
+
+
def main(sourcefile, targetfile, threshold=0.2):
# Calculates Gacha mean.
c = gacha_mean(sourcefile, targetfile)
# Calculates lower and upperbound for filtering
threshold = float(threshold)
- lowerbound = (1-threshold) * c
- upperbound = (1+threshold) * c
-
+ lowerbound = (1 - threshold) * c
+ upperbound = (1 + threshold) * c
+
# Start filtering sentences.
- with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
- io.open(targetfile, 'r', encoding='utf8') as trgfin:
+ with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
for s, t in zip(srcfin, trgfin):
if lowerbound < len(s) / float(len(t)) < upperbound:
- print(u"{}\t{}\n".format(s.strip(),t.strip()))
+ print(u"{}\t{}\n".format(s.strip(), t.strip()))
+
if __name__ == '__main__':
import sys
- if len(sys.argv) not in range(3,5):
- usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
- % sys.argv[0])
-
- example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de '
- '~/Europarl.de-en.en 0.4\n'
- % sys.argv[0])
+ if len(sys.argv) not in range(3, 5):
+ usage_msg = err_msg(
+ "Usage: python %s srcfile trgfile (threshold)\n"
+ % sys.argv[0])
+
+ example_msg = err_msg(
+ "Example: "
+ "gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n"
+ % sys.argv[0])
sys.stderr.write(usage_msg)
sys.stderr.write(example_msg)
sys.exit(1)
-
+
main(*sys.argv[1:])
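A worked example of the Gacha length-ratio test above, with made-up numbers for the global character mean c and the default 20% threshold band:

    c = 1.15                           # num_char(source) / num_char(target)
    threshold = 0.2
    lowerbound = (1 - threshold) * c   # 0.92
    upperbound = (1 + threshold) * c   # 1.38

    s = "Das ist ein ziemlich langer Satz ."
    t = "This is a fairly long sentence ."
    ratio = len(s) / float(len(t))     # ~1.06
    print lowerbound < ratio < upperbound   # True: keep this pair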
diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py
index 86c8b300e..14736fe1f 100755
--- a/scripts/training/filter-rule-table.py
+++ b/scripts/training/filter-rule-table.py
@@ -24,9 +24,11 @@
import optparse
import sys
+
class NGram(tuple):
pass
+
class Gap:
def __init__(self, minSpan):
self.minSpan = minSpan
@@ -34,8 +36,12 @@ class Gap:
def getMinSpan(self):
return self.minSpan
+
def printUsage():
- sys.stderr.write("Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
+ sys.stderr.write(
+ "Usage: "
+ "filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
+
def main():
parser = optparse.OptionParser()
@@ -54,14 +60,15 @@ def main():
inputSentences.append(line.split())
filterRuleTable(sys.stdin, inputSentences, N, options)
+
def filterRuleTable(ruleTable, inputSentences, N, options):
# Map each input n-gram (n = 1..N) to a map from sentence indices to
# lists of intra-sentence indices.
occurrences = {}
for i, sentence in enumerate(inputSentences):
- for n in range(1, N+1):
- for j in range(0, len(sentence)-n+1):
- ngram = NGram(sentence[j:j+n])
+ for n in range(1, N + 1):
+ for j in range(0, len(sentence) - n + 1):
+ ngram = NGram(sentence[j:j + n])
innerMap = occurrences.setdefault(ngram, {})
indices = innerMap.setdefault(i, [])
indices.append(j)
@@ -70,15 +77,16 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
prevRuleIncluded = None
for line in ruleTable:
rhs, count = parseRule(line)
+ below_threshold = (count is not None and count < options.minCount)
# Prune non-initial rule if count is below threshold.
- if count != None and count < options.minCount and isNonInitialRule(rhs):
+ if below_threshold and isNonInitialRule(rhs):
if prevRHS != rhs:
prevRuleIncluded = None
prevRHS = rhs
continue
# If source RHS is same as last rule's then we already know whether to
# filter or not (unless it was pruned before checking).
- if rhs == prevRHS and prevRuleIncluded != None:
+ if rhs == prevRHS and prevRuleIncluded is not None:
if prevRuleIncluded:
print line,
continue
@@ -89,7 +97,10 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
prevRuleIncluded = True
continue
segments = segmentRHS(rhs, N)
- ngramMaps = [occurrences.get(s, {}) for s in segments if isinstance(s, NGram)]
+ ngramMaps = [
+ occurrences.get(s, {})
+ for s in segments
+ if isinstance(s, NGram)]
if len(ngramMaps) == 0:
print line,
prevRuleIncluded = True
@@ -111,9 +122,13 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
break
prevRuleIncluded = match
-# Parse a line of the rule table and return a tuple containing two items,
-# the list of RHS source symbols and the rule count (if present).
+
def parseRule(line):
+ """Parse a line of the rule table.
+
+ :return: A tuple containing two items: the list of RHS source symbols,
+ and the rule count (if present).
+ """
cols = line.split("|||")
rhsSourceSymbols = cols[0].split()[:-1]
ruleCount = None
@@ -123,15 +138,18 @@ def parseRule(line):
ruleCount = float(counts[2])
return (rhsSourceSymbols, ruleCount)
+
def isNT(symbol):
return symbol[0] == '[' and symbol[-1] == ']'
+
def isNonInitialRule(rhs):
for symbol in rhs:
if isNT(symbol):
return True
return False
+
def segmentRHS(rhs, N):
segments = []
terminals = []
@@ -159,13 +177,14 @@ def segmentRHS(rhs, N):
segments.append(NGram(terminals))
return segments
+
def matchSegments(segments, indexSeq, sentenceLength):
assert len(segments) > 0
firstSegment = segments[0]
i = 0
if isinstance(firstSegment, Gap):
minPos = firstSegment.getMinSpan()
- maxPos = sentenceLength-1
+ maxPos = sentenceLength - 1
else:
minPos = indexSeq[i] + len(firstSegment)
i += 1
@@ -175,7 +194,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
if minPos + segment.getMinSpan() > sentenceLength:
return False
minPos = minPos + segment.getMinSpan()
- maxPos = sentenceLength-1
+ maxPos = sentenceLength - 1
else:
pos = indexSeq[i]
i += 1
@@ -185,6 +204,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
maxPos = minPos
return True
+
def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
assert len(ngramMaps) > 0
if len(ngramMaps) == 1:
@@ -195,7 +215,7 @@ def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
for index in ngramMaps[0][sentenceIndex]:
if index < minFirstIndex:
continue
- for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index+1):
+ for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index + 1):
assert seq[0] > index
yield [index] + seq
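A tiny self-contained sketch of the symbol tests filter-rule-table.py uses: nonterminals are written in square brackets, and a rule is "non-initial" if its source right-hand side contains at least one nonterminal (the example symbols are made up):

    def isNT(symbol):
        return symbol[0] == '[' and symbol[-1] == ']'


    def isNonInitialRule(rhs):
        for symbol in rhs:
            if isNT(symbol):
                return True
        return False


    print isNonInitialRule("the [X] house".split())   # True
    print isNonInitialRule("the red house".split())   # False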
diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py
index cb67c9d75..28abc9508 100755
--- a/scripts/training/rdlm/average_null_embedding.py
+++ b/scripts/training/rdlm/average_null_embedding.py
@@ -2,18 +2,23 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
-# average embeddings of special null words for RDLM.
-# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
+"""Average embeddings of special null words for RDLM.
+
+Usage:
+ average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
+"""
import sys
import os
import numpy
+
def load_model(model_file):
return nplm.NeuralLM.from_file(model_file)
+
def get_weights(path, vocab, len_context):
- d = [[0]*vocab for i in range(len_context)]
+ d = [[0] * vocab for i in range(len_context)]
for line in open(path):
for i, word in enumerate(line.split()[:-1]):
d[i][int(word)] += 1
@@ -26,20 +31,23 @@ if __name__ == "__main__":
training_instances = sys.argv[3]
model_output = sys.argv[4]
- sys.path.append(os.path.join(nplm_path,'python'))
+ sys.path.append(os.path.join(nplm_path, 'python'))
import nplm
model = load_model(model_input)
- len_context = len(open(training_instances).readline().split())-1
+ len_context = len(open(training_instances).readline().split()) - 1
sys.stderr.write('reading ngrams...')
- weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context))
+ weights = numpy.array(
+ get_weights(
+ training_instances, len(model.input_embeddings), len_context))
sys.stderr.write('done\n')
for i in range(len_context):
index = model.word_to_index_input['<null_{0}>'.format(i)]
- model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0)
+ model.input_embeddings[index] = numpy.average(
+ numpy.array(model.input_embeddings), weights=weights[i], axis=0)
sys.stderr.write('writing model...')
- model.to_file(open(model_output,'w'))
+ model.to_file(open(model_output, 'w'))
sys.stderr.write('done\n')
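An illustrative sketch of what get_weights() computes: for every context position, a histogram over vocabulary indices of the training n-grams, skipping the final token on each line (the predicted word). The training lines and sizes below are made up:

    lines = ["3 7 3 12", "3 5 3 9"]     # fake training instances
    vocab, len_context = 16, 3

    d = [[0] * vocab for i in range(len_context)]
    for line in lines:
        for i, word in enumerate(line.split()[:-1]):
            d[i][int(word)] += 1

    print d[0][3]   # 2: index 3 occurs at context position 0 in both lines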
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index f3ce41080..c6d4b7968 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -2,17 +2,25 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
-# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM
-# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
-# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 (<unk>)
+"""
+Extract syntactic n-grams from dependency treebank in Moses XML format for
+training RDLM.
+
+Expected format can be produced with
+mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
+
+OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped
+to 0 (<unk>)
+"""
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import argparse
-# hack for python2/3 compatibility
+# Hack for python2/3 compatibility
from io import open
+
argparse.open = open
try:
@@ -20,46 +28,84 @@ try:
except ImportError:
from xml.etree import cElementTree as ET
+
def create_parser():
- parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
-
- parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
- help='input file (default: standard input).')
- parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
- help='output file (default: standard output).')
- parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
- choices=['label', 'head'], required=True)
- parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
- help='input layer vocabulary file (one item per line; first line \'<unk>\')')
- parser.add_argument('--output_vocab', metavar='PATH', type=str,
- help='output layer vocabulary file (default: use input layer vocabulary)')
- parser.add_argument('--left_context', metavar='INT', type=int,
- help='size of context vector for left siblings (default: %(default)s)', default=3)
- parser.add_argument('--right_context', metavar='INT', type=int,
- help='size of context vector for right siblings (default: %(default)s)', default=0)
- parser.add_argument('--up_context', metavar='INT', type=int,
- help='size of context vector for ancestors (default: %(default)s)', default=2)
- parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
- help='glue symbol. Will be skipped during extraction (default: %(default)s)')
- parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
- help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
- parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
- help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
- parser.add_argument('--ptkvz', action='store_true',
- help='special rule for German dependency trees: concatenate separable verb prefix and verb')
+ parser = argparse.ArgumentParser(
+ description=(
+ "Extract syntactic n-grams from parsed corpus in "
+ "Moses XML format for training RDLM"))
+
+ parser.add_argument(
+ '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+ metavar='PATH',
+ help='Input file (default: standard input).')
+ parser.add_argument(
+ '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
+ metavar='PATH',
+ help='Output file (default: standard output).')
+ parser.add_argument(
+ '--mode', type=str, choices=['label', 'head'], required=True,
+ help='Predict terminals (head) or dependency labels (label).')
+ parser.add_argument(
+ '--vocab', metavar='PATH', type=str, required=True,
+ help=(
+ "Input layer vocabulary file (one item per line; "
+ "first line '<unk>')"))
+ parser.add_argument(
+ '--output_vocab', metavar='PATH', type=str,
+ help=(
+ "Output layer vocabulary file "
+ "(default: use input layer vocabulary)"))
+ parser.add_argument(
+ '--left_context', metavar='INT', type=int, default=3,
+ help=(
+ "Size of context vector for left siblings "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--right_context', metavar='INT', type=int, default=0,
+ help=(
+ "Size of context vector for right siblings "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--up_context', metavar='INT', type=int, default=2,
+ help=(
+ "Size of context vector for ancestors "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--glue_symbol', metavar='STR', type=str, default='Q',
+ help=(
+ "Glue symbol. Will be skipped during extraction "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--start_symbol', metavar='STR', type=str, default='SSTART',
+ help=(
+ "Sentence start symbol. Will be skipped during extraction "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--end_symbol', metavar='STR', type=str, default='SEND',
+ help=(
+ "Sentence end symbol. Will be skipped during extraction "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--ptkvz', action='store_true',
+ help=(
+ "Special rule for German dependency trees: "
+ "concatenate separable verb prefix and verb."))
return parser
+
def escape_text(s):
- s = s.replace('|','&#124;') # factor separator
- s = s.replace('[','&#91;') # syntax non-terminal
- s = s.replace(']','&#93;') # syntax non-terminal
- s = s.replace('\'','&apos;') # xml special character
- s = s.replace('"','&quot;') # xml special character
+ s = s.replace('|', '&#124;') # factor separator
+ s = s.replace('[', '&#91;') # syntax non-terminal
+ s = s.replace(']', '&#93;') # syntax non-terminal
+ s = s.replace('\'', '&apos;') # xml special character
+ s = s.replace('"', '&quot;') # xml special character
return s
-# deterministic heuristic to get head of subtree
+
def get_head(xml, add_ptkvz):
+ """Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
@@ -77,23 +123,38 @@ def get_head(xml, add_ptkvz):
return head, preterminal
-def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
- if len(xml):
+def get_syntactic_ngrams(xml, options, vocab, output_vocab,
+ parent_heads=None, parent_labels=None):
- # skip glue rules
- if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
- for child in xml:
- get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
- return
+ if len(xml):
- # skip virtual nodes
- if xml.get('label') == '<stop_label>' or xml.get('label') == '<start_label>':
- return
+ # Skip glue rules.
+ skip_glue_labels = [
+ options.glue_symbol,
+ options.start_symbol,
+            options.end_symbol,
+ ]
+ if xml.get('label') in skip_glue_labels:
+ for child in xml:
+ get_syntactic_ngrams(
+ child, options, vocab, output_vocab, parent_heads,
+ parent_labels)
+ return
+
+ # Skip virtual nodes.
+ skip_virtual_labels = [
+ '<stop_label>',
+ '<start_label>',
+ ]
+ if xml.get('label') in skip_virtual_labels:
+ return
if not parent_heads:
- parent_heads = [vocab.get('<root_head>', 0)] * options.up_context
- parent_labels = [vocab.get('<root_label>', 0)] * options.up_context
+ parent_heads = (
+ [vocab.get('<root_head>', 0)] * options.up_context)
+ parent_labels = (
+ [vocab.get('<root_label>', 0)] * options.up_context)
head, preterminal = get_head(xml, options.ptkvz)
if not head:
@@ -119,7 +180,8 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
options.output.write(' '.join(map(str, int_list)) + '\n')
elif options.mode == 'head' and not head == '<dummy_head>':
int_list.append(vocab.get(label, 0))
- int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
+ int_list.append(
+ output_vocab.get(head, output_vocab.get(preterminal, 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(head, 0))
@@ -130,28 +192,29 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
if options.right_context:
start = ET.Element('tree')
start2 = ET.Element('tree')
- start.set('label','<start_label>')
- start2.set('label','XY')
+ start.set('label', '<start_label>')
+ start2.set('label', 'XY')
start2.text = '<start_head>'
start.append(start2)
- xml.insert(0,start)
+ xml.insert(0, start)
if options.left_context:
end = ET.Element('tree')
end2 = ET.Element('tree')
- end.set('label','<stop_label>')
- end2.set('label','XY')
+ end.set('label', '<stop_label>')
+ end2.set('label', 'XY')
end2.text = '<stop_head>'
end.append(end2)
xml.append(end)
-
heads = []
preterminals = []
labels = []
for child in xml:
if not len(child):
- # mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent)
+ # Mark that the previous sibling is the head of the
+ # structure (the head/label are not repeated because they're
+ # also head/label of the parent).
head_child = '<head_head>'
preterminal_child = head_child
child_label = '<head_label>'
@@ -166,37 +229,60 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
preterminals.append(preterminal_child)
labels.append(child_label)
- heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))]
- labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))]
+ heads_idx = [
+ vocab.get(heads[i], vocab.get(preterminals[i], 0))
+ for i in range(len(heads))]
+ labels_idx = [
+ vocab.get(labels[i], 0)
+ for i in range(len(labels))]
- #ancestor context is same for all children
+ # Ancestor context is the same for all children.
up_heads = parent_heads[-options.up_context:]
up_labels = parent_labels[-options.up_context:]
- for i,child in enumerate(xml):
-
- # skip some special symbols, but recursively extract n-grams for its children
- if options.mode == 'head' and (heads[i] == '<dummy_head>' or heads[i] == '<head_head>' or heads[i] == '<stop_head>' or heads[i] == '<start_head>'):
+ skip_special_heads = [
+ '<dummy_head>',
+ '<head_head>',
+ '<stop_head>',
+ '<start_head>',
+ ]
+ for i, child in enumerate(xml):
+
+ # Skip some special symbols, but recursively extract n-grams
+ # for its children.
+ if options.mode == 'head' and heads[i] in skip_special_heads:
parent_heads.append(vocab.get(heads[i], 0))
parent_labels.append(vocab.get(labels[i], 0))
- get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
+ get_syntactic_ngrams(
+ child, options, vocab, output_vocab, parent_heads,
+ parent_labels)
parent_heads.pop()
parent_labels.pop()
continue
- previous_heads = heads_idx[max(0,i-options.left_context):i]
- previous_labels = labels_idx[max(0,i-options.left_context):i]
+ previous_heads = heads_idx[max(0, i - options.left_context):i]
+ previous_labels = labels_idx[max(0, i - options.left_context):i]
- subsequent_heads = heads_idx[i+1:i+options.right_context+1]
- subsequent_labels = labels_idx[i+1:i+options.right_context+1]
+ subsequent_heads = heads_idx[i + 1:i + options.right_context + 1]
+ subsequent_labels = labels_idx[i + 1:i + options.right_context + 1]
if len(previous_heads) < options.left_context:
- previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads
- previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels
+ previous_heads = (
+ [start_head_idx] *
+ (options.left_context - len(previous_heads)) +
+ previous_heads)
+ previous_labels = (
+ [start_label_idx] *
+ (options.left_context - len(previous_labels)) +
+ previous_labels)
if len(subsequent_heads) < options.right_context:
- subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads))
- subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels))
+ subsequent_heads += (
+ [stop_head_idx] *
+ (options.right_context - len(subsequent_heads)))
+ subsequent_labels += (
+ [stop_label_idx] *
+ (options.right_context - len(subsequent_labels)))
int_list = []
int_list.extend(previous_heads)
@@ -209,14 +295,19 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.append(output_vocab.get(labels[i], 0))
elif options.mode == 'head':
int_list.append(vocab.get(labels[i], 0))
- int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
+ int_list.append(
+ output_vocab.get(
+ heads[i], output_vocab.get(preterminals[i], 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
- parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
+ parent_heads.append(
+ vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
- get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
+ get_syntactic_ngrams(
+ child, options, vocab, output_vocab, parent_heads,
+ parent_labels)
parent_heads.pop()
parent_labels.pop()
@@ -224,15 +315,17 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
def load_vocab(path):
v = {}
- for i,line in enumerate(open(path, encoding="UTF-8")):
+ for i, line in enumerate(open(path, encoding="UTF-8")):
v[line.strip()] = i
return v
+
def main(options):
vocab = load_vocab(options.vocab)
if options.output_vocab is None:
- sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
+ sys.stderr.write(
+ "No output vocabulary specified; using input vocabulary.\n")
output_vocab = vocab
else:
output_vocab = load_vocab(options.output_vocab)
@@ -275,4 +368,4 @@ if __name__ == '__main__':
parser = create_parser()
options = parser.parse_args()
- main(options) \ No newline at end of file
+ main(options)
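One detail worth spelling out: the vocabulary lookups in this file consistently fall back from the head token to its preterminal and finally to index 0 (<unk>), as the docstring above states. A tiny sketch of that fallback with a toy vocabulary (not from any real corpus):

vocab = {'<unk>': 0, 'NN': 7, 'dog': 42}

def lookup(head, preterminal, vocab):
    # Known head -> its index; OOV head -> preterminal's index; else <unk>.
    return vocab.get(head, vocab.get(preterminal, 0))

print(lookup('dog', 'NN', vocab))       # 42
print(lookup('platypus', 'NN', vocab))  # 7  (OOV head mapped to preterminal)
print(lookup('platypus', 'XY', vocab))  # 0  (<unk>)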
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 6d017602e..ed9266fd9 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -9,6 +9,7 @@ import sys
import codecs
import argparse
from collections import Counter
+from textwrap import dedent
# hack for python2/3 compatibility
from io import open
@@ -19,37 +20,49 @@ try:
except ImportError:
from xml.etree import cElementTree as ET
-def create_parser():
- help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
- help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
- help_text += " [PREFIX].preterminals: preterminal symbols\n";
- help_text += " [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n";
- help_text += " [PREFIX].terminals: terminal symbols\n";
- help_text += " [PREFIX].all: all of the above\n"
+HELP_TEXT = dedent("""\
+ generate 5 vocabulary files from parsed corpus in moses XML format
+ [PREFIX].special: around 40 symbols reserved for RDLM
+ [PREFIX].preterminals: preterminal symbols
+ [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)
+ [PREFIX].terminals: terminal symbols
+ [PREFIX].all: all of the above
+""")
- parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
- parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
- help='input text (default: standard input).')
- parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
- help='output prefix (default: "vocab")')
- parser.add_argument('--ptkvz', action="store_true",
- help='special rule for German dependency trees: attach separable verb prefixes to verb')
+def create_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=HELP_TEXT)
+
+ parser.add_argument(
+ '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+ metavar='PATH',
+ help="Input text (default: standard input).")
+ parser.add_argument(
+ '--output', '-o', type=str, default='vocab', metavar='PREFIX',
+ help="Output prefix (default: 'vocab')")
+ parser.add_argument(
+ '--ptkvz', action="store_true",
+ help=(
+ "Special rule for German dependency trees: attach separable "
+ "verb prefixes to verb."))
return parser
-def escape_text(s):
- s = s.replace('|','&#124;') # factor separator
- s = s.replace('[','&#91;') # syntax non-terminal
- s = s.replace(']','&#93;') # syntax non-terminal
- s = s.replace('\'','&apos;') # xml special character
- s = s.replace('"','&quot;') # xml special character
+def escape_text(s):
+ s = s.replace('|', '&#124;') # factor separator
+ s = s.replace('[', '&#91;') # syntax non-terminal
+ s = s.replace(']', '&#93;') # syntax non-terminal
+ s = s.replace('\'', '&apos;') # xml special character
+ s = s.replace('"', '&quot;') # xml special character
return s
-# deterministic heuristic to get head of subtree
+
def get_head(xml, args):
+ """Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
@@ -67,6 +80,7 @@ def get_head(xml, args):
return head, preterminal
+
def get_vocab(xml, args):
if len(xml):
@@ -88,6 +102,7 @@ def get_vocab(xml, args):
continue
get_vocab(child, args)
+
def main(args):
global heads
@@ -111,10 +126,24 @@ def main(args):
get_vocab(xml, args)
i += 1
- special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
+ special_tokens = [
+ '<unk>',
+ '<null>',
+ '<null_label>',
+ '<null_head>',
+ '<head_label>',
+ '<root_label>',
+ '<start_label>',
+ '<stop_label>',
+ '<head_head>',
+ '<root_head>',
+ '<start_head>',
+ '<dummy_head>',
+ '<stop_head>',
+ ]
for i in range(30):
- special_tokens.append('<null_{0}>'.format(i))
+ special_tokens.append('<null_{0}>'.format(i))
f = open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
@@ -158,7 +187,6 @@ def main(args):
f.close()
-
if __name__ == '__main__':
if sys.version_info < (3, 0):
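For orientation, the [PREFIX].special file described in HELP_TEXT ends up with one symbol per line: the fixed special tokens listed above followed by <null_0> through <null_29>. A minimal sketch of that output step (file name and truncated token list are illustrative):

special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>']
special_tokens += ['<null_{0}>'.format(i) for i in range(30)]

with open('vocab.special', 'w') as f:
    for item in special_tokens:
        f.write(item + '\n')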
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index 15e56c430..ae57e8dfc 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -9,7 +9,6 @@ import subprocess
import sys
import os
import codecs
-import copy
# ../bilingual-lm
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
@@ -17,143 +16,224 @@ import train_nplm
import extract_vocab
import extract_syntactic_ngrams
-logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
-parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
-parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
-parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
-parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
-parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
-parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
-parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
-parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
-parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
-parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
-parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
-parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
-parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
-parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
-parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
-parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
-parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
-parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
-parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
-parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
-parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
-parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
-parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
-parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
-parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
+parser.add_argument(
+ "--working-dir", dest="working_dir", metavar="PATH")
+parser.add_argument(
+ "--corpus", dest="corpus_stem", metavar="PATH", help="Input file.")
+parser.add_argument(
+ "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
+ help="Location of NPLM.")
+parser.add_argument(
+ "--epochs", dest="epochs", type=int, metavar="INT",
+ help="Number of training epochs (default: %(default)s).")
+parser.add_argument(
+ "--up-context-size", dest="up_context_size", type=int, metavar="INT",
+ help="Size of ancestor context (default: %(default)s).")
+parser.add_argument(
+ "--left-context-size", dest="left_context_size", type=int, metavar="INT",
+ help="Size of sibling context (left) (default: %(default)s).")
+parser.add_argument(
+ "--right-context-size", dest="right_context_size", type=int,
+ metavar="INT",
+ help="Size of sibling context (right) (default: %(default)s).")
+parser.add_argument(
+ "--mode", dest="mode", choices=['head', 'label'], required=True,
+ help="Type of RDLM to train (both are required for decoding).")
+parser.add_argument(
+ "--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
+ help="Minibatch size (default: %(default)s).")
+parser.add_argument(
+ "--noise", dest="noise", type=int, metavar="INT",
+ help="Number of noise samples for NCE (default: %(default)s).")
+parser.add_argument(
+ "--hidden", dest="hidden", type=int, metavar="INT",
+ help=(
+ "Size of hidden layer (0 for single hidden layer) "
+ "(default: %(default)s)"))
+parser.add_argument(
+ "--input-embedding", dest="input_embedding", type=int, metavar="INT",
+ help="Size of input embedding layer (default: %(default)s).")
+parser.add_argument(
+ "--output-embedding", dest="output_embedding", type=int, metavar="INT",
+ help="Size of output embedding layer (default: %(default)s).")
+parser.add_argument(
+ "--threads", "-t", dest="threads", type=int, metavar="INT",
+ help="Number of threads (default: %(default)s).")
+parser.add_argument(
+ "--output-model", dest="output_model", metavar="PATH",
+ help="Name of output model (default: %(default)s).")
+parser.add_argument(
+ "--output-dir", dest="output_dir", metavar="PATH",
+ help="Output directory (default: same as working-dir).")
+parser.add_argument(
+ "--config-options-file", dest="config_options_file", metavar="PATH")
+parser.add_argument(
+ "--log-file", dest="log_file", metavar="PATH",
+ help="Log file to write to (default: %(default)s).")
+parser.add_argument(
+ "--validation-corpus", dest="validation_corpus", metavar="PATH",
+ help="Validation file (default: %(default)s).")
+parser.add_argument(
+ "--activation-function", dest="activation_fn",
+ choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
+ help="Activation function (default: %(default)s).")
+parser.add_argument(
+ "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
+ help="Learning rate (default: %(default)s).")
+parser.add_argument(
+ "--input-words-file", dest="input_words_file", metavar="PATH",
+ help="Input vocabulary (default: %(default)s).")
+parser.add_argument(
+ "--output-words-file", dest="output_words_file", metavar="PATH",
+ help="Output vocabulary (default: %(default)s).")
+parser.add_argument(
+ "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT",
+ help="Input vocabulary size (default: %(default)s).")
+parser.add_argument(
+ "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT",
+ help="Output vocabulary size (default: %(default)s).")
parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "train"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epochs = 2
- ,up_context_size = 2
- ,left_context_size = 3
- ,right_context_size = 0
- ,minibatch_size=1000
- ,noise=100
- ,hidden=0
- ,mode='head'
- ,input_embedding=150
- ,output_embedding=750
- ,threads=4
- ,output_model = "train"
- ,output_dir = None
- ,config_options_file = "config"
- ,log_file = "log"
- ,validation_corpus = None
- ,activation_fn = "rectifier"
- ,learning_rate = 1
- ,input_words_file = None
- ,output_words_file = None
- ,input_vocab_size = 500000
- ,output_vocab_size = 500000
- )
+ working_dir="working",
+ corpus_stem="train",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epochs=2,
+ up_context_size=2,
+ left_context_size=3,
+ right_context_size=0,
+ minibatch_size=1000,
+ noise=100,
+ hidden=0,
+ mode='head',
+ input_embedding=150,
+ output_embedding=750,
+ threads=4,
+ output_model="train",
+ output_dir=None,
+ config_options_file="config",
+ log_file="log",
+ validation_corpus=None,
+ activation_fn="rectifier",
+ learning_rate=1,
+ input_words_file=None,
+ output_words_file=None,
+ input_vocab_size=500000,
+ output_vocab_size=500000)
+
def prepare_vocabulary(options):
- vocab_prefix = os.path.join(options.working_dir, 'vocab')
- extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
- extract_vocab.main(extract_vocab_options)
-
- if options.input_words_file is None:
- options.input_words_file = vocab_prefix + '.input'
- orig = vocab_prefix + '.all'
- filtered_vocab = open(orig).readlines()
- if options.input_vocab_size:
- filtered_vocab = filtered_vocab[:options.input_vocab_size]
- open(options.input_words_file,'w').writelines(filtered_vocab)
-
- if options.output_words_file is None:
- options.output_words_file = vocab_prefix + '.output'
- if options.mode == 'label':
- blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
- orig = vocab_prefix + '.special'
- filtered_vocab = open(orig).readlines()
- orig = vocab_prefix + '.nonterminals'
- filtered_vocab += open(orig).readlines()
- filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
- if options.output_vocab_size:
- filtered_vocab = filtered_vocab[:options.output_vocab_size]
- else:
- orig = vocab_prefix + '.all'
- filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
- open(options.output_words_file,'w').writelines(filtered_vocab)
+ vocab_prefix = os.path.join(options.working_dir, 'vocab')
+ extract_vocab_options = extract_vocab.create_parser().parse_args(
+ ['--input', options.corpus_stem, '--output', vocab_prefix])
+ extract_vocab.main(extract_vocab_options)
+
+ if options.input_words_file is None:
+ options.input_words_file = vocab_prefix + '.input'
+ orig = vocab_prefix + '.all'
+ filtered_vocab = open(orig).readlines()
+ if options.input_vocab_size:
+ filtered_vocab = filtered_vocab[:options.input_vocab_size]
+ open(options.input_words_file, 'w').writelines(filtered_vocab)
+
+ if options.output_words_file is None:
+ options.output_words_file = vocab_prefix + '.output'
+ if options.mode == 'label':
+ blacklist = [
+ '<null',
+ '<root',
+ '<start_head',
+ '<dummy',
+ '<head_head',
+ '<stop_head',
+ ]
+ orig = vocab_prefix + '.special'
+ filtered_vocab = open(orig).readlines()
+ orig = vocab_prefix + '.nonterminals'
+ filtered_vocab += open(orig).readlines()
+ filtered_vocab = [
+ word
+ for word in filtered_vocab
+ if not any(word.startswith(prefix) for prefix in blacklist)]
+ if options.output_vocab_size:
+ filtered_vocab = filtered_vocab[:options.output_vocab_size]
+ else:
+ orig = vocab_prefix + '.all'
+ filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
+ open(options.output_words_file, 'w').writelines(filtered_vocab)
+
def main(options):
- options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
- if options.mode == 'head':
- options.ngram_size += 2
- elif options.mode == 'label':
- options.ngram_size += 1
-
- if options.input_words_file is None or options.output_words_file is None:
- sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
- prepare_vocabulary(options)
-
- extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
- '--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
- '--vocab', options.input_words_file,
- '--output_vocab', options.output_words_file,
- '--right_context', str(options.right_context_size),
- '--left_context', str(options.left_context_size),
- '--up_context', str(options.up_context_size),
- '--mode', options.mode
- ])
- sys.stderr.write('extracting syntactic n-grams\n')
- extract_syntactic_ngrams.main(extract_options)
-
- if options.validation_corpus:
- extract_options.input = open(options.validation_corpus)
- options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus))
- extract_options.output = open(options.validation_file + '.numberized', 'w')
- sys.stderr.write('extracting syntactic n-grams (validation file)\n')
+ options.ngram_size = (
+ 2 * options.up_context_size +
+ 2 * options.left_context_size +
+ 2 * options.right_context_size
+ )
+ if options.mode == 'head':
+ options.ngram_size += 2
+ elif options.mode == 'label':
+ options.ngram_size += 1
+
+ if options.input_words_file is None or options.output_words_file is None:
+ sys.stderr.write(
+ "Either input vocabulary or output vocabulary not specified: "
+ "extracting vocabulary from training text.\n")
+ prepare_vocabulary(options)
+
+ extract_options = extract_syntactic_ngrams.create_parser().parse_args([
+ '--input', options.corpus_stem,
+ '--output', os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + '.numberized'),
+ '--vocab', options.input_words_file,
+ '--output_vocab', options.output_words_file,
+ '--right_context', str(options.right_context_size),
+ '--left_context', str(options.left_context_size),
+ '--up_context', str(options.up_context_size),
+ '--mode', options.mode
+ ])
+ sys.stderr.write('extracting syntactic n-grams\n')
extract_syntactic_ngrams.main(extract_options)
- extract_options.output.close()
- sys.stderr.write('training neural network\n')
- train_nplm.main(options)
+ if options.validation_corpus:
+ extract_options.input = open(options.validation_corpus)
+ options.validation_file = os.path.join(
+ options.working_dir, os.path.basename(options.validation_corpus))
+ extract_options.output = open(
+ options.validation_file + '.numberized', 'w')
+ sys.stderr.write('extracting syntactic n-grams (validation file)\n')
+ extract_syntactic_ngrams.main(extract_options)
+ extract_options.output.close()
+
+ sys.stderr.write('training neural network\n')
+ train_nplm.main(options)
+
+ sys.stderr.write('averaging null words\n')
+ ret = subprocess.call([
+ os.path.join(sys.path[0], 'average_null_embedding.py'),
+ options.nplm_home,
+ os.path.join(
+ options.output_dir,
+ options.output_model + '.model.nplm.' + str(options.epochs)),
+ os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + '.numberized'),
+ os.path.join(options.output_dir, options.output_model + '.model.nplm')
+ ])
+ if ret:
+ raise Exception("averaging null words failed")
- sys.stderr.write('averaging null words\n')
- ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
- options.nplm_home,
- os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
- os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
- os.path.join(options.output_dir, options.output_model + '.model.nplm')
- ])
- if ret:
- raise Exception("averaging null words failed")
if __name__ == "__main__":
- if sys.version_info < (3, 0):
- sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
- sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
- sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
-
- options = parser.parse_args()
- main(options)
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ options = parser.parse_args()
+ main(options)
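With the defaults set above (up_context_size=2, left_context_size=3, right_context_size=0), the ngram_size arithmetic in main() works out as below; a quick check of the numbers:

up, left, right = 2, 3, 0
ngram_size = 2 * up + 2 * left + 2 * right  # head + label for every context slot
print(ngram_size)      # 10
print(ngram_size + 2)  # 12 for --mode head (adds the node's label and the head to predict)
print(ngram_size + 1)  # 11 for --mode label (adds only the label to predict)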
diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py
index 0e361df0b..761037488 100755
--- a/scripts/training/wrappers/conll2mosesxml.py
+++ b/scripts/training/wrappers/conll2mosesxml.py
@@ -2,42 +2,76 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
-# takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat )
-# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION),
-# which not all parsers produce.
+"""
+Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on
+dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces
+Moses XML format.
-# usage: conll2mosesxml.py [--brackets] < input_file > output_file
+Note that the structure is built based on fields 9 and 10 (projective HEAD
+and RELATION), which not all parsers produce.
+
+Usage: conll2mosesxml.py [--brackets] < input_file > output_file
+"""
from __future__ import print_function, unicode_literals
import sys
import re
import codecs
-from collections import namedtuple,defaultdict
+from collections import (
+ namedtuple,
+ defaultdict,
+ )
from lxml import etree as ET
-Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func'])
+Word = namedtuple(
+ 'Word',
+ ['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])
+
def main(output_format='xml'):
sentence = []
for line in sys.stdin:
- # process sentence
+ # Process sentence.
if line == "\n":
- sentence.insert(0,[])
+ sentence.insert(0, [])
if is_projective(sentence):
- write(sentence,output_format)
+ write(sentence, output_format)
else:
- sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
+ sys.stderr.write(
+ ' '.join(w.word for w in sentence[1:]) + '\n')
sys.stdout.write('\n')
sentence = []
continue
try:
- pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split()
- except ValueError: # word may be unicode whitespace
- pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip())
+ (
+ pos,
+ word,
+ lemma,
+ tag,
+ tag2,
+ morph,
+ head,
+ func,
+ proj_head,
+ proj_func,
+ ) = line.split()
+ except ValueError: # Word may be unicode whitespace.
+ (
+ pos,
+ word,
+ lemma,
+ tag,
+ tag2,
+ morph,
+ head,
+ func,
+ proj_head,
+ proj_func,
+ ) = re.split(' *\t*', line.strip())
word = escape_special_chars(word)
lemma = escape_special_chars(lemma)
@@ -46,17 +80,20 @@ def main(output_format='xml'):
proj_head = head
proj_func = func
- sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func))
+ sentence.append(
+ Word(
+ int(pos), word, lemma, tag2, int(head), func, int(proj_head),
+ proj_func))
-# this script performs the same escaping as escape-special-chars.perl in Moses.
-# most of it is done in function write(), but quotation marks need to be processed first
+# This script performs the same escaping as escape-special-chars.perl in
+# Moses. Most of it is done in function write(), but quotation marks need
+# to be processed first.
def escape_special_chars(line):
-
- line = line.replace('\'','&apos;') # xml
- line = line.replace('"','&quot;') # xml
- line = line.replace('[','&#91;') # syntax non-terminal
- line = line.replace(']','&#93;') # syntax non-terminal
+ line = line.replace('\'', '&apos;') # xml
+ line = line.replace('"', '&quot;') # xml
+ line = line.replace('[', '&#91;') # syntax non-terminal
+ line = line.replace(']', '&#93;') # syntax non-terminal
return line
@@ -64,7 +101,7 @@ def escape_special_chars(line):
# make a check if structure is projective
def is_projective(sentence):
dominates = defaultdict(set)
- for i,w in enumerate(sentence):
+ for i, w in enumerate(sentence):
dominates[i].add(i)
if not i:
continue
@@ -77,7 +114,7 @@ def is_projective(sentence):
for i in dominates:
dependents = dominates[i]
- if max(dependents) - min(dependents) != len(dependents)-1:
+ if max(dependents) - min(dependents) != len(dependents) - 1:
sys.stderr.write("error: non-projective structure.\n")
return False
return True
@@ -86,24 +123,28 @@ def is_projective(sentence):
def write(sentence, output_format='xml'):
if output_format == 'xml':
- tree = create_subtree(0,sentence)
- out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
+ tree = create_subtree(0, sentence)
+ out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
if output_format == 'brackets':
- out = create_brackets(0,sentence)
+ out = create_brackets(0, sentence)
- out = out.replace('|','&#124;') # factor separator
+ out = out.replace('|', '&#124;') # factor separator
- out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
- out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
- out = out.replace('&amp;#91;','&#91;') # lxml is buggy if input is escaped
- out = out.replace('&amp;#93;','&#93;') # lxml is buggy if input is escaped
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;apos;', '&apos;')
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;quot;', '&quot;')
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;#91;', '&#91;')
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;#93;', '&#93;')
print(out)
-# write node in Moses XML format
-def create_subtree(position, sentence):
+def create_subtree(position, sentence):
+ """"Write node in Moses XML format."""
element = ET.Element('tree')
if position:
@@ -111,7 +152,7 @@ def create_subtree(position, sentence):
else:
element.set('label', 'sent')
- for i in range(1,position):
+ for i in range(1, position):
if sentence[i].proj_head == position:
element.append(create_subtree(i, sentence))
@@ -144,7 +185,7 @@ def create_brackets(position, sentence):
else:
element = "[ sent "
- for i in range(1,position):
+ for i in range(1, position):
if sentence[i].proj_head == position:
element += create_brackets(i, sentence)
@@ -167,7 +208,7 @@ def create_brackets(position, sentence):
return element
if __name__ == '__main__':
- if sys.version_info < (3,0,0):
+ if sys.version_info < (3, 0, 0):
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
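The ten-column split refactored above maps directly onto the Word namedtuple; a tiny illustration with a made-up CoNLL line (field values are invented):

from collections import namedtuple

Word = namedtuple(
    'Word',
    ['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])

line = "1\tdogs\tdog\tNNS\tNNS\t_\t2\tSBJ\t2\tSBJ\n"
(pos, word, lemma, tag, tag2, morph,
 head, func, proj_head, proj_func) = line.split()

w = Word(int(pos), word, lemma, tag2, int(head), func, int(proj_head), proj_func)
print(w.word, w.head, w.proj_func)  # dogs 2 SBJ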
diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py
index bd876f087..6ff1d20c9 100755
--- a/scripts/training/wrappers/mosesxml2brackets.py
+++ b/scripts/training/wrappers/mosesxml2brackets.py
@@ -10,17 +10,21 @@ import codecs
from lxml import etree as ET
+
def escape(word):
- word = word.replace('|','&#124;') # factor separator
- word = word.replace('[','&#91;') # syntax non-terminal
- word = word.replace(']','&#93;') # syntax non-terminal
- word = word.replace('\'','&apos;')
- word = word.replace('\"','&quot;')
+ # Factor separator:
+ word = word.replace('|', '&#124;')
+ # Syntax non-terminal:
+ word = word.replace('[', '&#91;')
+ # Syntax non-terminal:
+ word = word.replace(']', '&#93;')
+ word = word.replace('\'', '&apos;')
+ word = word.replace('\"', '&quot;')
return word
-def make_brackets(xml):
+def make_brackets(xml):
out = ' [' + xml.get('label')
if xml.text and xml.text.strip():