github.com/moses-smt/mosesdecoder.git

author     Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 13:26:56 +0300
committer  Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 13:26:56 +0300
commit     61162dd24284baebdd407bb7dd4f28892b24fbfb (patch)
tree       a2a2807acecefa1906d648970d932409bf0e0099 /scripts
parent     c07ade81422488ba1b6a6ae5eb46132fa5ac5fec (diff)
Fix more Python lint.
Most of the complaints fixed here were from Pocketlint, but many were also from Syntastic, the vim plugin.
Diffstat (limited to 'scripts')
-rw-r--r--  scripts/ems/support/defaultconfig.py               |  85
-rwxr-xr-x  scripts/ems/support/mml-filter.py                  | 273
-rwxr-xr-x  scripts/generic/bsbleu.py                          | 199
-rw-r--r--  scripts/server/moses.py                            | 410
-rwxr-xr-x  scripts/server/sim-pe.py                           | 239
-rw-r--r--  scripts/tokenizer/pre_tokenize_cleaning.py         |  63
-rwxr-xr-x  scripts/training/filter-rule-table.py              |  44
-rwxr-xr-x  scripts/training/rdlm/average_null_embedding.py    |  24
-rwxr-xr-x  scripts/training/rdlm/extract_syntactic_ngrams.py  | 249
-rwxr-xr-x  scripts/training/rdlm/extract_vocab.py             |  76
-rwxr-xr-x  scripts/training/rdlm/train_rdlm.py                | 332
-rwxr-xr-x  scripts/training/wrappers/conll2mosesxml.py        | 113
-rwxr-xr-x  scripts/training/wrappers/mosesxml2brackets.py     |  16
13 files changed, 1217 insertions, 906 deletions
diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py
index e88b63e3d..a118e96b3 100644
--- a/scripts/ems/support/defaultconfig.py
+++ b/scripts/ems/support/defaultconfig.py
@@ -1,53 +1,48 @@
#!/usr/bin/env python2
-#
-# Version of ConfigParser which accepts default values
-#
+"""Version of ConfigParser which accepts default values."""
import ConfigParser
class Config:
- def __init__(self,filename):
- self.config = ConfigParser.SafeConfigParser()
- cfh = open(filename)
- self.config.readfp(cfh)
- cfh.close()
-
- def get(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.get(section,name)
- else:
- return default
-
- def getint(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.getint(section,name)
- else:
- return default
-
-
- def getboolean(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.getboolean(section,name)
- else:
- return default
-
-
- def getfloat(self,section,name,default=None):
- if default == None or self.config.has_option(section,name):
- return self.config.getfloat(section,name)
- else:
- return default
-
-
- def __str__(self):
- ret = ""
- for section in self.config.sections():
- for option in self.config.options(section):
- ret = ret + "%s:%s = %s\n" % (section,option,self.config.get(section,option))
- return ret
-
-
-
+ """Version of ConfigParser which accepts default values."""
+
+ def __init__(self, filename):
+ self.config = ConfigParser.SafeConfigParser()
+ cfh = open(filename)
+ self.config.readfp(cfh)
+ cfh.close()
+
+ def get(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.get(section, name)
+ else:
+ return default
+
+ def getint(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.getint(section, name)
+ else:
+ return default
+
+ def getboolean(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.getboolean(section, name)
+ else:
+ return default
+
+ def getfloat(self, section, name, default=None):
+ if default is None or self.config.has_option(section, name):
+ return self.config.getfloat(section, name)
+ else:
+ return default
+
+ def __str__(self):
+ ret = ""
+ for section in self.config.sections():
+ for option in self.config.options(section):
+ ret = ret + "%s:%s = %s\n" % (
+ section, option, self.config.get(section, option))
+ return ret
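For orientation, a small usage sketch (not part of the patch) of the Config wrapper above, in the same Python 2 style as the script. The file name "filter.cfg" and the "timeout" option are hypothetical; the sketch assumes the file defines a [general] section and sits next to defaultconfig.py:

    from defaultconfig import Config

    config = Config("filter.cfg")
    # Falls back to the supplied default when the option is missing;
    # without a default, the underlying ConfigParser raises as usual.
    strategy = config.get("general", "strategy", "Random")
    timeout = config.getint("general", "timeout", 30)
    print strategy, timeout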
diff --git a/scripts/ems/support/mml-filter.py b/scripts/ems/support/mml-filter.py
index 5fb43d71e..8e865c801 100755
--- a/scripts/ems/support/mml-filter.py
+++ b/scripts/ems/support/mml-filter.py
@@ -1,156 +1,171 @@
#!/usr/bin/env python2
-#
-# Filter a parallel corpus
-#
+"""Filter a parallel corpus."""
+
-import heapq
import logging
-import math
import optparse
import random
-import sys
from defaultconfig import Config
-logging.basicConfig(format = "%(asctime)-15s %(message)s")
+
+logging.basicConfig(format="%(asctime)-15s %(message)s")
log = logging.getLogger("filter")
log.setLevel(logging.DEBUG)
+
class FilterStrategy(object):
- def __init__(self,config):
- pass
+ def __init__(self, config):
+ pass
- def filter(self,source,target):
- return True
+ def filter(self, source, target):
+ return True
class RandomFilterStrategy(FilterStrategy):
- def __init__(self,config):
- self.threshold = config.getfloat("random", "threshold", 0.1)
- random.seed()
+ def __init__(self, config):
+ self.threshold = config.getfloat("random", "threshold", 0.1)
+ random.seed()
- def filter(self, source, target):
- return random.random() < self.threshold
+ def filter(self, source, target):
+ return random.random() < self.threshold
class ScoreFilterStrategy(FilterStrategy):
- """Filter strategy that is based on a file with sentence scores. There are three
- possible ways of specifying how to filter:
- i) threshold - filter all sentence pairs whose score is less than the threshold
- ii) proportion - filter all but a certain proportion (eg a tenth) of the sentences
+ """Filter strategy that is based on a file with sentence scores.
+
+ There are three possible ways of specifying how to filter:
+ i) threshold - filter all sentence pairs whose score is less than the
+ threshold.
+ ii) proportion - filter all but a certain proportion (eg a tenth) of the
+ sentences.
iii) count - filter all but a given count of the sentences.
"""
- def __init__(self,config):
- section = "score"
- self.score_file = config.get(section,"score_file")
- self.ignore_score = config.get(section, "ignore_score", "99999")
- option_names = ("threshold", "proportion", "count")
- options = [config.config.has_option(section,o) for o in option_names]
- if sum(options) != 1:
- raise RuntimeError("Must specify exactly one of %s for score filter" % str(option_names))
- if options[0]:
- # threshold
- self.threshold = config.getfloat(section,option_names[0])
- else:
- # proportion or count
- if options[2]:
- count = config.getint(section,option_names[2])
- else:
- # need to count entries
- count = 0
- ignore_count = 0
- for line in open(self.score_file):
- if line[:-1] != self.ignore_score:
- count = count + 1
- else:
- ignore_count = ignore_count + 1
- count = int(count * config.getfloat(section,option_names[1]))
- log.info("Retaining at least %d entries and ignoring %d" % (count, ignore_count))
- # Find the threshold
- self.threshold = sorted(\
- [float(line[:-1]) for line in open(self.score_file)], reverse=True)[ignore_count + count]
- #self.threshold = heapq.nlargest(count, \
- # [float(line[:-1]) for line in open(self.score_file)])[-1]
-
-
- self.sfh = open(self.score_file)
- log.info("Thresholding scores at " + str(self.threshold))
-
- def filter(self,source,target):
- score = self.sfh.readline()
- if not score:
- raise RuntimeError("score file truncated")
- return score[:-1] == self.ignore_score or float(score[:-1]) >= self.threshold
-
+
+ def __init__(self, config):
+ section = "score"
+ self.score_file = config.get(section, "score_file")
+ self.ignore_score = config.get(section, "ignore_score", "99999")
+ option_names = ("threshold", "proportion", "count")
+ options = [config.config.has_option(section, o) for o in option_names]
+ if sum(options) != 1:
+ raise RuntimeError(
+ "Must specify exactly one of %s for score filter"
+ % str(option_names))
+ if options[0]:
+ # Threshold.
+ self.threshold = config.getfloat(section, option_names[0])
+ else:
+ # proportion or count
+ if options[2]:
+ count = config.getint(section, option_names[2])
+ else:
+ # Need to count entries.
+ count = 0
+ ignore_count = 0
+ for line in open(self.score_file):
+ if line[:-1] != self.ignore_score:
+ count += 1
+ else:
+ ignore_count = ignore_count + 1
+ count = int(count * config.getfloat(section, option_names[1]))
+ log.info(
+ "Retaining at least %d entries and ignoring %d"
+ % (count, ignore_count))
+ # Find the threshold.
+ self.threshold = sorted([
+ float(line[:-1])
+ for line in open(self.score_file)],
+ reverse=True)[ignore_count + count]
+ # import heapq
+ # self.threshold = heapq.nlargest(
+ # count,
+ # [float(line[:-1]) for line in open(self.score_file)])[-1]
+
+ self.sfh = open(self.score_file)
+ log.info("Thresholding scores at " + str(self.threshold))
+
+ def filter(self, source, target):
+ score = self.sfh.readline()
+ if not score:
+ raise RuntimeError("score file truncated")
+ return (
+ score[:-1] == self.ignore_score or
+ float(score[:-1]) >= self.threshold
+ )
+
def main():
- parser = optparse.OptionParser(usage = "Usage: %prog [options] config-file")
- (options,args) = parser.parse_args()
- if len(args) < 1:
- parser.error("No configuration file specified")
-
- log.info("Loading configuration from " + args[0])
- config = Config(args[0])
- log.debug("Configuration:\n" + str(config))
-
- # Required general parameters
- source_lang = config.get("general", "source_language")
- target_lang = config.get("general", "target_language")
- input_stem = config.get("general", "input_stem")
- output_stem = config.get("general", "output_stem")
- strategy = config.get("general", "strategy", "")
-
- # Optional general parameters
- alignment_stem = config.get("general", "alignment_stem", "")
- alignment_type = config.get("general", "alignment_type", "grow-diag-final-and")
- domain_file_in = config.get("general", "domain_file", "")
- domain_file_out = config.get("general", "domain_file_out", "")
-
- strategy_class = globals()[strategy + "FilterStrategy"]
- strategy = strategy_class(config)
-
- source_input_fh = open(input_stem + "." + source_lang)
- target_input_fh = open(input_stem + "." + target_lang)
- source_output_fh = open(output_stem + "." + source_lang, "w")
- target_output_fh = open(output_stem + "." + target_lang, "w")
-
- alignment_input_fh = None
- alignment_output_fh = None
- if alignment_stem:
- alignment_input_fh = open(alignment_stem + "." + alignment_type)
- alignment_output_fh = open(output_stem + "." + alignment_type,"w")
-
- domain_boundaries = {}
- if domain_file_in:
- dfh = open(domain_file_in)
- for line in dfh:
- line_no,name = line[:-1].split()
- domain_boundaries[int(line_no)] = name
-
- domain_output_fh = None
- if domain_file_out:
- domain_output_fh = open(domain_file_out, "w")
-
- #log.info(str(domain_boundaries))
-
- retained = 0
- line_no = 0
- for source_line in source_input_fh:
- target_line = target_input_fh.readline()
- if alignment_input_fh:
- align_line = alignment_input_fh.readline()
- if strategy.filter(source_line,target_line):
- retained = retained + 1
- print>>source_output_fh, source_line,
- print>>target_output_fh, target_line,
- if alignment_input_fh:
- print>>alignment_output_fh, align_line,
- line_no = line_no + 1
- # check if this is a domain boundary
- if domain_boundaries and domain_boundaries.has_key(line_no):
- print>>domain_output_fh,"%d %s" % (retained,domain_boundaries[line_no])
- log.info("Lines retained: %d" % retained)
+ parser = optparse.OptionParser(usage="Usage: %prog [options] config-file")
+ (options, args) = parser.parse_args()
+ if len(args) < 1:
+ parser.error("No configuration file specified")
+
+ log.info("Loading configuration from " + args[0])
+ config = Config(args[0])
+ log.debug("Configuration:\n" + str(config))
+
+ # Required general parameters
+ source_lang = config.get("general", "source_language")
+ target_lang = config.get("general", "target_language")
+ input_stem = config.get("general", "input_stem")
+ output_stem = config.get("general", "output_stem")
+ strategy = config.get("general", "strategy", "")
+
+ # Optional general parameters
+ alignment_stem = config.get("general", "alignment_stem", "")
+ alignment_type = config.get(
+ "general", "alignment_type", "grow-diag-final-and")
+ domain_file_in = config.get("general", "domain_file", "")
+ domain_file_out = config.get("general", "domain_file_out", "")
+
+ strategy_class = globals()[strategy + "FilterStrategy"]
+ strategy = strategy_class(config)
+
+ source_input_fh = open(input_stem + "." + source_lang)
+ target_input_fh = open(input_stem + "." + target_lang)
+ source_output_fh = open(output_stem + "." + source_lang, "w")
+ target_output_fh = open(output_stem + "." + target_lang, "w")
+
+ alignment_input_fh = None
+ alignment_output_fh = None
+ if alignment_stem:
+ alignment_input_fh = open(alignment_stem + "." + alignment_type)
+ alignment_output_fh = open(output_stem + "." + alignment_type, "w")
+
+ domain_boundaries = {}
+ if domain_file_in:
+ dfh = open(domain_file_in)
+ for line in dfh:
+ line_no, name = line[:-1].split()
+ domain_boundaries[int(line_no)] = name
+
+ domain_output_fh = None
+ if domain_file_out:
+ domain_output_fh = open(domain_file_out, "w")
+
+ # log.info(str(domain_boundaries))
+
+ retained = 0
+ line_no = 0
+ for source_line in source_input_fh:
+ target_line = target_input_fh.readline()
+ if alignment_input_fh:
+ align_line = alignment_input_fh.readline()
+ if strategy.filter(source_line, target_line):
+ retained = retained + 1
+ print>>source_output_fh, source_line,
+ print>>target_output_fh, target_line,
+ if alignment_input_fh:
+ print>>alignment_output_fh, align_line,
+ line_no = line_no + 1
+ # Check if this is a domain boundary.
+ if domain_boundaries and line_no in domain_boundaries:
+ print >>domain_output_fh, (
+ "%d %s" % (retained, domain_boundaries[line_no]))
+ log.info("Lines retained: %d", retained)
+
if __name__ == "__main__":
- main()
+ main()
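The script picks its filter class by name from the configuration. A minimal self-contained sketch of that lookup follows; the make_strategy helper and the example sentences are illustrative and not part of mml-filter.py:

    import random


    class RandomFilterStrategy(object):
        def __init__(self, threshold=0.1):
            self.threshold = threshold
            random.seed()

        def filter(self, source, target):
            # Keep roughly `threshold` of all sentence pairs.
            return random.random() < self.threshold


    def make_strategy(name, **kwargs):
        # mml-filter.py does the equivalent with
        # globals()[strategy + "FilterStrategy"](config).
        return globals()[name + "FilterStrategy"](**kwargs)


    strategy = make_strategy("Random", threshold=0.1)
    print strategy.filter("ein satz .\n", "a sentence .\n")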
diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py
index ff86fed5e..12d2201de 100755
--- a/scripts/generic/bsbleu.py
+++ b/scripts/generic/bsbleu.py
@@ -2,73 +2,73 @@
# compute Bleu scores with confidence intervals via bootstrap resampling
# written by Ulrich Germann
-import math,sys,os
from argparse import ArgumentParser
-from operator import itemgetter
-from random import randint
-from operator import itemgetter
+import math
+import os
+from random import randint
+import sys
-def count_ngrams(snt,max_n):
+
+def count_ngrams(snt, max_n):
"""
- Return a dictionary of ngram counts (up to length /max_n/)
- for sentence (list of words) /snt/.
+ Return a dictionary of ngram counts (up to length /max_n/)
+ for sentence (list of words) /snt/.
"""
ret = {}
for i in xrange(len(snt)):
- for k in xrange(i+1,min(i+max_n+1,len(snt)+1)):
+ for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
key = tuple(snt[i:k])
- ret[key] = ret.get(key,0) + 1
- pass
- pass
+ ret[key] = ret.get(key, 0) + 1
return ret
-def max_counts(ng1,ng2):
+
+def max_counts(ng1, ng2):
"""
- Return a dicitonary of ngram counts such that
+    Return a dictionary of ngram counts such that
each count is the greater of the two individual counts
- for each ngram in the input ngram count dictionaries
+ for each ngram in the input ngram count dictionaries
/ng1/ and /ng2/.
"""
ret = ng1.copy()
- for k,v in ng2.items():
- ret[k] = max(ret.get(k,0),v)
- pass
+ for k, v in ng2.items():
+ ret[k] = max(ret.get(k, 0), v)
return ret
-def ng_hits(hyp,ref,max_n):
+
+def ng_hits(hyp, ref, max_n):
"""
- return a list of ngram counts such that each ngram count
- is the minimum of the counts in hyp and ref, up to ngram
- length /max_n/
+ Return a list of ngram counts such that each ngram count
+ is the minimum of the counts in hyp and ref, up to ngram
+ length /max_n/.
"""
ret = [0 for i in xrange(max_n)]
- for ng,cnt in hyp.items():
+ for ng, cnt in hyp.items():
k = ng
if len(k) <= max_n:
- ret[len(k)-1] += min(cnt,ref.get(ng,0))
- pass
- pass
+ ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
return ret
+
class BleuScore:
- def __init__(self,hyp,ref,max_n=4,bootstrap=1000):
- # print len(hyp.ngrams),len(ref.ngrams),"X"
- self.hits = [ng_hits(hyp.ngrams[i],ref.ngrams[i],max_n)
- for i in xrange(len(hyp.ngrams))]
- self.max_n = max_n
- self.hyp = hyp
- self.ref = ref
- self.lower = None
- self.upper = None
+ def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
+ # print len(hyp.ngrams), len(ref.ngrams), "X"
+ self.hits = [
+ ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
+ for i in xrange(len(hyp.ngrams))]
+ self.max_n = max_n
+ self.hyp = hyp
+ self.ref = ref
+ self.lower = None
+ self.upper = None
self.median = None
- self.bootstrap = [self.score([randint(0,len(hyp.snt)-1) for s in hyp.snt])
- for i in xrange(1000)]
+ self.bootstrap = [
+ self.score([randint(0, len(hyp.snt) - 1) for s in hyp.snt])
+ for i in xrange(1000)]
self.bootstrap.sort()
self.actual = self.score([i for i in xrange(len(hyp.snt))])
- return
-
- def score(self,sample):
- hits = [0 for i in xrange(self.max_n)]
+
+ def score(self, sample):
+ hits = [0 for i in xrange(self.max_n)]
self.hyplen = 0
self.reflen = 0
for i in sample:
@@ -76,94 +76,89 @@ class BleuScore:
self.reflen += len(self.ref.snt[i])
for n in xrange(self.max_n):
hits[n] += self.hits[i][n]
- pass
- pass
- self.prec = [float(hits[n])/(self.hyplen-n*len(sample))
+ self.prec = [float(hits[n]) / (self.hyplen - n * len(sample))
for n in xrange(self.max_n)]
- ret = sum([math.log(x) for x in self.prec])/self.max_n
- self.BP = min(1,math.exp(1.-float(self.reflen)/float(self.hyplen)))
+ ret = sum([math.log(x) for x in self.prec]) / self.max_n
+ self.BP = min(
+ 1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
ret += math.log(self.BP)
return math.exp(ret)
-
+
+
class Document:
- def __init__(self,fname=None):
+ def __init__(self, fname=None):
self.fname = fname
if fname:
self.snt = [line.strip().split() for line in open(fname)]
- self.ngrams = [count_ngrams(snt,4) for snt in self.snt]
+ self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
else:
self.snt = None
self.ngrams = None
- pass
- return
- def merge(self,R):
+ def merge(self, R):
self.fname = "multi-ref"
self.ngrams = [x for x in R[0].ngrams]
self.snt = [x for x in R[0].snt]
for i in xrange(len(R[0].ngrams)):
- for k in xrange(1,len(R)):
- self.ngrams[i] = max_counts(self.ngrams[i],R[k].ngrams[i])
- pass
- pass
- return
-
- def update(self,hyp,R):
- for i in xrange(len(hyp.snt)):
- clen = len(hyp.snt[i])
+ for k in xrange(1, len(R)):
+ self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
+
+ def update(self, hyp, R):
+ for i, hyp_snt in enumerate(hyp.snt):
+ clen = len(hyp_snt)
K = 0
- for k in xrange(1,len(R)):
- assert len(R[k].snt) == len(hyp.snt),\
- "Mismatch in numer of sentences " +\
- "between reference and candidate"
- if abs(len(R[k].snt[i]) - clen) == abs(len(R[K].snt[i]) - clen):
- if len(R[k].snt[i]) < len(R[K].snt[i]):
+ for k in xrange(1, len(R)):
+ k_snt = R[k].snt[i]
+ assert len(R[k].snt) == len(hyp.snt), (
+ "Mismatch in number of sentences " +
+ "between reference and candidate")
+ if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
+ if len(k_snt) < len(R[K].snt[i]):
K = k
- pass
- pass
- elif abs(len(R[k].snt[i]) - clen) < abs(len(R[K].snt[i]) - clen):
+ elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
K = k
- pass
- pass
self.snt[i] = R[K].snt[i]
- pass
- return
-
- pass
+
if __name__ == "__main__":
argparser = ArgumentParser()
- argparser.add_argument("-r","--ref",nargs='+',help="reference translation(s)")
- argparser.add_argument("-c","--cand",nargs='+',help="candidate translations")
- argparser.add_argument("-i","--individual",action='store_true',
- help="compute BLEU scores for individual references")
- argparser.add_argument("-b","--bootstrap",type=int,default=1000,
- help="sample size for bootstrap resampling")
- argparser.add_argument("-a","--alpha",help="1-alpha = confidence interval",type=float,default=.05)
+ argparser.add_argument(
+ "-r", "--ref", nargs='+', help="Reference translation(s).")
+ argparser.add_argument(
+ "-c", "--cand", nargs='+', help="Candidate translations.")
+ argparser.add_argument(
+ "-i", "--individual", action='store_true',
+ help="Compute BLEU scores for individual references.")
+ argparser.add_argument(
+ "-b", "--bootstrap", type=int, default=1000,
+ help="Sample size for bootstrap resampling.")
+ argparser.add_argument(
+ "-a", "--alpha", type=float, default=.05,
+ help="1-alpha = confidence interval.")
args = argparser.parse_args(sys.argv[1:])
- R = [ Document(fname) for fname in args.ref]
- C = [ Document(fname) for fname in args.cand]
- Rx = Document() # for multi-reference BLEU
+ R = [Document(fname) for fname in args.ref]
+ C = [Document(fname) for fname in args.cand]
+ Rx = Document() # for multi-reference BLEU
Rx.merge(R)
for c in C:
# compute multi-reference BLEU
- Rx.update(c,R)
- bleu = BleuScore(c,Rx,bootstrap=args.bootstrap)
- print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s"%\
- (100*bleu.actual,
- os.path.basename(Rx.fname),
- 100*bleu.bootstrap[int((args.alpha/2)*args.bootstrap)],
- 100*bleu.bootstrap[int((1-(args.alpha/2))*args.bootstrap)],
- 100*bleu.bootstrap[int(.5*args.bootstrap)],
- c.fname) # os.path.basename(c.fname))
+ Rx.update(c, R)
+ bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
+ print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
+ 100 * bleu.actual,
+ os.path.basename(Rx.fname),
+ 100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
+ 100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
+ 100 * bleu.bootstrap[int(.5 * args.bootstrap)],
+ c.fname) # os.path.basename(c.fname))
if args.individual:
for r in R:
- bleu = BleuScore(c,r,bootstrap=args.bootstrap)
- print " %5.2f %s"%(100*bleu.actual,os.path.basename(r.fname))
- # print bleu.prec,bleu.hyplen,bleu.reflen,bleu.BP
- pass
- pass
-
- # print [sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) for n in xrange(4)]
- pass
+ bleu = BleuScore(c, r, bootstrap=args.bootstrap)
+ print " %5.2f %s" % (
+ 100 * bleu.actual, os.path.basename(r.fname))
+ # print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
+
+ # print [
+ # sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
+ # for n in xrange(4)]
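A self-contained sketch of the n-gram counting that bsbleu.py builds its BLEU statistics from (same logic as count_ngrams above, shown here with a toy sentence):

    def count_ngrams(snt, max_n):
        # Dictionary mapping each n-gram (as a tuple of words) to its count.
        ret = {}
        for i in xrange(len(snt)):
            for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
                key = tuple(snt[i:k])
                ret[key] = ret.get(key, 0) + 1
        return ret


    hyp = "the cat sat on the mat".split()
    counts = count_ngrams(hyp, 2)
    print counts[("the",)]        # 2
    print counts[("the", "cat")]  # 1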
diff --git a/scripts/server/moses.py b/scripts/server/moses.py
index a176c473a..7cf152187 100644
--- a/scripts/server/moses.py
+++ b/scripts/server/moses.py
@@ -1,237 +1,225 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Python utilities for moses
-#
-# This package mostly wraps standard Moses utilities into pipes.
-#
-# Written by Ulrich Germann
-#
-# This package borrows from scripts written by Christian Buck
-#
-# The package assumes that there is a complete moses installation
-# (including scripts) under one root directory,
-# e.g., via
-# bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
-# By default, this root directory is "${HOME}/moses".
-
-import xmlrpclib,datetime,argparse,time,os,sys
-from subprocess import *
-from unicodedata import normalize
-
-moses_root = os.environ.get('MOSES_ROOT',os.environ.get('HOME')+"/moses")
+"""
+Python utilities for moses
+
+This package mostly wraps standard Moses utilities into pipes.
+
+Written by Ulrich Germann
+
+This package borrows from scripts written by Christian Buck
+
+The package assumes that there is a complete moses installation
+(including scripts) under one root directory,
+e.g., via ::
+ bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
+By default, this root directory is "${HOME}/moses".
+"""
+
+import os
+import sys
+import time
+import xmlrpclib
+from subprocess import (
+ PIPE,
+ Popen,
+ )
+
+
+moses_root = os.environ.get('MOSES_ROOT', os.environ.get('HOME') + "/moses")
+
class ProcessWrapper:
- def __init__(self,cmd=[]):
- self.process = None
- self.cmd = cmd
- return
+ def __init__(self, cmd=[]):
+ self.process = None
+ self.cmd = cmd
+
+ def start(self, stdin=PIPE, stdout=PIPE):
+ if self.process:
+ raise Exception("Process is already running")
+ self.process = Popen(self.cmd, stdin=stdin, stdout=stdout)
- def start(self, stdin=PIPE, stdout=PIPE):
- if self.process:
- raise Exception("Process is already running")
- self.process = Popen(self.cmd, stdin = stdin, stdout = stdout)
- return
+ def __del__(self):
+ if self.process:
+ self.process.terminate()
- def __del__(self):
- if self.process:
- self.process.terminate()
- pass
- return
- pass
class LineProcessor(ProcessWrapper):
- def __call__(self,input):
- if not self.process: self.start()
- self.process.stdin.write("%s\n"%input.strip())
- self.process.stdin.flush()
- return self.process.stdout.readline().strip()
- pass
+ def __call__(self, input):
+ if not self.process:
+ self.start()
+ self.process.stdin.write("%s\n" % input.strip())
+ self.process.stdin.flush()
+ return self.process.stdout.readline().strip()
+
class SentenceSplitter(ProcessWrapper):
- """
- Wrapper for standard Moses sentence splitter
- """
- def __init__(self,lang):
- ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl"
- self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang]
- self.process = None
- return
-
- def __call__(self,input):
- if not self.process:
- self.start()
- pass
- self.process.stdin.write(input.strip() + "\n<P>\n")
- self.process.stdin.flush()
- x = self.process.stdout.readline().strip()
- ret = []
- while x != '<P>' and x != '':
- ret.append(x)
- x = self.process.stdout.readline().strip()
- pass
- return ret
+ """Wrapper for standard Moses sentence splitter."""
+
+ def __init__(self, lang):
+ ssplit_cmd = moses_root + "/scripts/ems/support/split-sentences.perl"
+ self.cmd = [ssplit_cmd, "-b", "-q", "-l", lang]
+ self.process = None
+
+ def __call__(self, input):
+ if not self.process:
+ self.start()
+ self.process.stdin.write(input.strip() + "\n<P>\n")
+ self.process.stdin.flush()
+ x = self.process.stdout.readline().strip()
+ ret = []
+ while x != '<P>' and x != '':
+ ret.append(x)
+ x = self.process.stdout.readline().strip()
+ return ret
+
class Pretokenizer(LineProcessor):
- """
- Pretokenizer wrapper; the pretokenizer fixes known issues with the input.
- """
- def __init__(self,lang):
- pretok_cmd = moses_root+"/scripts/tokenizer/pre-tokenizer.perl"
- self.cmd = [pretok_cmd,"-b", "-q", "-l",lang]
- self.process = None
- return
- pass
+ """Pretokenizer wrapper.
+
+ The pretokenizer fixes known issues with the input.
+ """
+ def __init__(self, lang):
+ pretok_cmd = moses_root + "/scripts/tokenizer/pre-tokenizer.perl"
+ self.cmd = [pretok_cmd, "-b", "-q", "-l", lang]
+ self.process = None
+
class Tokenizer(LineProcessor):
- """
- Tokenizer wrapper; the pretokenizer fixes known issues with the input.
- """
- def __init__(self,lang,args=["-a","-no-escape"]):
- tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl"
- self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args
- self.process = None
- return
-
+ """Tokenizer wrapper.
+
+    Wraps the standard Moses tokenizer (tokenizer.perl).
+ """
+ def __init__(self, lang, args=["-a", "-no-escape"]):
+ tok_cmd = moses_root + "/scripts/tokenizer/tokenizer.perl"
+ self.cmd = [tok_cmd, "-b", "-q", "-l", lang] + args
+ self.process = None
+
+
class Truecaser(LineProcessor):
- """
- Truecaser wrapper.
- """
- def __init__(self,model):
- truecase_cmd = moses_root+"/scripts/recaser/truecase.perl"
- self.cmd = [truecase_cmd,"-b", "--model",model]
- self.process = None
- return
- pass
+ """Truecaser wrapper."""
+ def __init__(self, model):
+ truecase_cmd = moses_root + "/scripts/recaser/truecase.perl"
+ self.cmd = [truecase_cmd, "-b", "--model", model]
+ self.process = None
+
class LineProcessorPipeline:
- """
- Line processor: one line in, one line out
- """
- def __init__(self,parts=[]):
- self.chain = [LineProcessor(p.cmd) for p in parts]
- return
-
- def start(self):
- if len(self.chain) == 0:
- return
- if self.chain[0].process:
- return
- self.chain[0].start()
- for i in xrange(1,len(self.chain)):
- self.chain[i].start(stdin = self.chain[i-1].process.stdout)
- pass
- return
-
- def __call__(self,input):
- if len(self.chain) == 0:
- return input
- self.start()
- self.chain[0].process.stdin.write("%s\n"%input.strip())
- self.chain[0].process.stdin.flush()
- return self.chain[0].process.stdout.readline().strip()
-
- pass
+ """Line processor: one line in, one line out."""
+ def __init__(self, parts=[]):
+ self.chain = [LineProcessor(p.cmd) for p in parts]
+
+ def start(self):
+ if len(self.chain) == 0:
+ return
+ if self.chain[0].process:
+ return
+ self.chain[0].start()
+ for i in xrange(1, len(self.chain)):
+ self.chain[i].start(stdin=self.chain[i - 1].process.stdout)
+
+ def __call__(self, input):
+ if len(self.chain) == 0:
+ return input
+ self.start()
+ self.chain[0].process.stdin.write("%s\n" % input.strip())
+ self.chain[0].process.stdin.flush()
+ return self.chain[0].process.stdout.readline().strip()
-def find_free_port(p):
- """
- Find a free port, starting at /p/.
- Return the free port, or False if none found.
- """
- ret = p
- while ret - p < 20:
- devnull = open(os.devnull,"w")
- n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull)
- if n.communicate()[0].find(":%d "%ret) < 0:
- return p
- ret += 1
- pass
- return False
-class MosesServer(ProcessWrapper):
+def find_free_port(p):
+ """Find a free port, starting at /p/.
- def __init__(self,args=[]):
- self.process = None
- mserver_cmd = moses_root+"/bin/mosesserver"
- self.cmd = [mserver_cmd] + args
- self.url = None
- self.proxy = None
- return
-
- def start(self,config=None,args=[],port=7447,debug=False):
- self.cmd.extend(args)
- if config:
- if "-f" in args:
- raise Exception("Config file specified twice")
- else:
- self.cmd.extend(["-f",config])
- pass
- pass
- self.port = port # find_free_port(port)
- if not self.port:
- raise Excpetion("Cannot find free port for moses server!")
- self.cmd.extend(["--server-port", "%d"%self.port])
- if debug:
- print >>sys.stderr,self.cmd
- # self.stderr = open("mserver.%d.stderr"%self.port,'w')
- # self.stdout = open("mserver.%d.stdout"%self.port,'w')
- # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
- self.process = Popen(self.cmd)
- else:
- devnull = open(os.devnull,"w")
- self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
- pass
-
- if self.process.poll():
- raise Exception("FATAL ERROR: Could not launch moses server!")
- if debug:
- print >>sys.stderr,"MOSES port is %d."%self.port
- print >>sys.stderr,"Moses poll status is", self.process.poll()
- pass
-
- self.url = "http://localhost:%d/RPC2"%self.port
- self.connect(self.url)
-
- return True
-
- def connect(self,url):
- if url[:4] != "http": url = "http://%s"%url
- if url[-5:] != "/RPC2": url += "/RPC2"
- self.url = url
- self.proxy = xmlrpclib.ServerProxy(self.url)
- return
-
- def translate(self,input):
- attempts = 0
- while attempts < 100:
- try:
- if type(input) is unicode:
- # if the server does not expect unicode, provide a
- # properly encoded string!
- param = {'text': input.strip().encode('utf8')}
- return self.proxy.translate(param)['text'].decode('utf8')
-
- elif type(input) is str:
- param = {'text': input.strip()}
- return self.proxy.translate(param)['text']
-
- elif type(input) is list:
- return [self.translate(x) for x in input]
-
- elif type(input) is dict:
- return self.proxy.translate(input)
+ :return: The free port, or False if none found.
+ """
+ ret = p
+ while ret - p < 20:
+ devnull = open(os.devnull, "w")
+ n = Popen(["netstat", "-tnp"], stdout=PIPE, stderr=devnull)
+ if n.communicate()[0].find(":%d " % ret) < 0:
+ return p
+ ret += 1
+ return False
- else:
- raise Exception("Can't handle input of this type!")
- except:
- attempts += 1
- print >>sys.stderr, "WAITING", attempts
- time.sleep(1)
- pass
- pass
- raise Exception("Translation request failed")
- pass
+class MosesServer(ProcessWrapper):
+ def __init__(self, args=[]):
+ self.process = None
+ mserver_cmd = moses_root + "/bin/mosesserver"
+ self.cmd = [mserver_cmd] + args
+ self.url = None
+ self.proxy = None
+
+ def start(self, config=None, args=[], port=7447, debug=False):
+ self.cmd.extend(args)
+ if config:
+ if "-f" in args:
+ raise Exception("Config file specified twice")
+ else:
+ self.cmd.extend(["-f", config])
+ self.port = port # find_free_port(port)
+ if not self.port:
+ raise Exception("Cannot find free port for moses server!")
+ self.cmd.extend(["--server-port", "%d" % self.port])
+ if debug:
+ print >>sys.stderr, self.cmd
+ # self.stderr = open("mserver.%d.stderr"%self.port,'w')
+ # self.stdout = open("mserver.%d.stdout"%self.port,'w')
+ # self.process = Popen(
+ # self.cmd, stderr=self.stderr, stdout=self.stdout)
+ self.process = Popen(self.cmd)
+ else:
+ devnull = open(os.devnull, "w")
+ self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
+
+ if self.process.poll():
+ raise Exception("FATAL ERROR: Could not launch moses server!")
+ if debug:
+ print >>sys.stderr, "MOSES port is %d." % self.port
+ print >>sys.stderr, "Moses poll status is", self.process.poll()
+
+ self.url = "http://localhost:%d/RPC2" % self.port
+ self.connect(self.url)
+
+ return True
+
+ def connect(self, url):
+ if url[:4] != "http":
+ url = "http://%s" % url
+ if url[-5:] != "/RPC2":
+ url += "/RPC2"
+ self.url = url
+ self.proxy = xmlrpclib.ServerProxy(self.url)
+
+ def translate(self, input):
+ attempts = 0
+ while attempts < 100:
+ try:
+ if type(input) is unicode:
+ # If the server does not expect unicode, provide a
+ # properly encoded string!
+ param = {'text': input.strip().encode('utf8')}
+ return self.proxy.translate(param)['text'].decode('utf8')
+
+ elif type(input) is str:
+ param = {'text': input.strip()}
+ return self.proxy.translate(param)['text']
+
+ elif type(input) is list:
+ return [self.translate(x) for x in input]
+
+ elif type(input) is dict:
+ return self.proxy.translate(input)
+
+ else:
+ raise Exception("Can't handle input of this type!")
+
+ except:
+ attempts += 1
+ print >>sys.stderr, "WAITING", attempts
+ time.sleep(1)
+ raise Exception("Translation request failed")
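A hedged usage sketch for the wrappers in moses.py: the moses.ini path and the input sentences are placeholders, and a full Moses installation under MOSES_ROOT is assumed, as the module docstring requires:

    import moses

    # Launch a mosesserver instance and send it a tokenized sentence.
    server = moses.MosesServer()
    server.start(config="/path/to/moses.ini", port=7447, debug=True)
    print server.translate("das ist ein test .")

    # The line-processor wrappers pipe text through the standard scripts.
    tokenize = moses.Tokenizer("en")
    print tokenize("Hello, world!")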
diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py
index 52d1e314a..5f1407524 100755
--- a/scripts/server/sim-pe.py
+++ b/scripts/server/sim-pe.py
@@ -5,29 +5,39 @@
# This script simulates post-editing of MT output and incrementally
# updates the dynamic phrase tables in the moses server.
-import xmlrpclib,datetime,argparse,sys,os,time
+import argparse
+import os
+import sys
+import time
+import xmlrpclib
import moses
-from moses import MosesServer
-from subprocess import *
+from subprocess import (
+ PIPE,
+ Popen,
+ )
+
+
mserver = moses.MosesServer()
# We must perform some custom argument processing, as moses parameter
# specifications do not comply with the standards used in standard
# argument parsing packages; an isolated double dash separates script
# arguments from moses arguments
+
+
def split_args(all_args):
"""
Split argument list all_args into arguments specific to this script and
- arguments relating to the moses server. An isolated double dash acts as
- the separator between the two types of arguments.
+ arguments relating to the moses server. An isolated double dash acts as
+ the separator between the two types of arguments.
"""
my_args = []
mo_args = []
arglist = mo_args
i = 0
- # IMPORTANT: the code below must be coordinated with
+ # IMPORTANT: the code below must be coordinated with
# - the evolution of moses command line arguments
- # - mert-moses.pl
+ # - mert-moses.pl
while i < len(all_args):
# print i,"MY_ARGS", my_args
# print i,"MO_ARGS", mo_args
@@ -36,14 +46,16 @@ def split_args(all_args):
elif all_args[i] == "--]":
arglist = mo_args
elif all_args[i] == "-i" or all_args[i] == "-input-file":
- my_args.extend(["-i",all_args[i+1]])
+ my_args.extend(["-i", all_args[i + 1]])
i += 1
elif all_args[i] == "-inputtype":
- if all_args[i+1] != "0":
- # not yet supported! Therefore:
- errmsg = "FATAL ERROR: %s "%sys.argv[0]
- errmsg += "only supports plain text input at this point."
- raise Exception(errsmg)
+ if all_args[i + 1] != "0":
+ # Not yet supported! Therefore:
+ errmsg = (
+ "FATAL ERROR: "
+ "%s only supports plain text input at this point."
+ % sys.argv[0])
+ raise Exception(errmsg)
# my_args.extend(["--input-type",all_args[i+1]])
i += 1
elif all_args[i] == "-lattice-samples":
@@ -52,13 +64,14 @@ def split_args(all_args):
# mo_args[i:i+3] = []
# i += 2
# This is not yet supported! Therefore:
- errmsg = "FATAL ERROR: %s "%sys.argv[0]
- errmsg += "does not yet support lattice sampling."
- raise Exception(errsmg)
-
+ errmsg = (
+ "FATAL ERROR: %s does not yet support lattice sampling."
+ % sys.argv[0])
+ raise Exception(errmsg)
+
elif all_args[i] == "-n-best-list":
- my_args.extend(["--nbest",all_args[i+2]])
- my_args.extend(["--nbest-file",all_args[i+1]])
+ my_args.extend(["--nbest", all_args[i + 2]])
+ my_args.extend(["--nbest-file", all_args[i + 1]])
i += 2
elif all_args[i] == "-n-best-distinct":
@@ -70,128 +83,148 @@ def split_args(all_args):
i += 1
pass
- return my_args,mo_args
-
+ return my_args, mo_args
+
+
def interpret_args(my_args):
"""
Parse script-specific argument list.
"""
aparser = argparse.ArgumentParser()
- aparser.add_argument("-s","--server-cmd",default="mosesserver",
- dest="servercmd", help="path to moses server command")
- aparser.add_argument("--url",help="URL of external moses server.")
- aparser.add_argument("-p","--port", type=int, default=7447,
- help="port number to be used for server")
-
- # input / output
- aparser.add_argument("-i","--input",help="source file",default="-")
- aparser.add_argument("-r","--ref",help="reference translation",default=None)
- aparser.add_argument("-a","--aln",help="alignment",default=None)
- aparser.add_argument("-o","--output",default="-",help="output file")
- aparser.add_argument("-d","--debug",action="store_true",help="debug mode")
-
- # moses reporting options
- aparser.add_argument("-A","--with-alignment", dest="A",
- help="include alignment in output", action="store_true")
- aparser.add_argument("-G","--with-graph",type=bool, default=False, dest="G",
- help="include search graph info in output")
- aparser.add_argument("-T","--with-transopt",type=bool, default=False, dest = "T",
- help="include translation options info in output")
- aparser.add_argument("-F","--report-all-factors", action="store_true",dest="F",
- help="report all factors")
- aparser.add_argument("-n","--nbest",type=int,dest="nbest",default=0,
- help="size of nbest list")
- aparser.add_argument("-N","--nbest-file",dest="nbestFile",default=0,
- help="output file for nbest list")
- aparser.add_argument("-u","--nbest-distinct",type=bool,dest="U",default=False,
- help="report all factors")
+ aparser.add_argument(
+ "-s", "--server-cmd", default="mosesserver", dest="servercmd",
+ help="Path to moses server command.")
+ aparser.add_argument(
+ "--url", help="URL of external moses server.")
+ aparser.add_argument(
+ "-p", "--port", type=int, default=7447,
+ help="Port number to be used for server.")
+
+ # Input / output.
+ aparser.add_argument(
+ "-i", "--input", default='-', help="source file")
+ aparser.add_argument(
+ "-r", "--ref", default=None, help="Reference translation.")
+ aparser.add_argument(
+ "-a", "--aln", default=None, help="Alignment.")
+ aparser.add_argument(
+ "-o", "--output", default="-", help="Output file.")
+ aparser.add_argument(
+ "-d", "--debug", action='store_true', help="Debug mode.")
+
+ # Moses reporting options.
+ aparser.add_argument(
+ "-A", "--with-alignment", dest="A", action='store_true',
+ help="Include alignment in output.")
+ aparser.add_argument(
+ "-G", "--with-graph", type=bool, default=False, dest="G",
+ help="Include search graph info in output.")
+ aparser.add_argument(
+ "-T", "--with-transopt", type=bool, default=False, dest="T",
+ help="Include translation options info in output.")
+ aparser.add_argument(
+ "-F", "--report-all-factors", action="store_true", dest="F",
+ help="Report all factors.")
+ aparser.add_argument(
+ "-n", "--nbest", type=int, dest="nbest", default=0,
+ help="Size of nbest list.")
+ aparser.add_argument(
+ "-N", "--nbest-file", dest="nbestFile", default=0,
+ help="Output file for nbest list.")
+ aparser.add_argument(
+ "-u", "--nbest-distinct", type=bool, dest="U", default=False,
+ help="Report all factors.")
return aparser.parse_args(my_args)
-
+
+
def translate(proxy, args, line):
if type(line) is unicode:
- param = { 'text' : line.strip().encode('utf8') }
+ param = {'text': line.strip().encode('utf8')}
elif type(line) is str:
- param = { 'text' : line.strip() }
+ param = {'text': line.strip()}
else:
raise Exception("Can't handle input")
- if args.A: param['align'] = True
- if args.T: param['topt'] = True
- if args.F: param['report-all-factors'] = True
- if args.nbest:
+ if args.A:
+ param['align'] = True
+ if args.T:
+ param['topt'] = True
+ if args.F:
+ param['report-all-factors'] = True
+ if args.nbest:
param['nbest'] = int(args.nbest)
param['add-score-breakdown'] = True
pass
- if args.U:
+ if args.U:
param['nbest-distinct'] = True
pass
attempts = 0
while attempts < 20:
t1 = time.time()
try:
- return proxy.translate(param)
+ return proxy.translate(param)
# except xmlrpclib.Fault as e:
# except xmlrpclib.ProtocolError as e:
# except xmlrpclib.ResponseError as e:
except xmlrpclib.Error as e:
- time.sleep(2) # give all the stderr stuff a chance to be flushed
- print >>sys.stderr," XMLRPC error:",e
+ sys.stderr.flush()
+ print >>sys.stderr, " XMLRPC error:", e
print >>sys.stderr, "Input was"
print >>sys.stderr, param
sys.exit(1)
except IOError as e:
- print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
+ print >>sys.stderr, (
+ "I/O error({0}): {1}".format(e.errno, e.strerror))
time.sleep(5)
except:
serverstatus = mserver.process.poll()
- if serverstatus == None:
- print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
+ if serverstatus is None:
+ print >>sys.stderr, (
+ "Connection failed after %f seconds" % (time.time() - t1))
attempts += 1
if attempts > 10:
time.sleep(10)
else:
time.sleep(5)
- pass
else:
-
- print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
- %(serverstatus/256,serverstatus%256)
+ print >>sys.stderr, (
+ "Oopsidaisy, server exited with code %d (signal %d)"
+ % (serverstatus / 256, serverstatus % 256))
pass
pass
pass
raise Exception("Exception: could not reach translation server.")
-
+
def read_data(fname):
"""
Read and return data (source, target or alignment) from file fname.
"""
if fname[-3:] == ".gz":
- foo = Popen(["zcat",fname],stdout=PIPE)\
- .communicate()[0]\
- .strip().split('\n')
+ process = Popen(["zcat", fname], stdout=PIPE)
+ stdout, _ = process.communicate()
+ foo = stdout.strip().split('\n')
else:
foo = [x.strip() for x in open(fname).readlines()]
- pass
return foo
-def repack_result(idx,result):
+
+def repack_result(idx, result):
global args
if args.nbest:
for h in result['nbest']:
- fields = [idx,h['hyp'],h['fvals'],h['totalScore']]
+ fields = [idx, h['hyp'], h['fvals'], h['totalScore']]
for i in xrange(len(fields)):
if type(fields[i]) is unicode:
fields[i] = fields[i].encode('utf-8')
pass
pass
- # print fields
- print >>NBestFile,"%d ||| %s ||| %s ||| %f"%tuple(fields)
- pass
+ # Print fields.
+ print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields)
pass
if 'align' in result:
t = result['text'].split()
@@ -200,16 +233,14 @@ def repack_result(idx,result):
k = 0
for a in result['align']:
k = a['tgt-start']
- if k: print " ".join(t[i:k]).encode('utf8'),span,
+ if k:
+ print " ".join(t[i:k]).encode('utf8'), span,
i = k
- span = "|%d %d|"%(a['src-start'],a['src-end'])
- pass
- print " ".join(t[k:]).encode('utf8'),span
- pass
+ span = "|%d %d|" % (a['src-start'], a['src-end'])
+ print " ".join(t[k:]).encode('utf8'), span
else:
print result['text'].encode('utf8')
- pass
- return
+
if __name__ == "__main__":
my_args, mo_args = split_args(sys.argv[1:])
@@ -221,17 +252,17 @@ if __name__ == "__main__":
args = interpret_args(my_args)
if "-show-weights" in mo_args:
- # this is for use during tuning, where moses is called to get a list of
- # feature names
- devnull = open(os.devnull,"w")
- mo = Popen(mserver.cmd + mo_args,stdout=PIPE,stderr=devnull)
+ # This is for use during tuning, where moses is called to get a list
+ # of feature names.
+ devnull = open(os.devnull, "w")
+ mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull)
print mo.communicate()[0].strip()
sys.exit(0)
pass
if args.nbest:
if args.nbestFile:
- NBestFile = open(args.nbestFile,"w")
+ NBestFile = open(args.nbestFile, "w")
else:
NBestFile = sys.stdout
pass
@@ -239,8 +270,10 @@ if __name__ == "__main__":
ref = None
aln = None
- if args.ref: ref = read_data(args.ref)
- if args.aln: aln = read_data(args.aln)
+ if args.ref:
+ ref = read_data(args.ref)
+ if args.aln:
+ aln = read_data(args.aln)
if ref and aln:
try:
@@ -260,25 +293,21 @@ if __name__ == "__main__":
line = sys.stdin.readline()
idx = 0
while line:
- result = translate(mserver.proxy,args,line)
- repack_result(idx,result)
+ result = translate(mserver.proxy, args, line)
+ repack_result(idx, result)
line = sys.stdin.readline()
idx += 1
- pass
- pass
else:
src = read_data(args.input)
for i in xrange(len(src)):
- result = translate(mserver.proxy,args,src[i])
- repack_result(i,result)
+ result = translate(mserver.proxy, args, src[i])
+ repack_result(i, result)
if args.debug:
print >>sys.stderr, result['text'].encode('utf-8')
pass
- if ref and aln:
- result = mserver.proxy.updater({'source' : src[i],
- 'target' : ref[i],
- 'alignment' : aln[i]})
- pass
- pass
- pass
- pass
+ if ref and aln:
+ result = mserver.proxy.updater({
+ 'source': src[i],
+ 'target': ref[i],
+ 'alignment': aln[i],
+ })
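For reference, an illustrative sketch (not from the patch) of the two XML-RPC calls sim-pe.py relies on: translate for decoding and updater for feeding the simulated post-edit back into the dynamic phrase table. The URL, sentences, and alignment string are placeholders:

    import xmlrpclib

    proxy = xmlrpclib.ServerProxy("http://localhost:7447/RPC2")

    result = proxy.translate({'text': 'ein kleiner test .', 'align': True})
    print result['text']

    # Simulated post-editing: push source, reference and word alignment back.
    proxy.updater({
        'source': 'ein kleiner test .',
        'target': 'a small test .',
        'alignment': '0-0 1-1 2-2 3-3',
    })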
diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py
index 76736da5c..096a45dc4 100644
--- a/scripts/tokenizer/pre_tokenize_cleaning.py
+++ b/scripts/tokenizer/pre_tokenize_cleaning.py
@@ -2,12 +2,12 @@
"""
The Gacha filter cleans out sentence pairs that have global character mean
-lower than a certain threshold.
-
-Use this cleaner to produce low quantity of high quality sentence pairs.
+lower than a certain threshold.
-It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
-WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
+Use this cleaner to produce low quantity of high quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
This is inspired by the global character mean that is used in the Gale-Church
@@ -24,17 +24,24 @@ where:
(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
"""
-import io, subprocess
+import io
+import subprocess
+
red = '\033[01;31m'
native = '\033[m'
+
def err_msg(txt):
- return red+txt+native
+ return red + txt + native
+
def num_char(filename):
- return float(subprocess.Popen(["wc", "-m", filename],
- stdout=subprocess.PIPE).stdout.read().split()[0])
+ process = subprocess.Popen(
+ ["wc", "-m", filename], stdout=subprocess.PIPE)
+ # TODO: Was this meant to call communicate()?
+ return float(process.stdout.read().split()[0])
+
def gacha_mean(sourcefile, targetfile):
"""
@@ -43,36 +50,44 @@ def gacha_mean(sourcefile, targetfile):
"""
sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
c = num_char(sourcefile) / num_char(targetfile)
- sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
+ sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
sys.stderr.write(err_msg('Filtering starts ...\n'))
return c
+
+def io_open(path):
+ """Open file `path` for reading, as a UTF-8 text file."""
+ return io.open(path, 'r', encoding='utf8')
+
+
def main(sourcefile, targetfile, threshold=0.2):
# Calculates Gacha mean.
c = gacha_mean(sourcefile, targetfile)
# Calculates lower and upperbound for filtering
threshold = float(threshold)
- lowerbound = (1-threshold) * c
- upperbound = (1+threshold) * c
-
+ lowerbound = (1 - threshold) * c
+ upperbound = (1 + threshold) * c
+
# Start filtering sentences.
- with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
- io.open(targetfile, 'r', encoding='utf8') as trgfin:
+ with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
for s, t in zip(srcfin, trgfin):
if lowerbound < len(s) / float(len(t)) < upperbound:
- print(u"{}\t{}\n".format(s.strip(),t.strip()))
+ print(u"{}\t{}\n".format(s.strip(), t.strip()))
+
if __name__ == '__main__':
import sys
- if len(sys.argv) not in range(3,5):
- usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
- % sys.argv[0])
-
- example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de '
- '~/Europarl.de-en.en 0.4\n'
- % sys.argv[0])
+ if len(sys.argv) not in range(3, 5):
+ usage_msg = err_msg(
+ "Usage: python %s srcfile trgfile (threshold)\n"
+ % sys.argv[0])
+
+ example_msg = err_msg(
+ "Example: "
+ "gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n"
+ % sys.argv[0])
sys.stderr.write(usage_msg)
sys.stderr.write(example_msg)
sys.exit(1)
-
+
main(*sys.argv[1:])
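A worked example of the Gacha length-ratio test above, with made-up numbers for the global character mean c and the default 20% threshold band:

    c = 1.15                           # num_char(source) / num_char(target)
    threshold = 0.2
    lowerbound = (1 - threshold) * c   # 0.92
    upperbound = (1 + threshold) * c   # 1.38

    s = "Das ist ein ziemlich langer Satz ."
    t = "This is a fairly long sentence ."
    ratio = len(s) / float(len(t))     # ~1.06
    print lowerbound < ratio < upperbound   # True: keep this pair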
diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py
index 86c8b300e..14736fe1f 100755
--- a/scripts/training/filter-rule-table.py
+++ b/scripts/training/filter-rule-table.py
@@ -24,9 +24,11 @@
import optparse
import sys
+
class NGram(tuple):
pass
+
class Gap:
def __init__(self, minSpan):
self.minSpan = minSpan
@@ -34,8 +36,12 @@ class Gap:
def getMinSpan(self):
return self.minSpan
+
def printUsage():
- sys.stderr.write("Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
+ sys.stderr.write(
+ "Usage: "
+ "filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
+
def main():
parser = optparse.OptionParser()
@@ -54,14 +60,15 @@ def main():
inputSentences.append(line.split())
filterRuleTable(sys.stdin, inputSentences, N, options)
+
def filterRuleTable(ruleTable, inputSentences, N, options):
# Map each input n-gram (n = 1..N) to a map from sentence indices to
# lists of intra-sentence indices.
occurrences = {}
for i, sentence in enumerate(inputSentences):
- for n in range(1, N+1):
- for j in range(0, len(sentence)-n+1):
- ngram = NGram(sentence[j:j+n])
+ for n in range(1, N + 1):
+ for j in range(0, len(sentence) - n + 1):
+ ngram = NGram(sentence[j:j + n])
innerMap = occurrences.setdefault(ngram, {})
indices = innerMap.setdefault(i, [])
indices.append(j)
@@ -70,15 +77,16 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
prevRuleIncluded = None
for line in ruleTable:
rhs, count = parseRule(line)
+ below_threshold = (count is not None and count < options.minCount)
# Prune non-initial rule if count is below threshold.
- if count != None and count < options.minCount and isNonInitialRule(rhs):
+ if below_threshold and isNonInitialRule(rhs):
if prevRHS != rhs:
prevRuleIncluded = None
prevRHS = rhs
continue
# If source RHS is same as last rule's then we already know whether to
# filter or not (unless it was pruned before checking).
- if rhs == prevRHS and prevRuleIncluded != None:
+ if rhs == prevRHS and prevRuleIncluded is not None:
if prevRuleIncluded:
print line,
continue
@@ -89,7 +97,10 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
prevRuleIncluded = True
continue
segments = segmentRHS(rhs, N)
- ngramMaps = [occurrences.get(s, {}) for s in segments if isinstance(s, NGram)]
+ ngramMaps = [
+ occurrences.get(s, {})
+ for s in segments
+ if isinstance(s, NGram)]
if len(ngramMaps) == 0:
print line,
prevRuleIncluded = True
@@ -111,9 +122,13 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
break
prevRuleIncluded = match
-# Parse a line of the rule table and return a tuple containing two items,
-# the list of RHS source symbols and the rule count (if present).
+
def parseRule(line):
+ """Parse a line of the rule table.
+
+ :return: A tuple containing two items: the list of RHS source symbols,
+ and the rule count (if present).
+ """
cols = line.split("|||")
rhsSourceSymbols = cols[0].split()[:-1]
ruleCount = None
@@ -123,15 +138,18 @@ def parseRule(line):
ruleCount = float(counts[2])
return (rhsSourceSymbols, ruleCount)
+
def isNT(symbol):
return symbol[0] == '[' and symbol[-1] == ']'
+
def isNonInitialRule(rhs):
for symbol in rhs:
if isNT(symbol):
return True
return False
+
def segmentRHS(rhs, N):
segments = []
terminals = []
@@ -159,13 +177,14 @@ def segmentRHS(rhs, N):
segments.append(NGram(terminals))
return segments
+
def matchSegments(segments, indexSeq, sentenceLength):
assert len(segments) > 0
firstSegment = segments[0]
i = 0
if isinstance(firstSegment, Gap):
minPos = firstSegment.getMinSpan()
- maxPos = sentenceLength-1
+ maxPos = sentenceLength - 1
else:
minPos = indexSeq[i] + len(firstSegment)
i += 1
@@ -175,7 +194,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
if minPos + segment.getMinSpan() > sentenceLength:
return False
minPos = minPos + segment.getMinSpan()
- maxPos = sentenceLength-1
+ maxPos = sentenceLength - 1
else:
pos = indexSeq[i]
i += 1
@@ -185,6 +204,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
maxPos = minPos
return True
+
def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
assert len(ngramMaps) > 0
if len(ngramMaps) == 1:
@@ -195,7 +215,7 @@ def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
for index in ngramMaps[0][sentenceIndex]:
if index < minFirstIndex:
continue
- for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index+1):
+ for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index + 1):
assert seq[0] > index
yield [index] + seq
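A tiny self-contained sketch of the symbol tests filter-rule-table.py uses: nonterminals are written in square brackets, and a rule is "non-initial" if its source right-hand side contains at least one nonterminal (the example symbols are made up):

    def isNT(symbol):
        return symbol[0] == '[' and symbol[-1] == ']'


    def isNonInitialRule(rhs):
        for symbol in rhs:
            if isNT(symbol):
                return True
        return False


    print isNonInitialRule("the [X] house".split())   # True
    print isNonInitialRule("the red house".split())   # False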
diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py
index cb67c9d75..28abc9508 100755
--- a/scripts/training/rdlm/average_null_embedding.py
+++ b/scripts/training/rdlm/average_null_embedding.py
@@ -2,18 +2,23 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
-# average embeddings of special null words for RDLM.
-# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
+"""Average embeddings of special null words for RDLM.
+
+Usage:
+ average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
+"""
import sys
import os
import numpy
+
def load_model(model_file):
return nplm.NeuralLM.from_file(model_file)
+
def get_weights(path, vocab, len_context):
- d = [[0]*vocab for i in range(len_context)]
+ d = [[0] * vocab for i in range(len_context)]
for line in open(path):
for i, word in enumerate(line.split()[:-1]):
d[i][int(word)] += 1
@@ -26,20 +31,23 @@ if __name__ == "__main__":
training_instances = sys.argv[3]
model_output = sys.argv[4]
- sys.path.append(os.path.join(nplm_path,'python'))
+ sys.path.append(os.path.join(nplm_path, 'python'))
import nplm
model = load_model(model_input)
- len_context = len(open(training_instances).readline().split())-1
+ len_context = len(open(training_instances).readline().split()) - 1
sys.stderr.write('reading ngrams...')
- weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context))
+ weights = numpy.array(
+ get_weights(
+ training_instances, len(model.input_embeddings), len_context))
sys.stderr.write('done\n')
for i in range(len_context):
index = model.word_to_index_input['<null_{0}>'.format(i)]
- model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0)
+ model.input_embeddings[index] = numpy.average(
+ numpy.array(model.input_embeddings), weights=weights[i], axis=0)
sys.stderr.write('writing model...')
- model.to_file(open(model_output,'w'))
+ model.to_file(open(model_output, 'w'))
sys.stderr.write('done\n')
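An illustrative sketch of what get_weights() computes: for every context position, a histogram over vocabulary indices of the training n-grams, skipping the final token on each line (the predicted word). The training lines and sizes below are made up:

    lines = ["3 7 3 12", "3 5 3 9"]     # fake training instances
    vocab, len_context = 16, 3

    d = [[0] * vocab for i in range(len_context)]
    for line in lines:
        for i, word in enumerate(line.split()[:-1]):
            d[i][int(word)] += 1

    print d[0][3]   # 2: index 3 occurs at context position 0 in both lines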
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index f3ce41080..c6d4b7968 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -2,17 +2,25 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
-# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM
-# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
-# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 (<unk>)
+"""
+Extract syntactic n-grams from dependency treebank in Moses XML format for
+training RDLM.
+
+Expected format can be produced with
+mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
+
+OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped
+to 0 (<unk>)
+"""
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import argparse
-# hack for python2/3 compatibility
+# Hack for python2/3 compatibility
from io import open
+
argparse.open = open
try:
@@ -20,46 +28,84 @@ try:
except ImportError:
from xml.etree import cElementTree as ET
+
def create_parser():
- parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
-
- parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
- help='input file (default: standard input).')
- parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
- help='output file (default: standard output).')
- parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
- choices=['label', 'head'], required=True)
- parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
- help='input layer vocabulary file (one item per line; first line \'<unk>\')')
- parser.add_argument('--output_vocab', metavar='PATH', type=str,
- help='output layer vocabulary file (default: use input layer vocabulary)')
- parser.add_argument('--left_context', metavar='INT', type=int,
- help='size of context vector for left siblings (default: %(default)s)', default=3)
- parser.add_argument('--right_context', metavar='INT', type=int,
- help='size of context vector for right siblings (default: %(default)s)', default=0)
- parser.add_argument('--up_context', metavar='INT', type=int,
- help='size of context vector for ancestors (default: %(default)s)', default=2)
- parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
- help='glue symbol. Will be skipped during extraction (default: %(default)s)')
- parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
- help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
- parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
- help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
- parser.add_argument('--ptkvz', action='store_true',
- help='special rule for German dependency trees: concatenate separable verb prefix and verb')
+ parser = argparse.ArgumentParser(
+ description=(
+ "Extract syntactic n-grams from parsed corpus in "
+ "Moses XML format for training RDLM"))
+
+ parser.add_argument(
+ '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+ metavar='PATH',
+ help='Input file (default: standard input).')
+ parser.add_argument(
+ '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
+ metavar='PATH',
+ help='Output file (default: standard output).')
+ parser.add_argument(
+ '--mode', type=str, choices=['label', 'head'], required=True,
+ help='Predict terminals (head) or dependency labels (label).')
+ parser.add_argument(
+ '--vocab', metavar='PATH', type=str, required=True,
+ help=(
+ "Input layer vocabulary file (one item per line; "
+ "first line '<unk>')"))
+ parser.add_argument(
+ '--output_vocab', metavar='PATH', type=str,
+ help=(
+ "Output layer vocabulary file "
+ "(default: use input layer vocabulary)"))
+ parser.add_argument(
+ '--left_context', metavar='INT', type=int, default=3,
+ help=(
+ "Size of context vector for left siblings "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--right_context', metavar='INT', type=int, default=0,
+ help=(
+ "Size of context vector for right siblings "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--up_context', metavar='INT', type=int, default=2,
+ help=(
+ "Size of context vector for ancestors "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--glue_symbol', metavar='STR', type=str, default='Q',
+ help=(
+ "Glue symbol. Will be skipped during extraction "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--start_symbol', metavar='STR', type=str, default='SSTART',
+ help=(
+ "Sentence start symbol. Will be skipped during extraction "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--end_symbol', metavar='STR', type=str, default='SEND',
+ help=(
+ "Sentence end symbol. Will be skipped during extraction "
+ "(default: %(default)s)"))
+ parser.add_argument(
+ '--ptkvz', action='store_true',
+ help=(
+ "Special rule for German dependency trees: "
+ "concatenate separable verb prefix and verb."))
return parser
+
def escape_text(s):
- s = s.replace('|','&#124;') # factor separator
- s = s.replace('[','&#91;') # syntax non-terminal
- s = s.replace(']','&#93;') # syntax non-terminal
- s = s.replace('\'','&apos;') # xml special character
- s = s.replace('"','&quot;') # xml special character
+ s = s.replace('|', '&#124;') # factor separator
+ s = s.replace('[', '&#91;') # syntax non-terminal
+ s = s.replace(']', '&#93;') # syntax non-terminal
+ s = s.replace('\'', '&apos;') # xml special character
+ s = s.replace('"', '&quot;') # xml special character
return s
-# deterministic heuristic to get head of subtree
+
def get_head(xml, add_ptkvz):
+ """Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
@@ -77,23 +123,38 @@ def get_head(xml, add_ptkvz):
return head, preterminal
-def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
- if len(xml):
+def get_syntactic_ngrams(xml, options, vocab, output_vocab,
+ parent_heads=None, parent_labels=None):
- # skip glue rules
- if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
- for child in xml:
- get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
- return
+ if len(xml):
- # skip virtual nodes
- if xml.get('label') == '<stop_label>' or xml.get('label') == '<start_label>':
- return
+ # Skip glue rules.
+ skip_glue_labels = [
+ options.glue_symbol,
+ options.start_symbol,
+            options.end_symbol,
+ ]
+ if xml.get('label') in skip_glue_labels:
+ for child in xml:
+ get_syntactic_ngrams(
+ child, options, vocab, output_vocab, parent_heads,
+ parent_labels)
+ return
+
+ # Skip virtual nodes.
+ skip_virtual_labels = [
+ '<stop_label>',
+ '<start_label>',
+ ]
+ if xml.get('label') in skip_virtual_labels:
+ return
if not parent_heads:
- parent_heads = [vocab.get('<root_head>', 0)] * options.up_context
- parent_labels = [vocab.get('<root_label>', 0)] * options.up_context
+ parent_heads = (
+ [vocab.get('<root_head>', 0)] * options.up_context)
+ parent_labels = (
+ [vocab.get('<root_label>', 0)] * options.up_context)
head, preterminal = get_head(xml, options.ptkvz)
if not head:
@@ -119,7 +180,8 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
options.output.write(' '.join(map(str, int_list)) + '\n')
elif options.mode == 'head' and not head == '<dummy_head>':
int_list.append(vocab.get(label, 0))
- int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
+ int_list.append(
+ output_vocab.get(head, output_vocab.get(preterminal, 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(head, 0))
@@ -130,28 +192,29 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
if options.right_context:
start = ET.Element('tree')
start2 = ET.Element('tree')
- start.set('label','<start_label>')
- start2.set('label','XY')
+ start.set('label', '<start_label>')
+ start2.set('label', 'XY')
start2.text = '<start_head>'
start.append(start2)
- xml.insert(0,start)
+ xml.insert(0, start)
if options.left_context:
end = ET.Element('tree')
end2 = ET.Element('tree')
- end.set('label','<stop_label>')
- end2.set('label','XY')
+ end.set('label', '<stop_label>')
+ end2.set('label', 'XY')
end2.text = '<stop_head>'
end.append(end2)
xml.append(end)
-
heads = []
preterminals = []
labels = []
for child in xml:
if not len(child):
- # mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent)
+ # Mark that the previous sibling is the head of the
+ # structure (the head/label are not repeated because they're
+ # also head/label of the parent).
head_child = '<head_head>'
preterminal_child = head_child
child_label = '<head_label>'
@@ -166,37 +229,60 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
preterminals.append(preterminal_child)
labels.append(child_label)
- heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))]
- labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))]
+ heads_idx = [
+ vocab.get(heads[i], vocab.get(preterminals[i], 0))
+ for i in range(len(heads))]
+ labels_idx = [
+ vocab.get(labels[i], 0)
+ for i in range(len(labels))]
- #ancestor context is same for all children
+ # Ancestor context is the same for all children.
up_heads = parent_heads[-options.up_context:]
up_labels = parent_labels[-options.up_context:]
- for i,child in enumerate(xml):
-
- # skip some special symbols, but recursively extract n-grams for its children
- if options.mode == 'head' and (heads[i] == '<dummy_head>' or heads[i] == '<head_head>' or heads[i] == '<stop_head>' or heads[i] == '<start_head>'):
+ skip_special_heads = [
+ '<dummy_head>',
+ '<head_head>',
+ '<stop_head>',
+ '<start_head>',
+ ]
+ for i, child in enumerate(xml):
+
+ # Skip some special symbols, but recursively extract n-grams
+ # for its children.
+ if options.mode == 'head' and heads[i] in skip_special_heads:
parent_heads.append(vocab.get(heads[i], 0))
parent_labels.append(vocab.get(labels[i], 0))
- get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
+ get_syntactic_ngrams(
+ child, options, vocab, output_vocab, parent_heads,
+ parent_labels)
parent_heads.pop()
parent_labels.pop()
continue
- previous_heads = heads_idx[max(0,i-options.left_context):i]
- previous_labels = labels_idx[max(0,i-options.left_context):i]
+ previous_heads = heads_idx[max(0, i - options.left_context):i]
+ previous_labels = labels_idx[max(0, i - options.left_context):i]
- subsequent_heads = heads_idx[i+1:i+options.right_context+1]
- subsequent_labels = labels_idx[i+1:i+options.right_context+1]
+ subsequent_heads = heads_idx[i + 1:i + options.right_context + 1]
+ subsequent_labels = labels_idx[i + 1:i + options.right_context + 1]
if len(previous_heads) < options.left_context:
- previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads
- previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels
+ previous_heads = (
+ [start_head_idx] *
+ (options.left_context - len(previous_heads)) +
+ previous_heads)
+ previous_labels = (
+ [start_label_idx] *
+ (options.left_context - len(previous_labels)) +
+ previous_labels)
if len(subsequent_heads) < options.right_context:
- subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads))
- subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels))
+ subsequent_heads += (
+ [stop_head_idx] *
+ (options.right_context - len(subsequent_heads)))
+ subsequent_labels += (
+ [stop_label_idx] *
+ (options.right_context - len(subsequent_labels)))
int_list = []
int_list.extend(previous_heads)
@@ -209,14 +295,19 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.append(output_vocab.get(labels[i], 0))
elif options.mode == 'head':
int_list.append(vocab.get(labels[i], 0))
- int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
+ int_list.append(
+ output_vocab.get(
+ heads[i], output_vocab.get(preterminals[i], 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
- parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
+ parent_heads.append(
+ vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
- get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
+ get_syntactic_ngrams(
+ child, options, vocab, output_vocab, parent_heads,
+ parent_labels)
parent_heads.pop()
parent_labels.pop()
@@ -224,15 +315,17 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
def load_vocab(path):
v = {}
- for i,line in enumerate(open(path, encoding="UTF-8")):
+ for i, line in enumerate(open(path, encoding="UTF-8")):
v[line.strip()] = i
return v
+
def main(options):
vocab = load_vocab(options.vocab)
if options.output_vocab is None:
- sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
+ sys.stderr.write(
+ "No output vocabulary specified; using input vocabulary.\n")
output_vocab = vocab
else:
output_vocab = load_vocab(options.output_vocab)
@@ -275,4 +368,4 @@ if __name__ == '__main__':
parser = create_parser()
options = parser.parse_args()
- main(options) \ No newline at end of file
+ main(options)
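One detail worth spelling out: the vocabulary lookups in this file consistently fall back from the head token to its preterminal and finally to index 0 (<unk>), as the docstring above states. A tiny sketch of that fallback with a toy vocabulary (not from any real corpus):

vocab = {'<unk>': 0, 'NN': 7, 'dog': 42}

def lookup(head, preterminal, vocab):
    # Known head -> its index; OOV head -> preterminal's index; else <unk>.
    return vocab.get(head, vocab.get(preterminal, 0))

print(lookup('dog', 'NN', vocab))       # 42
print(lookup('platypus', 'NN', vocab))  # 7  (OOV head mapped to preterminal)
print(lookup('platypus', 'XY', vocab))  # 0  (<unk>)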
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 6d017602e..ed9266fd9 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -9,6 +9,7 @@ import sys
import codecs
import argparse
from collections import Counter
+from textwrap import dedent
# hack for python2/3 compatibility
from io import open
@@ -19,37 +20,49 @@ try:
except ImportError:
from xml.etree import cElementTree as ET
-def create_parser():
- help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
- help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
- help_text += " [PREFIX].preterminals: preterminal symbols\n";
- help_text += " [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n";
- help_text += " [PREFIX].terminals: terminal symbols\n";
- help_text += " [PREFIX].all: all of the above\n"
+HELP_TEXT = dedent("""\
+ generate 5 vocabulary files from parsed corpus in moses XML format
+ [PREFIX].special: around 40 symbols reserved for RDLM
+ [PREFIX].preterminals: preterminal symbols
+ [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)
+ [PREFIX].terminals: terminal symbols
+ [PREFIX].all: all of the above
+""")
- parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
- parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
- help='input text (default: standard input).')
- parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
- help='output prefix (default: "vocab")')
- parser.add_argument('--ptkvz', action="store_true",
- help='special rule for German dependency trees: attach separable verb prefixes to verb')
+def create_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=HELP_TEXT)
+
+ parser.add_argument(
+ '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+ metavar='PATH',
+ help="Input text (default: standard input).")
+ parser.add_argument(
+ '--output', '-o', type=str, default='vocab', metavar='PREFIX',
+ help="Output prefix (default: 'vocab')")
+ parser.add_argument(
+ '--ptkvz', action="store_true",
+ help=(
+ "Special rule for German dependency trees: attach separable "
+ "verb prefixes to verb."))
return parser
-def escape_text(s):
- s = s.replace('|','&#124;') # factor separator
- s = s.replace('[','&#91;') # syntax non-terminal
- s = s.replace(']','&#93;') # syntax non-terminal
- s = s.replace('\'','&apos;') # xml special character
- s = s.replace('"','&quot;') # xml special character
+def escape_text(s):
+ s = s.replace('|', '&#124;') # factor separator
+ s = s.replace('[', '&#91;') # syntax non-terminal
+ s = s.replace(']', '&#93;') # syntax non-terminal
+ s = s.replace('\'', '&apos;') # xml special character
+ s = s.replace('"', '&quot;') # xml special character
return s
-# deterministic heuristic to get head of subtree
+
def get_head(xml, args):
+ """Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
@@ -67,6 +80,7 @@ def get_head(xml, args):
return head, preterminal
+
def get_vocab(xml, args):
if len(xml):
@@ -88,6 +102,7 @@ def get_vocab(xml, args):
continue
get_vocab(child, args)
+
def main(args):
global heads
@@ -111,10 +126,24 @@ def main(args):
get_vocab(xml, args)
i += 1
- special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
+ special_tokens = [
+ '<unk>',
+ '<null>',
+ '<null_label>',
+ '<null_head>',
+ '<head_label>',
+ '<root_label>',
+ '<start_label>',
+ '<stop_label>',
+ '<head_head>',
+ '<root_head>',
+ '<start_head>',
+ '<dummy_head>',
+ '<stop_head>',
+ ]
for i in range(30):
- special_tokens.append('<null_{0}>'.format(i))
+ special_tokens.append('<null_{0}>'.format(i))
f = open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
@@ -158,7 +187,6 @@ def main(args):
f.close()
-
if __name__ == '__main__':
if sys.version_info < (3, 0):
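For orientation, the [PREFIX].special file described in HELP_TEXT ends up with one symbol per line: the fixed special tokens listed above followed by <null_0> through <null_29>. A minimal sketch of that output step (file name and truncated token list are illustrative):

special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>']
special_tokens += ['<null_{0}>'.format(i) for i in range(30)]

with open('vocab.special', 'w') as f:
    for item in special_tokens:
        f.write(item + '\n')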
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index 15e56c430..ae57e8dfc 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -9,7 +9,6 @@ import subprocess
import sys
import os
import codecs
-import copy
# ../bilingual-lm
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
@@ -17,143 +16,224 @@ import train_nplm
import extract_vocab
import extract_syntactic_ngrams
-logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
-parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
-parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
-parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
-parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
-parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
-parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
-parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
-parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
-parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
-parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
-parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
-parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
-parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
-parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
-parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
-parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
-parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
-parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
-parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
-parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
-parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
-parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
-parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
-parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
-parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
+parser.add_argument(
+ "--working-dir", dest="working_dir", metavar="PATH")
+parser.add_argument(
+ "--corpus", dest="corpus_stem", metavar="PATH", help="Input file.")
+parser.add_argument(
+ "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
+ help="Location of NPLM.")
+parser.add_argument(
+ "--epochs", dest="epochs", type=int, metavar="INT",
+ help="Number of training epochs (default: %(default)s).")
+parser.add_argument(
+ "--up-context-size", dest="up_context_size", type=int, metavar="INT",
+ help="Size of ancestor context (default: %(default)s).")
+parser.add_argument(
+ "--left-context-size", dest="left_context_size", type=int, metavar="INT",
+ help="Size of sibling context (left) (default: %(default)s).")
+parser.add_argument(
+ "--right-context-size", dest="right_context_size", type=int,
+ metavar="INT",
+ help="Size of sibling context (right) (default: %(default)s).")
+parser.add_argument(
+ "--mode", dest="mode", choices=['head', 'label'], required=True,
+ help="Type of RDLM to train (both are required for decoding).")
+parser.add_argument(
+ "--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
+ help="Minibatch size (default: %(default)s).")
+parser.add_argument(
+ "--noise", dest="noise", type=int, metavar="INT",
+ help="Number of noise samples for NCE (default: %(default)s).")
+parser.add_argument(
+ "--hidden", dest="hidden", type=int, metavar="INT",
+ help=(
+ "Size of hidden layer (0 for single hidden layer) "
+ "(default: %(default)s)"))
+parser.add_argument(
+ "--input-embedding", dest="input_embedding", type=int, metavar="INT",
+ help="Size of input embedding layer (default: %(default)s).")
+parser.add_argument(
+ "--output-embedding", dest="output_embedding", type=int, metavar="INT",
+ help="Size of output embedding layer (default: %(default)s).")
+parser.add_argument(
+ "--threads", "-t", dest="threads", type=int, metavar="INT",
+ help="Number of threads (default: %(default)s).")
+parser.add_argument(
+ "--output-model", dest="output_model", metavar="PATH",
+ help="Name of output model (default: %(default)s).")
+parser.add_argument(
+ "--output-dir", dest="output_dir", metavar="PATH",
+ help="Output directory (default: same as working-dir).")
+parser.add_argument(
+ "--config-options-file", dest="config_options_file", metavar="PATH")
+parser.add_argument(
+ "--log-file", dest="log_file", metavar="PATH",
+ help="Log file to write to (default: %(default)s).")
+parser.add_argument(
+ "--validation-corpus", dest="validation_corpus", metavar="PATH",
+ help="Validation file (default: %(default)s).")
+parser.add_argument(
+ "--activation-function", dest="activation_fn",
+ choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
+ help="Activation function (default: %(default)s).")
+parser.add_argument(
+ "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
+ help="Learning rate (default: %(default)s).")
+parser.add_argument(
+ "--input-words-file", dest="input_words_file", metavar="PATH",
+ help="Input vocabulary (default: %(default)s).")
+parser.add_argument(
+ "--output-words-file", dest="output_words_file", metavar="PATH",
+ help="Output vocabulary (default: %(default)s).")
+parser.add_argument(
+ "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT",
+ help="Input vocabulary size (default: %(default)s).")
+parser.add_argument(
+ "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT",
+ help="Output vocabulary size (default: %(default)s).")
parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "train"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epochs = 2
- ,up_context_size = 2
- ,left_context_size = 3
- ,right_context_size = 0
- ,minibatch_size=1000
- ,noise=100
- ,hidden=0
- ,mode='head'
- ,input_embedding=150
- ,output_embedding=750
- ,threads=4
- ,output_model = "train"
- ,output_dir = None
- ,config_options_file = "config"
- ,log_file = "log"
- ,validation_corpus = None
- ,activation_fn = "rectifier"
- ,learning_rate = 1
- ,input_words_file = None
- ,output_words_file = None
- ,input_vocab_size = 500000
- ,output_vocab_size = 500000
- )
+ working_dir="working",
+ corpus_stem="train",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epochs=2,
+ up_context_size=2,
+ left_context_size=3,
+ right_context_size=0,
+ minibatch_size=1000,
+ noise=100,
+ hidden=0,
+ mode='head',
+ input_embedding=150,
+ output_embedding=750,
+ threads=4,
+ output_model="train",
+ output_dir=None,
+ config_options_file="config",
+ log_file="log",
+ validation_corpus=None,
+ activation_fn="rectifier",
+ learning_rate=1,
+ input_words_file=None,
+ output_words_file=None,
+ input_vocab_size=500000,
+ output_vocab_size=500000)
+
def prepare_vocabulary(options):
- vocab_prefix = os.path.join(options.working_dir, 'vocab')
- extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
- extract_vocab.main(extract_vocab_options)
-
- if options.input_words_file is None:
- options.input_words_file = vocab_prefix + '.input'
- orig = vocab_prefix + '.all'
- filtered_vocab = open(orig).readlines()
- if options.input_vocab_size:
- filtered_vocab = filtered_vocab[:options.input_vocab_size]
- open(options.input_words_file,'w').writelines(filtered_vocab)
-
- if options.output_words_file is None:
- options.output_words_file = vocab_prefix + '.output'
- if options.mode == 'label':
- blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
- orig = vocab_prefix + '.special'
- filtered_vocab = open(orig).readlines()
- orig = vocab_prefix + '.nonterminals'
- filtered_vocab += open(orig).readlines()
- filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
- if options.output_vocab_size:
- filtered_vocab = filtered_vocab[:options.output_vocab_size]
- else:
- orig = vocab_prefix + '.all'
- filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
- open(options.output_words_file,'w').writelines(filtered_vocab)
+ vocab_prefix = os.path.join(options.working_dir, 'vocab')
+ extract_vocab_options = extract_vocab.create_parser().parse_args(
+ ['--input', options.corpus_stem, '--output', vocab_prefix])
+ extract_vocab.main(extract_vocab_options)
+
+ if options.input_words_file is None:
+ options.input_words_file = vocab_prefix + '.input'
+ orig = vocab_prefix + '.all'
+ filtered_vocab = open(orig).readlines()
+ if options.input_vocab_size:
+ filtered_vocab = filtered_vocab[:options.input_vocab_size]
+ open(options.input_words_file, 'w').writelines(filtered_vocab)
+
+ if options.output_words_file is None:
+ options.output_words_file = vocab_prefix + '.output'
+ if options.mode == 'label':
+ blacklist = [
+ '<null',
+ '<root',
+ '<start_head',
+ '<dummy',
+ '<head_head',
+ '<stop_head',
+ ]
+ orig = vocab_prefix + '.special'
+ filtered_vocab = open(orig).readlines()
+ orig = vocab_prefix + '.nonterminals'
+ filtered_vocab += open(orig).readlines()
+ filtered_vocab = [
+ word
+ for word in filtered_vocab
+ if not any(word.startswith(prefix) for prefix in blacklist)]
+ if options.output_vocab_size:
+ filtered_vocab = filtered_vocab[:options.output_vocab_size]
+ else:
+ orig = vocab_prefix + '.all'
+ filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
+ open(options.output_words_file, 'w').writelines(filtered_vocab)
+
def main(options):
- options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
- if options.mode == 'head':
- options.ngram_size += 2
- elif options.mode == 'label':
- options.ngram_size += 1
-
- if options.input_words_file is None or options.output_words_file is None:
- sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
- prepare_vocabulary(options)
-
- extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
- '--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
- '--vocab', options.input_words_file,
- '--output_vocab', options.output_words_file,
- '--right_context', str(options.right_context_size),
- '--left_context', str(options.left_context_size),
- '--up_context', str(options.up_context_size),
- '--mode', options.mode
- ])
- sys.stderr.write('extracting syntactic n-grams\n')
- extract_syntactic_ngrams.main(extract_options)
-
- if options.validation_corpus:
- extract_options.input = open(options.validation_corpus)
- options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus))
- extract_options.output = open(options.validation_file + '.numberized', 'w')
- sys.stderr.write('extracting syntactic n-grams (validation file)\n')
+ options.ngram_size = (
+ 2 * options.up_context_size +
+ 2 * options.left_context_size +
+ 2 * options.right_context_size
+ )
+ if options.mode == 'head':
+ options.ngram_size += 2
+ elif options.mode == 'label':
+ options.ngram_size += 1
+
+ if options.input_words_file is None or options.output_words_file is None:
+ sys.stderr.write(
+ "Either input vocabulary or output vocabulary not specified: "
+ "extracting vocabulary from training text.\n")
+ prepare_vocabulary(options)
+
+ extract_options = extract_syntactic_ngrams.create_parser().parse_args([
+ '--input', options.corpus_stem,
+ '--output', os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + '.numberized'),
+ '--vocab', options.input_words_file,
+ '--output_vocab', options.output_words_file,
+ '--right_context', str(options.right_context_size),
+ '--left_context', str(options.left_context_size),
+ '--up_context', str(options.up_context_size),
+ '--mode', options.mode
+ ])
+ sys.stderr.write('extracting syntactic n-grams\n')
extract_syntactic_ngrams.main(extract_options)
- extract_options.output.close()
- sys.stderr.write('training neural network\n')
- train_nplm.main(options)
+ if options.validation_corpus:
+ extract_options.input = open(options.validation_corpus)
+ options.validation_file = os.path.join(
+ options.working_dir, os.path.basename(options.validation_corpus))
+ extract_options.output = open(
+ options.validation_file + '.numberized', 'w')
+ sys.stderr.write('extracting syntactic n-grams (validation file)\n')
+ extract_syntactic_ngrams.main(extract_options)
+ extract_options.output.close()
+
+ sys.stderr.write('training neural network\n')
+ train_nplm.main(options)
+
+ sys.stderr.write('averaging null words\n')
+ ret = subprocess.call([
+ os.path.join(sys.path[0], 'average_null_embedding.py'),
+ options.nplm_home,
+ os.path.join(
+ options.output_dir,
+ options.output_model + '.model.nplm.' + str(options.epochs)),
+ os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + '.numberized'),
+ os.path.join(options.output_dir, options.output_model + '.model.nplm')
+ ])
+ if ret:
+ raise Exception("averaging null words failed")
- sys.stderr.write('averaging null words\n')
- ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
- options.nplm_home,
- os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
- os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
- os.path.join(options.output_dir, options.output_model + '.model.nplm')
- ])
- if ret:
- raise Exception("averaging null words failed")
if __name__ == "__main__":
- if sys.version_info < (3, 0):
- sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
- sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
- sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
-
- options = parser.parse_args()
- main(options)
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ options = parser.parse_args()
+ main(options)
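With the defaults set above (up_context_size=2, left_context_size=3, right_context_size=0), the ngram_size arithmetic in main() works out as below; a quick check of the numbers:

up, left, right = 2, 3, 0
ngram_size = 2 * up + 2 * left + 2 * right  # head + label for every context slot
print(ngram_size)      # 10
print(ngram_size + 2)  # 12 for --mode head (adds the node's label and the head to predict)
print(ngram_size + 1)  # 11 for --mode label (adds only the label to predict)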
diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py
index 0e361df0b..761037488 100755
--- a/scripts/training/wrappers/conll2mosesxml.py
+++ b/scripts/training/wrappers/conll2mosesxml.py
@@ -2,42 +2,76 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
-# takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat )
-# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION),
-# which not all parsers produce.
+"""
+Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on
+dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces
+Moses XML format.
-# usage: conll2mosesxml.py [--brackets] < input_file > output_file
+Note that the structure is built based on fields 9 and 10 (projective HEAD
+and RELATION), which not all parsers produce.
+
+Usage: conll2mosesxml.py [--brackets] < input_file > output_file
+"""
from __future__ import print_function, unicode_literals
import sys
import re
import codecs
-from collections import namedtuple,defaultdict
+from collections import (
+ namedtuple,
+ defaultdict,
+ )
from lxml import etree as ET
-Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func'])
+Word = namedtuple(
+ 'Word',
+ ['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])
+
def main(output_format='xml'):
sentence = []
for line in sys.stdin:
- # process sentence
+ # Process sentence.
if line == "\n":
- sentence.insert(0,[])
+ sentence.insert(0, [])
if is_projective(sentence):
- write(sentence,output_format)
+ write(sentence, output_format)
else:
- sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
+ sys.stderr.write(
+ ' '.join(w.word for w in sentence[1:]) + '\n')
sys.stdout.write('\n')
sentence = []
continue
try:
- pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split()
- except ValueError: # word may be unicode whitespace
- pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip())
+ (
+ pos,
+ word,
+ lemma,
+ tag,
+ tag2,
+ morph,
+ head,
+ func,
+ proj_head,
+ proj_func,
+ ) = line.split()
+ except ValueError: # Word may be unicode whitespace.
+ (
+ pos,
+ word,
+ lemma,
+ tag,
+ tag2,
+ morph,
+ head,
+ func,
+ proj_head,
+ proj_func,
+ ) = re.split(' *\t*', line.strip())
word = escape_special_chars(word)
lemma = escape_special_chars(lemma)
@@ -46,17 +80,20 @@ def main(output_format='xml'):
proj_head = head
proj_func = func
- sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func))
+ sentence.append(
+ Word(
+ int(pos), word, lemma, tag2, int(head), func, int(proj_head),
+ proj_func))
-# this script performs the same escaping as escape-special-chars.perl in Moses.
-# most of it is done in function write(), but quotation marks need to be processed first
+# This script performs the same escaping as escape-special-chars.perl in
+# Moses. Most of it is done in function write(), but quotation marks need
+# to be processed first.
def escape_special_chars(line):
-
- line = line.replace('\'','&apos;') # xml
- line = line.replace('"','&quot;') # xml
- line = line.replace('[','&#91;') # syntax non-terminal
- line = line.replace(']','&#93;') # syntax non-terminal
+ line = line.replace('\'', '&apos;') # xml
+ line = line.replace('"', '&quot;') # xml
+ line = line.replace('[', '&#91;') # syntax non-terminal
+ line = line.replace(']', '&#93;') # syntax non-terminal
return line
@@ -64,7 +101,7 @@ def escape_special_chars(line):
# make a check if structure is projective
def is_projective(sentence):
dominates = defaultdict(set)
- for i,w in enumerate(sentence):
+ for i, w in enumerate(sentence):
dominates[i].add(i)
if not i:
continue
@@ -77,7 +114,7 @@ def is_projective(sentence):
for i in dominates:
dependents = dominates[i]
- if max(dependents) - min(dependents) != len(dependents)-1:
+ if max(dependents) - min(dependents) != len(dependents) - 1:
sys.stderr.write("error: non-projective structure.\n")
return False
return True
@@ -86,24 +123,28 @@ def is_projective(sentence):
def write(sentence, output_format='xml'):
if output_format == 'xml':
- tree = create_subtree(0,sentence)
- out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
+ tree = create_subtree(0, sentence)
+ out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
if output_format == 'brackets':
- out = create_brackets(0,sentence)
+ out = create_brackets(0, sentence)
- out = out.replace('|','&#124;') # factor separator
+ out = out.replace('|', '&#124;') # factor separator
- out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
- out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
- out = out.replace('&amp;#91;','&#91;') # lxml is buggy if input is escaped
- out = out.replace('&amp;#93;','&#93;') # lxml is buggy if input is escaped
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;apos;', '&apos;')
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;quot;', '&quot;')
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;#91;', '&#91;')
+ # lxml is buggy if input is escaped:
+ out = out.replace('&amp;#93;', '&#93;')
print(out)
-# write node in Moses XML format
-def create_subtree(position, sentence):
+def create_subtree(position, sentence):
+ """"Write node in Moses XML format."""
element = ET.Element('tree')
if position:
@@ -111,7 +152,7 @@ def create_subtree(position, sentence):
else:
element.set('label', 'sent')
- for i in range(1,position):
+ for i in range(1, position):
if sentence[i].proj_head == position:
element.append(create_subtree(i, sentence))
@@ -144,7 +185,7 @@ def create_brackets(position, sentence):
else:
element = "[ sent "
- for i in range(1,position):
+ for i in range(1, position):
if sentence[i].proj_head == position:
element += create_brackets(i, sentence)
@@ -167,7 +208,7 @@ def create_brackets(position, sentence):
return element
if __name__ == '__main__':
- if sys.version_info < (3,0,0):
+ if sys.version_info < (3, 0, 0):
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
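The ten-column split refactored above maps directly onto the Word namedtuple; a tiny illustration with a made-up CoNLL line (field values are invented):

from collections import namedtuple

Word = namedtuple(
    'Word',
    ['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])

line = "1\tdogs\tdog\tNNS\tNNS\t_\t2\tSBJ\t2\tSBJ\n"
(pos, word, lemma, tag, tag2, morph,
 head, func, proj_head, proj_func) = line.split()

w = Word(int(pos), word, lemma, tag2, int(head), func, int(proj_head), proj_func)
print(w.word, w.head, w.proj_func)  # dogs 2 SBJ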
diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py
index bd876f087..6ff1d20c9 100755
--- a/scripts/training/wrappers/mosesxml2brackets.py
+++ b/scripts/training/wrappers/mosesxml2brackets.py
@@ -10,17 +10,21 @@ import codecs
from lxml import etree as ET
+
def escape(word):
- word = word.replace('|','&#124;') # factor separator
- word = word.replace('[','&#91;') # syntax non-terminal
- word = word.replace(']','&#93;') # syntax non-terminal
- word = word.replace('\'','&apos;')
- word = word.replace('\"','&quot;')
+ # Factor separator:
+ word = word.replace('|', '&#124;')
+ # Syntax non-terminal:
+ word = word.replace('[', '&#91;')
+ # Syntax non-terminal:
+ word = word.replace(']', '&#93;')
+ word = word.replace('\'', '&apos;')
+ word = word.replace('\"', '&quot;')
return word
-def make_brackets(xml):
+def make_brackets(xml):
out = ' [' + xml.get('label')
if xml.text and xml.text.strip():