Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/tokenizer/mosestokenizer/tokenizer.py')
-rw-r--r--scripts/tokenizer/mosestokenizer/tokenizer.py92
1 files changed, 92 insertions, 0 deletions
diff --git a/scripts/tokenizer/mosestokenizer/tokenizer.py b/scripts/tokenizer/mosestokenizer/tokenizer.py
new file mode 100644
index 000000000..b3af06647
--- /dev/null
+++ b/scripts/tokenizer/mosestokenizer/tokenizer.py
@@ -0,0 +1,92 @@
+"""
+A module for interfacing with ``tokenizer.perl`` from Moses.
+
+Copyright ® 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+# Command-line help text. main() hands this string to docopt(), which uses
+# it to parse the command-line arguments.
+# NOTE(review): --old is described below, but main() never reads
+# args["--old"], so the option currently appears to have no effect -- confirm.
+usage = """
+Usage:
+ moses-tokenizer [options] <lang> [<inputfile> [<outputfile>]]
+ moses-tokenizer --selftest [--verbose]
+
+Options:
+ --selftest, -t Run selftests.
+ --verbose, -v Be more verbose.
+ --old Use older version (1.0) of the tokenizer.
+ If this option is not given, then version 1.1
+ will be used.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesTokenizer(ToolWrapper):
+    """A module for interfacing with ``tokenizer.perl`` from Moses.
+
+    This class communicates with tokenizer.perl process via pipes. When the
+    MosesTokenizer object is no longer needed, the close() method should be
+    called to free system resources. The class supports the context manager
+    interface. If used in a with statement, the close() method is invoked
+    automatically.
+
+    >>> tokenize = MosesTokenizer('en')
+    >>> tokenize('Hello World!')
+    ['Hello', 'World', '!']
+    """
+
+    def __init__(self, lang="en"):
+        """Build the ``perl tokenizer.perl`` command line and start the tool.
+
+        Args:
+            lang: Language code passed to the script through its ``-l``
+                option; also kept on the instance as ``self.lang``.
+        """
+        self.lang = lang
+        # The Perl script is expected one directory above this module.
+        program = path.join(
+            path.dirname(__file__),
+            "../tokenizer.perl"
+        )
+        argv = ["perl", program, "-q", "-l", self.lang]
+
+        # -b = disable output buffering
+        # -a = aggressive hyphen splitting
+        argv.extend(["-b", "-a"])
+        super().__init__(argv)
+
+    def __str__(self):
+        """Return a short description that includes the language code."""
+        return "MosesTokenizer(lang=\"{lang}\")".format(lang=self.lang)
+
+    def __call__(self, sentence):
+        """Tokenizes a single sentence.
+
+        Newline characters are not allowed in the sentence to be tokenized
+        (trailing newlines are stripped before the check).
+
+        Args:
+            sentence: The text to tokenize, as a ``str``.
+
+        Returns:
+            The list of tokens; an empty list when the sentence is empty
+            after stripping trailing newlines.
+
+        Raises:
+            TypeError: If ``sentence`` is not a ``str``.
+            ValueError: If ``sentence`` contains an embedded newline.
+        """
+        # Explicit checks instead of ``assert``: assertions are removed when
+        # Python runs with -O, and the checks below must always be enforced.
+        if not isinstance(sentence, str):
+            raise TypeError(
+                "sentence must be str, not {}".format(type(sentence).__name__)
+            )
+        sentence = sentence.rstrip("\n")
+        # One line is written and one line is read back per call, so an
+        # embedded newline would send more lines than are read here.
+        if "\n" in sentence:
+            raise ValueError("sentence must not contain newline characters")
+        if not sentence:
+            return []
+        self.writeline(sentence)
+        return self.readline().split()
+
+
+def main():
+    """Command-line entry point: run the self-tests and/or tokenize a file.
+
+    Arguments are parsed by docopt from the module-level ``usage`` text.
+    With ``--selftest`` the doctests of ``mosestokenizer.tokenizer`` are run;
+    if no ``<lang>`` was given the program then exits with status 0.
+    Otherwise every line of ``<inputfile>`` is tokenized and written to
+    ``<outputfile>`` as space-separated tokens, one input line per output
+    line. Both files are opened with ``openfile()`` (how it treats an omitted
+    file name is defined by that helper, not here).
+    """
+    args = docopt(usage)
+    if args["--selftest"]:
+        import doctest
+        import mosestokenizer.tokenizer
+        # Honour --verbose explicitly; passing None otherwise keeps doctest's
+        # own default of looking for "-v" in sys.argv.
+        doctest.testmod(
+            mosestokenizer.tokenizer,
+            verbose=True if args["--verbose"] else None,
+        )
+        if not args["<lang>"]:
+            sys.exit(0)
+    tokenize = MosesTokenizer(
+        args["<lang>"]
+    )
+    try:
+        inputfile = openfile(args["<inputfile>"])
+        outputfile = openfile(args["<outputfile>"], "wt")
+        with inputfile, outputfile:
+            for line in inputfile:
+                print(*tokenize(line), file=outputfile)
+    finally:
+        # Release the tokenizer.perl subprocess even when opening, reading
+        # or writing the files fails.
+        tokenize.close()
+
+# Run the command-line interface only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == "__main__":
+ main()