
github.com/moses-smt/mosesdecoder.git
author    Hieu Hoang <hieuhoang@gmail.com>  2018-11-07 20:12:05 +0300
committer Hieu Hoang <hieuhoang@gmail.com>  2018-11-07 20:12:05 +0300
commit    2451c469603bd297a2f52369c2d57b2fab835ef4 (patch)
tree      a7a86612cf6c0c8b73c6b84fa437da32437f6844
parent    2217bc136e97b6a0f0ec574bf914235649860539 (diff)
start borging Luis Gomes code
 scripts/tokenizer/python-wrapper/__init__.py        |  33 +
 scripts/tokenizer/python-wrapper/detokenizer.py     |  82 +
 scripts/tokenizer/python-wrapper/punctnormalizer.py |  84 +
 scripts/tokenizer/python-wrapper/sentsplitter.py    | 134 +
 scripts/tokenizer/python-wrapper/tokenizer.py       |  93 +
 5 files changed, 426 insertions(+), 0 deletions(-)
diff --git a/scripts/tokenizer/python-wrapper/__init__.py b/scripts/tokenizer/python-wrapper/__init__.py
new file mode 100644
index 000000000..8ff517176
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/__init__.py
@@ -0,0 +1,33 @@
+"""
+Wrappers for several pre-processing scripts from the Moses toolkit.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+
+This package provides wrappers for the following Perl scripts:
+
+``tokenizer.perl``
+ class `mosestokenizer.tokenizer.MosesTokenizer`
+
+``detokenizer.perl``
+    class `mosestokenizer.detokenizer.MosesDetokenizer`
+
+``split-sentences.perl``
+ class `mosestokenizer.sentsplitter.MosesSentenceSplitter`
+
+``normalize-punctuation.perl``
+ class `mosestokenizer.punctnormalizer.MosesPunctuationNormalizer`
+
+"""
+
+from mosestokenizer.tokenizer import MosesTokenizer
+from mosestokenizer.detokenizer import MosesDetokenizer
+from mosestokenizer.sentsplitter import MosesSentenceSplitter
+from mosestokenizer.punctnormalizer import MosesPunctuationNormalizer
+
+__version__ = "1.0.0"
+
+__all__ = [
+ "MosesTokenizer",
+ "MosesDetokenizer",
+ "MosesSentenceSplitter",
+ "MosesPunctuationNormalizer",
+]
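
For reference, a minimal usage sketch of the package above, assuming it is
importable as ``mosestokenizer`` and that each wrapped Perl script sits where
the constructors expect:

    from mosestokenizer import MosesSentenceSplitter, MosesTokenizer

    # Each wrapper is a context manager: leaving the "with" block closes
    # the underlying Perl process.
    with MosesSentenceSplitter('en') as split_sents, \
            MosesTokenizer('en') as tokenize:
        sentences = split_sents(['Hello World! Hello', 'again.'])
        tokens = [tokenize(s) for s in sentences]
        # tokens == [['Hello', 'World', '!'], ['Hello', 'again', '.']]
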
diff --git a/scripts/tokenizer/python-wrapper/detokenizer.py b/scripts/tokenizer/python-wrapper/detokenizer.py
new file mode 100644
index 000000000..95333414c
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/detokenizer.py
@@ -0,0 +1,82 @@
+"""
+A module for interfacing with ``detokenizer.perl`` from Moses.
+
+Copyright © 2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+ moses-detokenizer [options] <lang> [<inputfile> [<outputfile>]]
+ moses-detokenizer --selftest [--verbose]
+
+Options:
+ --selftest, -t Run selftests.
+ --verbose, -v Be more verbose.
+
+2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesDetokenizer(ToolWrapper):
+    """A class for interfacing with ``detokenizer.perl`` from Moses.
+
+    This class communicates with the detokenizer.perl process via pipes.
+    When the MosesDetokenizer object is no longer needed, the close()
+    method should be called to free system resources. The class supports
+    the context manager interface; if used in a with statement, the
+    close() method is invoked automatically.
+
+ >>> detokenize = MosesDetokenizer('en')
+    >>> detokenize(['Hello', 'World', '!'])
+ 'Hello World!'
+ """
+
+ def __init__(self, lang="en"):
+ self.lang = lang
+ program = path.join(path.dirname(__file__), "detokenizer.perl")
+ # -q = quiet
+ # -b = disable output buffering
+ argv = ["perl", program, "-q", "-b", "-l", self.lang]
+ super().__init__(argv)
+
+ def __str__(self):
+ return "MosesDetokenizer(lang=\"{lang}\")".format(lang=self.lang)
+
+ def __call__(self, sentence):
+ """Detokenizes a single sentence.
+
+ Newline characters are not allowed in tokens.
+ """
+ assert isinstance(sentence, (list, tuple))
+ assert all(isinstance(token, str) for token in sentence)
+ assert all("\n" not in token for token in sentence)
+ if not sentence:
+ return ""
+ self.writeline(" ".join(sentence))
+ return self.readline()
+
+
+def main():
+ args = docopt(usage)
+ if args["--selftest"]:
+ import doctest
+ import mosestokenizer.detokenizer
+ doctest.testmod(mosestokenizer.detokenizer)
+ if not args["<lang>"]:
+ sys.exit(0)
+ detokenize = MosesDetokenizer(args["<lang>"])
+ inputfile = openfile(args["<inputfile>"])
+ outputfile = openfile(args["<outputfile>"], "wt")
+ with inputfile, outputfile:
+ for line in inputfile:
+ print(detokenize(line.split()), file=outputfile)
+
+if __name__ == "__main__":
+ main()
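
A minimal round-trip sketch for the class above, pairing it with
MosesTokenizer (both mirror the doctests; assumes the package is importable
as ``mosestokenizer``):

    from mosestokenizer import MosesTokenizer, MosesDetokenizer

    with MosesTokenizer('en') as tokenize, MosesDetokenizer('en') as detok:
        tokens = tokenize('Hello World!')  # ['Hello', 'World', '!']
        print(detok(tokens))               # prints: Hello World!
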
diff --git a/scripts/tokenizer/python-wrapper/punctnormalizer.py b/scripts/tokenizer/python-wrapper/punctnormalizer.py
new file mode 100644
index 000000000..73db1ace1
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/punctnormalizer.py
@@ -0,0 +1,84 @@
+"""
+A module for interfacing with ``normalize-punctuation.perl`` from Moses.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+ moses-punct-normalizer [options] <lang> [<inputfile> [<outputfile>]]
+ moses-punct-normalizer --selftest [--verbose]
+
+Options:
+ --selftest, -t Run selftests.
+ --verbose, -v Be more verbose.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesPunctuationNormalizer(ToolWrapper):
+    """A class for interfacing with ``normalize-punctuation.perl`` from Moses.
+
+    This class communicates with the normalize-punctuation.perl process via
+    pipes. When the MosesPunctuationNormalizer object is no longer needed,
+    the close() method should be called to free system resources. The class
+    supports the context manager interface; if used in a with statement,
+    the close() method is invoked automatically.
+
+ >>> normalize = MosesPunctuationNormalizer("en")
+ >>> normalize("«Hello World» — she said…")
+ '"Hello World" - she said...'
+ """
+
+ def __init__(self, lang="en"):
+ self.lang = lang
+ program = path.join(
+ path.dirname(__file__),
+ "normalize-punctuation.perl"
+ )
+ argv = ["perl", program, "-b", "-l", self.lang]
+ super().__init__(argv)
+
+ def __str__(self):
+ return "MosesPunctuationNormalizer(lang=\"{lang}\")".format(
+ lang=self.lang
+ )
+
+ def __call__(self, line):
+ """Normalizes punctuation of a single line of text.
+
+ Newline characters are not allowed in the text to be normalized.
+ """
+ assert isinstance(line, str)
+ line = line.strip()
+ assert "\n" not in line
+        if not line:
+            return ""
+ self.writeline(line)
+ return self.readline()
+
+
+def main():
+ args = docopt(usage)
+ if args["--selftest"]:
+ import doctest
+ import mosestokenizer.punctnormalizer
+ doctest.testmod(mosestokenizer.punctnormalizer)
+ if not args["<lang>"]:
+ sys.exit(0)
+ normalize = MosesPunctuationNormalizer(args["<lang>"])
+ inputfile = open(args["<inputfile>"]) if args["<inputfile>"] else sys.stdin
+ outputfile = open(args["<outputfile>"], "wt") if args["<outputfile>"] else sys.stdout
+ with inputfile, outputfile:
+ for line in inputfile:
+ print(normalize(line), file=outputfile)
+
+if __name__ == "__main__":
+ main()
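
A minimal usage sketch for the normalizer above, mirroring its doctest
(assumes the package is importable as ``mosestokenizer``):

    from mosestokenizer import MosesPunctuationNormalizer

    with MosesPunctuationNormalizer('en') as normalize:
        print(normalize('«Hello World» — she said…'))
        # prints: "Hello World" - she said...
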
diff --git a/scripts/tokenizer/python-wrapper/sentsplitter.py b/scripts/tokenizer/python-wrapper/sentsplitter.py
new file mode 100644
index 000000000..4fd58c7d7
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/sentsplitter.py
@@ -0,0 +1,134 @@
+"""
+A module for interfacing with ``split-sentences.perl`` from the Moses toolkit.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+ moses-sentence-splitter [options] <lang> [<inputfile> [<outputfile>]]
+ moses-sentence-splitter --selftest [--verbose]
+
+Options:
+ --selftest, -t Run selftests.
+ --verbose, -v Be more verbose.
+ --unwrap, -u Assume that the text is wrapped and try to unwrap it.
+ Note that this option will cause all consecutive non-empty
+ lines to be buffered in memory. If you give this option
+ make sure that you have empty lines separating paragraphs.
+ When this option is not given, each line is assumed to be
+ an independent paragraph or sentence and thus will not be
+ joined with other lines.
+ --more Also split on colons and semi-colons.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesSentenceSplitter(ToolWrapper):
+ """
+    A class for interfacing with ``split-sentences.perl`` from the Moses toolkit.
+
+    This class communicates with the split-sentences.perl process via
+    pipes. When the MosesSentenceSplitter object is no longer needed, the
+    close() method should be called to free system resources. The class
+    supports the context manager interface; if used in a with statement,
+    the close() method is invoked automatically.
+
+ When attribute ``more`` is True, colons and semi-colons are considered
+ sentence separators.
+
+ >>> split_sents = MosesSentenceSplitter('en')
+ >>> split_sents(['Hello World! Hello', 'again.'])
+ ['Hello World!', 'Hello again.']
+
+ """
+
+ def __init__(self, lang="en", more=True):
+ self.lang = lang
+ program = path.join(
+ path.dirname(__file__),
+ "split-sentences.perl"
+ )
+ argv = ["perl", program, "-q", "-b", "-l", self.lang]
+ if more:
+ argv.append("-m")
+ super().__init__(argv)
+
+ def __str__(self):
+ return "MosesSentenceSplitter(lang=\"{lang}\")".format(lang=self.lang)
+
+ def __call__(self, paragraph):
+ """Splits sentences within a paragraph.
+ The paragraph is a list of non-empty lines. XML-like tags are not
+ allowed.
+ """
+ assert isinstance(paragraph, (list, tuple))
+ if not paragraph: # empty paragraph is OK
+ return []
+ assert all(isinstance(line, str) for line in paragraph)
+ paragraph = [line.strip() for line in paragraph]
+ assert all(paragraph), "blank lines are not allowed"
+ for line in paragraph:
+ self.writeline(line)
+ self.writeline("<P>")
+ sentences = []
+ while True:
+ sentence = self.readline().strip()
+ if sentence == "<P>":
+ break
+ sentences.append(sentence)
+ return sentences
+
+
+def read_paragraphs(inputfile, wrapped=True):
+ lines = map(str.strip, inputfile)
+ if wrapped:
+ paragraph = []
+ for line in lines:
+ if line:
+ paragraph.append(line)
+ elif paragraph:
+ yield paragraph
+ paragraph = []
+ if paragraph:
+ yield paragraph
+ else:
+ for line in lines:
+ yield [line] if line else []
+
+
+def write_paragraphs(paragraphs, outputfile, blank_sep=True):
+ for paragraph in paragraphs:
+ for sentence in paragraph:
+ print(sentence, file=outputfile)
+ if blank_sep or not paragraph:
+ print(file=outputfile) # paragraph separator
+
+
+def main():
+ args = docopt(usage)
+ if args["--selftest"]:
+ import doctest
+ import mosestokenizer.sentsplitter
+ doctest.testmod(mosestokenizer.sentsplitter)
+ if not args["<lang>"]:
+ sys.exit(0)
+ split_sents = MosesSentenceSplitter(args["<lang>"], more=args["--more"])
+ inputfile = openfile(args["<inputfile>"])
+ outputfile = openfile(args["<outputfile>"], "wt")
+ with inputfile, outputfile:
+ paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"])
+ paragraphs = map(split_sents, paragraphs)
+ write_paragraphs(paragraphs, outputfile, blank_sep=args["--unwrap"])
+
+
+if __name__ == "__main__":
+ main()
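
A minimal usage sketch for the splitter above, mirroring its doctest; note
that the input is a paragraph, i.e. a list of non-empty lines:

    from mosestokenizer import MosesSentenceSplitter

    with MosesSentenceSplitter('en') as split_sents:
        print(split_sents(['Hello World! Hello', 'again.']))
        # prints: ['Hello World!', 'Hello again.']
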
diff --git a/scripts/tokenizer/python-wrapper/tokenizer.py b/scripts/tokenizer/python-wrapper/tokenizer.py
new file mode 100644
index 000000000..eb5aec3dc
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/tokenizer.py
@@ -0,0 +1,93 @@
+"""
+A module for interfacing with ``tokenizer.perl`` from Moses.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+ moses-tokenizer [options] <lang> [<inputfile> [<outputfile>]]
+ moses-tokenizer --selftest [--verbose]
+
+Options:
+ --selftest, -t Run selftests.
+ --verbose, -v Be more verbose.
+ --old Use older version (1.0) of the tokenizer.
+ If this option is not given, then version 1.1
+ will be used.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesTokenizer(ToolWrapper):
+    """A class for interfacing with ``tokenizer.perl`` from Moses.
+
+    This class communicates with the tokenizer.perl process via pipes.
+    When the MosesTokenizer object is no longer needed, the close() method
+    should be called to free system resources. The class supports the
+    context manager interface; if used in a with statement, the close()
+    method is invoked automatically.
+
+ >>> tokenize = MosesTokenizer('en')
+ >>> tokenize('Hello World!')
+ ['Hello', 'World', '!']
+ """
+
+ def __init__(self, lang="en", old_version=False):
+ self.lang = lang
+ program = path.join(
+ path.dirname(__file__),
+ "tokenizer-" + ("v1.0" if old_version else "v1.1") + ".perl"
+ )
+ argv = ["perl", program, "-q", "-l", self.lang]
+ if not old_version:
+ # -b = disable output buffering
+ # -a = aggressive hyphen splitting
+ argv.extend(["-b", "-a"])
+ super().__init__(argv)
+
+ def __str__(self):
+ return "MosesTokenizer(lang=\"{lang}\")".format(lang=self.lang)
+
+ def __call__(self, sentence):
+ """Tokenizes a single sentence.
+
+ Newline characters are not allowed in the sentence to be tokenized.
+ """
+ assert isinstance(sentence, str)
+ sentence = sentence.rstrip("\n")
+ assert "\n" not in sentence
+ if not sentence:
+ return []
+ self.writeline(sentence)
+ return self.readline().split()
+
+
+def main():
+ args = docopt(usage)
+ if args["--selftest"]:
+ import doctest
+ import mosestokenizer.tokenizer
+ doctest.testmod(mosestokenizer.tokenizer)
+ if not args["<lang>"]:
+ sys.exit(0)
+ tokenize = MosesTokenizer(
+ args["<lang>"],
+ old_version=args["--old"],
+ )
+ inputfile = openfile(args["<inputfile>"])
+ outputfile = openfile(args["<outputfile>"], "wt")
+ with inputfile, outputfile:
+ for line in inputfile:
+ print(*tokenize(line), file=outputfile)
+
+if __name__ == "__main__":
+ main()
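
A minimal usage sketch for the tokenizer above, mirroring its doctest; pass
``old_version=True`` to select the bundled v1.0 script instead of v1.1:

    from mosestokenizer import MosesTokenizer

    with MosesTokenizer('en') as tokenize:
        print(tokenize('Hello World!'))
        # prints: ['Hello', 'World', '!']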