author | Hieu Hoang <hieuhoang@gmail.com> | 2018-11-07 20:12:05 +0300
---|---|---
committer | Hieu Hoang <hieuhoang@gmail.com> | 2018-11-07 20:12:05 +0300
commit | 2451c469603bd297a2f52369c2d57b2fab835ef4 (patch) |
tree | a7a86612cf6c0c8b73c6b84fa437da32437f6844 |
parent | 2217bc136e97b6a0f0ec574bf914235649860539 (diff) |
start borging Luis Gomes code
-rw-r--r-- | scripts/tokenizer/python-wrapper/__init__.py | 31
-rw-r--r-- | scripts/tokenizer/python-wrapper/detokenizer.py | 82
-rw-r--r-- | scripts/tokenizer/python-wrapper/punctnormalizer.py | 84
-rw-r--r-- | scripts/tokenizer/python-wrapper/sentsplitter.py | 134
-rw-r--r-- | scripts/tokenizer/python-wrapper/tokenizer.py | 93
5 files changed, 424 insertions, 0 deletions
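
The four wrappers compose into a typical Moses preprocessing pipeline: normalize punctuation, split into sentences, tokenize, and detokenize on the way back out. Below is a minimal sketch of that composition. It assumes the files are installed as the `mosestokenizer` package they expect to be (note the modules import from `mosestokenizer.*`, not from this `python-wrapper` directory), and that `perl`, the wrapped Moses scripts, and the `docopt`, `openfile`, and `toolwrapper` dependencies are available:

```python
# Sketch of a preprocessing pipeline built from the four wrappers.
# Assumes the modules are importable as the `mosestokenizer` package.
from mosestokenizer import (
    MosesPunctuationNormalizer,
    MosesSentenceSplitter,
    MosesTokenizer,
    MosesDetokenizer,
)

paragraph = [
    "«Hello World» — she said. And then",
    "she left.",
]

# Each wrapper spawns a long-lived Perl process; the with-statement
# guarantees close() is called and the child processes are freed.
with MosesPunctuationNormalizer("en") as normalize, \
        MosesSentenceSplitter("en") as split_sents, \
        MosesTokenizer("en") as tokenize, \
        MosesDetokenizer("en") as detokenize:
    sentences = split_sents([normalize(line) for line in paragraph])
    for sentence in sentences:
        tokens = tokenize(sentence)   # e.g. ['"', 'Hello', 'World', '"', ...]
        print(detokenize(tokens))
```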
diff --git a/scripts/tokenizer/python-wrapper/__init__.py b/scripts/tokenizer/python-wrapper/__init__.py
new file mode 100644
index 000000000..8ff517176
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/__init__.py
@@ -0,0 +1,31 @@
+"""
+Wrappers for several pre-processing scripts from the Moses toolkit.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+
+This package provides wrappers for the following Perl scripts:
+
+``tokenizer.perl``
+    class `mosestokenizer.tokenizer.MosesTokenizer`
+
+``detokenizer.perl``
+    class `mosestokenizer.detokenizer.MosesDetokenizer`
+
+``split-sentences.perl``
+    class `mosestokenizer.sentsplitter.MosesSentenceSplitter`
+
+``normalize-punctuation.perl``
+    class `mosestokenizer.punctnormalizer.MosesPunctuationNormalizer`
+
+"""
+
+from mosestokenizer.tokenizer import MosesTokenizer
+from mosestokenizer.detokenizer import MosesDetokenizer
+from mosestokenizer.sentsplitter import MosesSentenceSplitter
+from mosestokenizer.punctnormalizer import MosesPunctuationNormalizer
+
+__version__ = "1.0.0"
+
+__all__ = [
+    "MosesTokenizer",
+    "MosesDetokenizer",
+    "MosesSentenceSplitter",
+    "MosesPunctuationNormalizer",
+]
diff --git a/scripts/tokenizer/python-wrapper/detokenizer.py b/scripts/tokenizer/python-wrapper/detokenizer.py
new file mode 100644
index 000000000..95333414c
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/detokenizer.py
@@ -0,0 +1,82 @@
+"""
+A module for interfacing with ``detokenizer.perl`` from Moses.
+
+Copyright © 2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+    moses-detokenizer [options] <lang> [<inputfile> [<outputfile>]]
+    moses-detokenizer --selftest [--verbose]
+
+Options:
+    --selftest, -t  Run selftests.
+    --verbose, -v   Be more verbose.
+
+2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesDetokenizer(ToolWrapper):
+    """A class for interfacing with ``detokenizer.perl`` from Moses.
+
+    This class communicates with the detokenizer.perl process via pipes.
+    When the MosesDetokenizer object is no longer needed, the close()
+    method should be called to free system resources. The class supports
+    the context manager interface: if used in a with statement, the
+    close() method is invoked automatically.
+
+    >>> detokenize = MosesDetokenizer('en')
+    >>> detokenize(['Hello', 'World', '!'])
+    'Hello World!'
+    """
+
+    def __init__(self, lang="en"):
+        self.lang = lang
+        program = path.join(path.dirname(__file__), "detokenizer.perl")
+        # -q = quiet
+        # -b = disable output buffering
+        argv = ["perl", program, "-q", "-b", "-l", self.lang]
+        super().__init__(argv)
+
+    def __str__(self):
+        return "MosesDetokenizer(lang=\"{lang}\")".format(lang=self.lang)
+
+    def __call__(self, sentence):
+        """Detokenizes a single sentence (a list or tuple of tokens).
+
+        Newline characters are not allowed in tokens.
+        """
+        assert isinstance(sentence, (list, tuple))
+        assert all(isinstance(token, str) for token in sentence)
+        assert all("\n" not in token for token in sentence)
+        if not sentence:
+            return ""
+        self.writeline(" ".join(sentence))
+        return self.readline()
+
+
+def main():
+    args = docopt(usage)
+    if args["--selftest"]:
+        import doctest
+        import mosestokenizer.detokenizer
+        doctest.testmod(mosestokenizer.detokenizer)
+        if not args["<lang>"]:
+            sys.exit(0)
+    detokenize = MosesDetokenizer(args["<lang>"])
+    inputfile = openfile(args["<inputfile>"])
+    outputfile = openfile(args["<outputfile>"], "wt")
+    with inputfile, outputfile:
+        for line in inputfile:
+            print(detokenize(line.split()), file=outputfile)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/tokenizer/python-wrapper/punctnormalizer.py b/scripts/tokenizer/python-wrapper/punctnormalizer.py
new file mode 100644
index 000000000..73db1ace1
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/punctnormalizer.py
@@ -0,0 +1,84 @@
+"""
+A module for interfacing with ``normalize-punctuation.perl`` from Moses.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+    moses-punct-normalizer [options] <lang> [<inputfile> [<outputfile>]]
+    moses-punct-normalizer --selftest [--verbose]
+
+Options:
+    --selftest, -t  Run selftests.
+    --verbose, -v   Be more verbose.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesPunctuationNormalizer(ToolWrapper):
+    """A class for interfacing with ``normalize-punctuation.perl`` from
+    Moses.
+
+    This class communicates with the normalize-punctuation.perl process
+    via pipes. When the MosesPunctuationNormalizer object is no longer
+    needed, the close() method should be called to free system resources.
+    The class supports the context manager interface: if used in a with
+    statement, the close() method is invoked automatically.
+
+    >>> normalize = MosesPunctuationNormalizer("en")
+    >>> normalize("«Hello World» — she said…")
+    '"Hello World" - she said...'
+    """
+
+    def __init__(self, lang="en"):
+        self.lang = lang
+        program = path.join(
+            path.dirname(__file__),
+            "normalize-punctuation.perl"
+        )
+        # -b = disable output buffering
+        argv = ["perl", program, "-b", "-l", self.lang]
+        super().__init__(argv)
+
+    def __str__(self):
+        return "MosesPunctuationNormalizer(lang=\"{lang}\")".format(
+            lang=self.lang
+        )
+
+    def __call__(self, line):
+        """Normalizes punctuation of a single line of text.
+
+        Newline characters are not allowed in the text to be normalized.
+        """
+        assert isinstance(line, str)
+        line = line.strip()
+        assert "\n" not in line
+        if not line:
+            return ""
+        self.writeline(line)
+        return self.readline()
+
+
+def main():
+    args = docopt(usage)
+    if args["--selftest"]:
+        import doctest
+        import mosestokenizer.punctnormalizer
+        doctest.testmod(mosestokenizer.punctnormalizer)
+        if not args["<lang>"]:
+            sys.exit(0)
+    normalize = MosesPunctuationNormalizer(args["<lang>"])
+    inputfile = openfile(args["<inputfile>"])
+    outputfile = openfile(args["<outputfile>"], "wt")
+    with inputfile, outputfile:
+        for line in inputfile:
+            print(normalize(line), file=outputfile)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/tokenizer/python-wrapper/sentsplitter.py b/scripts/tokenizer/python-wrapper/sentsplitter.py
new file mode 100644
index 000000000..4fd58c7d7
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/sentsplitter.py
@@ -0,0 +1,134 @@
+"""
+A module for interfacing with ``split-sentences.perl`` from the Moses
+toolkit.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+    moses-sentence-splitter [options] <lang> [<inputfile> [<outputfile>]]
+    moses-sentence-splitter --selftest [--verbose]
+
+Options:
+    --selftest, -t  Run selftests.
+    --verbose, -v   Be more verbose.
+    --unwrap, -u    Assume that the text is wrapped and try to unwrap it.
+                    Note that this option will cause all consecutive
+                    non-empty lines to be buffered in memory. If you give
+                    this option, make sure that you have empty lines
+                    separating paragraphs. When this option is not given,
+                    each line is assumed to be an independent paragraph
+                    or sentence and thus will not be joined with other
+                    lines.
+    --more          Also split on colons and semi-colons.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesSentenceSplitter(ToolWrapper):
+    """
+    A class for interfacing with ``split-sentences.perl`` from the Moses
+    toolkit.
+
+    This class communicates with the split-sentences.perl process via
+    pipes. When the MosesSentenceSplitter object is no longer needed, the
+    close() method should be called to free system resources. The class
+    supports the context manager interface: if used in a with statement,
+    the close() method is invoked automatically.
+
+    When attribute ``more`` is True (the default), colons and semi-colons
+    are also treated as sentence separators.
+
+    >>> split_sents = MosesSentenceSplitter('en')
+    >>> split_sents(['Hello World! Hello', 'again.'])
+    ['Hello World!', 'Hello again.']
+
+    """
+
+    def __init__(self, lang="en", more=True):
+        self.lang = lang
+        program = path.join(
+            path.dirname(__file__),
+            "split-sentences.perl"
+        )
+        argv = ["perl", program, "-q", "-b", "-l", self.lang]
+        if more:
+            argv.append("-m")
+        super().__init__(argv)
+
+    def __str__(self):
+        return "MosesSentenceSplitter(lang=\"{lang}\")".format(lang=self.lang)
+
+    def __call__(self, paragraph):
+        """Splits sentences within a paragraph.
+
+        The paragraph is a list of non-empty lines. XML-like tags are not
+        allowed.
+        """
+        assert isinstance(paragraph, (list, tuple))
+        if not paragraph:  # empty paragraph is OK
+            return []
+        assert all(isinstance(line, str) for line in paragraph)
+        paragraph = [line.strip() for line in paragraph]
+        assert all(paragraph), "blank lines are not allowed"
+        for line in paragraph:
+            self.writeline(line)
+        # <P> marks end-of-paragraph; the script echoes it back after the
+        # last sentence of the paragraph.
+        self.writeline("<P>")
+        sentences = []
+        while True:
+            sentence = self.readline().strip()
+            if sentence == "<P>":
+                break
+            sentences.append(sentence)
+        return sentences
+
+
+def read_paragraphs(inputfile, wrapped=True):
+    lines = map(str.strip, inputfile)
+    if wrapped:
+        # Buffer consecutive non-empty lines; blank lines separate
+        # paragraphs.
+        paragraph = []
+        for line in lines:
+            if line:
+                paragraph.append(line)
+            elif paragraph:
+                yield paragraph
+                paragraph = []
+        if paragraph:
+            yield paragraph
+    else:
+        # Each line is an independent paragraph.
+        for line in lines:
+            yield [line] if line else []
+
+
+def write_paragraphs(paragraphs, outputfile, blank_sep=True):
+    for paragraph in paragraphs:
+        for sentence in paragraph:
+            print(sentence, file=outputfile)
+        if blank_sep or not paragraph:
+            print(file=outputfile)  # paragraph separator
+
+
+def main():
+    args = docopt(usage)
+    if args["--selftest"]:
+        import doctest
+        import mosestokenizer.sentsplitter
+        doctest.testmod(mosestokenizer.sentsplitter)
+        if not args["<lang>"]:
+            sys.exit(0)
+    split_sents = MosesSentenceSplitter(args["<lang>"], more=args["--more"])
+    inputfile = openfile(args["<inputfile>"])
+    outputfile = openfile(args["<outputfile>"], "wt")
+    with inputfile, outputfile:
+        paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"])
+        paragraphs = map(split_sents, paragraphs)
+        write_paragraphs(paragraphs, outputfile, blank_sep=args["--unwrap"])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/tokenizer/python-wrapper/tokenizer.py b/scripts/tokenizer/python-wrapper/tokenizer.py
new file mode 100644
index 000000000..eb5aec3dc
--- /dev/null
+++ b/scripts/tokenizer/python-wrapper/tokenizer.py
@@ -0,0 +1,93 @@
+"""
+A module for interfacing with ``tokenizer.perl`` from Moses.
+
+Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+usage = """
+Usage:
+    moses-tokenizer [options] <lang> [<inputfile> [<outputfile>]]
+    moses-tokenizer --selftest [--verbose]
+
+Options:
+    --selftest, -t  Run selftests.
+    --verbose, -v   Be more verbose.
+    --old           Use older version (1.0) of the tokenizer. If this
+                    option is not given, then version 1.1 will be used.
+
+2016, Luís Gomes <luismsgomes@gmail.com>
+"""
+
+
+from docopt import docopt
+from openfile import openfile
+from os import path
+from toolwrapper import ToolWrapper
+import sys
+
+
+class MosesTokenizer(ToolWrapper):
+    """A class for interfacing with ``tokenizer.perl`` from Moses.
+
+    This class communicates with the tokenizer.perl process via pipes.
+    When the MosesTokenizer object is no longer needed, the close() method
+    should be called to free system resources. The class supports the
+    context manager interface: if used in a with statement, the close()
+    method is invoked automatically.
+
+    >>> tokenize = MosesTokenizer('en')
+    >>> tokenize('Hello World!')
+    ['Hello', 'World', '!']
+    """
+
+    def __init__(self, lang="en", old_version=False):
+        self.lang = lang
+        program = path.join(
+            path.dirname(__file__),
+            "tokenizer-" + ("v1.0" if old_version else "v1.1") + ".perl"
+        )
+        argv = ["perl", program, "-q", "-l", self.lang]
+        if not old_version:
+            # -b = disable output buffering
+            # -a = aggressive hyphen splitting
+            argv.extend(["-b", "-a"])
+        super().__init__(argv)
+
+    def __str__(self):
+        return "MosesTokenizer(lang=\"{lang}\")".format(lang=self.lang)
+
+    def __call__(self, sentence):
+        """Tokenizes a single sentence.
+
+        Newline characters are not allowed in the sentence to be
+        tokenized.
+        """
+        assert isinstance(sentence, str)
+        sentence = sentence.rstrip("\n")
+        assert "\n" not in sentence
+        if not sentence:
+            return []
+        self.writeline(sentence)
+        return self.readline().split()
+
+
+def main():
+    args = docopt(usage)
+    if args["--selftest"]:
+        import doctest
+        import mosestokenizer.tokenizer
+        doctest.testmod(mosestokenizer.tokenizer)
+        if not args["<lang>"]:
+            sys.exit(0)
+    tokenize = MosesTokenizer(
+        args["<lang>"],
+        old_version=args["--old"],
+    )
+    inputfile = openfile(args["<inputfile>"])
+    outputfile = openfile(args["<outputfile>"], "wt")
+    with inputfile, outputfile:
+        for line in inputfile:
+            print(*tokenize(line), file=outputfile)
+
+
+if __name__ == "__main__":
+    main()
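
The module-level helpers in sentsplitter.py are also usable outside the CLI, for example to unwrap hard-wrapped text. A sketch, under the same packaging assumption as above; `wrapped.txt` is an illustrative filename, and paragraphs are assumed to be separated by blank lines (as the `--unwrap` option expects):

```python
# Sketch: unwrap hard-wrapped text and print one sentence per line,
# with a blank line between paragraphs (mirrors `moses-sentence-splitter -u`).
# Assumes the modules are importable as the `mosestokenizer` package;
# wrapped.txt is an illustrative input file.
from mosestokenizer.sentsplitter import MosesSentenceSplitter, read_paragraphs

with open("wrapped.txt") as inputfile, \
        MosesSentenceSplitter("en") as split_sents:
    for paragraph in read_paragraphs(inputfile, wrapped=True):
        for sentence in split_sents(paragraph):
            print(sentence)
        print()  # paragraph separator
```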