diff options
author | alvations <alvations@gmail.com> | 2015-03-20 21:37:34 +0300 |
---|---|---|
committer | alvations <alvations@gmail.com> | 2015-03-20 21:37:34 +0300 |
commit | e5feb1a73e1f85f6b0c71a33968681c1bf19d1c1 (patch) | |
tree | ca9a4697e29247953a283181e1bc3471025650f1 | |
parent | 8f2d687d27f560b8e09ecd5a19542dde0507d84e (diff) |
Enforce python3 and also remove extra empty newline from STDOUT
-rw-r--r-- | scripts/other/gacha_filter.py | 14 |
1 files changed, 7 insertions, 7 deletions
diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py
index 4ebc501ac..1ec1f4616 100644
--- a/scripts/other/gacha_filter.py
+++ b/scripts/other/gacha_filter.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python -*- coding: utf-8 -*-
+#!/usr/bin/env python3 -*- coding: utf-8 -*-
 
 """
 The Gacha filter cleans out sentence pairs that have global character mean
@@ -25,18 +25,18 @@ where:
 
 USAGE:
 
-    $ python gacha_filter.py train.en train.de
+    $ python3 gacha_filter.py train.en train.de
 
 Outputs to STDOUT a separated lines of the source and target sentence pairs.
 You can simply cut the file after that.
 
-    $ python gacha_filter.py train.en train.de > train.en-de
+    $ python3 gacha_filter.py train.en train.de > train.en-de
     $ cut -f1 train.en-de > train.clean.en
     $ cut -f2 train.en-de > train.clean.de
 
 You can also allow lower threshold to yield more lines:
 
-    $ python gacha_filter.py train.en train.de 0.05
+    $ python3 gacha_filter.py train.en train.de 0.05
 
 Default threshold is set to 0.2.
 """
@@ -77,15 +77,15 @@ def main(sourcefile, targetfile, threshold=0.2):
          io.open(targetfile, 'r', encoding='utf8') as trgfin:
         for s, t in zip(srcfin, trgfin):
             if lowerbound < len(s) / float(len(t)) < upperbound:
-                print(u"{}\t{}\n".format(s.strip(),t.strip()))
+                print(u"{}\t{}".format(s.strip(),t.strip()))
 
 
 if __name__ == '__main__':
     import sys
     if len(sys.argv) not in range(3,5):
-        usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
+        usage_msg = err_msg('Usage: python3 %s srcfile trgfile (threshold)\n'
                             % sys.argv[0])
-        example_msg = err_msg('Example: python %s ~/Europarl.de-en.de '
+        example_msg = err_msg('Example: python3 %s ~/Europarl.de-en.de '
                               '~/Europarl.de-en.en 0.4\n' % sys.argv[0])
         sys.stderr.write(usage_msg)
         sys.stderr.write(example_msg)