Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoralvations <alvations@gmail.com>2015-03-20 20:48:47 +0300
committeralvations <alvations@gmail.com>2015-03-20 20:48:47 +0300
commit93ea5853e85a5a83e3aeea528bc07451bd43f70e (patch)
treebab78cdbca11db9ac8ca70424964718f1ccff611
parentb8ca33c34ee7108c52f5afe6c5f6dc2722c0dbab (diff)
Added Gacha Filter used by Manawi system from WMT14 translation task
-rw-r--r--scripts/other/gacha_filter.py78
1 files changed, 78 insertions, 0 deletions
diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py
new file mode 100644
index 000000000..76736da5c
--- /dev/null
+++ b/scripts/other/gacha_filter.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The Gacha filter cleans out sentence pairs whose source/target character
+length ratio deviates from the global character mean by more than a certain
+threshold.
+
+Use this cleaner to produce a small quantity of high-quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
+(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
+
+This is inspired by the global character mean that is used in the Gale-Church
+algorithm (Gale and Church, 1993), the c variable in:
+
+ delta = (l2-l1*c)/math.sqrt(l1*s2)
+
+where:
+ - l1 = len(source_sentence)
+ - l2 = len(target_sentence)
+ - c = global mean, i.e. #char in source corpus / #char in target corpus
+ - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1)
+
+(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
+"""
+
+import io, subprocess
+
+# ANSI escape sequences used to colour messages written to the terminal.
+red = '\x1b[01;31m'  # start bold red text
+native = '\x1b[m'  # reset to the terminal's default attributes
+
+def err_msg(txt):
+    """Returns `txt` wrapped in the `red` and `native` escape sequences."""
+    return ''.join((red, txt, native))
+
+def num_char(filename):
+    """
+    Returns the number of characters in `filename` as a float.
+
+    The count is delegated to the external `wc -m` command, so what counts as
+    one character follows the current locale. `check_output` waits for `wc`
+    to exit and closes its pipe, so no child process or file descriptor is
+    left behind.
+
+    Raises `subprocess.CalledProcessError` if `wc` exits with a non-zero
+    status (e.g. the file cannot be read) and `OSError` if `wc` cannot be
+    executed at all.
+    """
+    output = subprocess.check_output(["wc", "-m", filename])
+    # `wc -m` prints "<count> <filename>"; only the leading count is needed.
+    return float(output.split()[0])
+
+def gacha_mean(sourcefile, targetfile):
+    """
+    Counts the global character mean between source and target language as
+    in Gale-Church (1993), i.e. #char in source file / #char in target file.
+
+    Progress messages are written to stderr. Returns the ratio as a float;
+    a target file containing zero characters raises ZeroDivisionError.
+    """
+    # Imported here because the module-level `import sys` only runs under the
+    # `__main__` guard; without it this function fails with NameError when the
+    # module is imported instead of being executed as a script.
+    import sys
+
+    sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
+    c = num_char(sourcefile) / num_char(targetfile)
+    sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
+    sys.stderr.write(err_msg('Filtering starts ...\n'))
+    return c
+
+def main(sourcefile, targetfile, threshold=0.2):
+    """
+    Prints the sentence pairs whose source/target length ratio lies strictly
+    within `threshold` (a fraction, e.g. 0.2) of the global Gacha mean.
+
+    `sourcefile` and `targetfile` are read in parallel as UTF-8, one line
+    each per pair. `threshold` may be a number or a numeric string.
+    """
+    # Global character ratio of the two files.
+    mean = gacha_mean(sourcefile, targetfile)
+    # Accepted window around the mean.
+    tolerance = float(threshold)
+    lowerbound, upperbound = (1-tolerance) * mean, (1+tolerance) * mean
+
+    with io.open(sourcefile, 'r', encoding='utf8') as srcfin:
+        with io.open(targetfile, 'r', encoding='utf8') as trgfin:
+            for src_line, trg_line in zip(srcfin, trgfin):
+                # Lengths are taken on the raw lines, before stripping.
+                ratio = len(src_line) / float(len(trg_line))
+                if not (lowerbound < ratio < upperbound):
+                    continue
+                # The explicit "\n" plus print's own newline leaves a blank
+                # line after every emitted pair.
+                pair = u"{}\t{}\n".format(src_line.strip(), trg_line.strip())
+                print(pair)
+
+if __name__ == '__main__':
+    import sys
+    # Expect two file paths plus an optional threshold.
+    if len(sys.argv) not in range(3,5):
+        usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
+                            % sys.argv[0])
+
+        # The example carries a %s placeholder for the script name: applying
+        # `%` to a string without one raises TypeError before any usage text
+        # is shown.
+        example_msg = err_msg('Example: python %s ~/Europarl.de-en.de '
+                              '~/Europarl.de-en.en 0.4\n'
+                              % sys.argv[0])
+        sys.stderr.write(usage_msg)
+        sys.stderr.write(example_msg)
+        sys.exit(1)
+
+    # Arguments are forwarded as strings; main() converts the threshold.
+    main(*sys.argv[1:])