diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2015-03-22 01:42:55 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2015-03-22 01:42:55 +0300 |
commit | c4af7d28b5fd5c06dcc44d6c67dfe8d22aaac6ea (patch) | |
tree | 78677b4f1be1135514868a37bce8aa5e6809347e | |
parent | ddf7bc3e234bdbdbe80f49e07ac4e4b7ba1e9bbc (diff) | |
parent | e5feb1a73e1f85f6b0c71a33968681c1bf19d1c1 (diff) |
Merge pull request #100 from alvations/master
Added the Gacha Filter used in WMT14 by the Manawi system
-rw-r--r-- | scripts/other/gacha_filter.py | 94 |
1 files changed, 94 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
The Gacha filter cleans out sentence pairs that have global character mean
lower than a certain threshold.

Use this cleaner to produce low quantity of high quality sentence pairs.

It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)

This is inspired by the global character mean that is used in the Gale-Church
algorithm (Gale and Church, 1993), the c variable in:

    delta = (l2-l1*c)/math.sqrt(l1*s2)

where:
 - l1 = len(source_sentence)
 - l2 = len(target_sentence)
 - c = global mean, i.e. #char in source corpus / #char in target corpus
 - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1)

(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)

USAGE:

    $ python3 gacha_filter.py train.en train.de

Outputs to STDOUT tab-separated lines of the source and target sentence pairs.
You can simply cut the file after that.

    $ python3 gacha_filter.py train.en train.de > train.en-de
    $ cut -f1 train.en-de > train.clean.en
    $ cut -f2 train.en-de > train.clean.de

You can also allow lower threshold to yield more lines:

    $ python3 gacha_filter.py train.en train.de 0.05

Default threshold is set to 0.2.
"""

import io
import subprocess
import sys  # BUGFIX: was imported only under __main__; gacha_mean() uses it,
            # so importing this file as a module raised NameError.

# ANSI escape sequences used to highlight status messages on stderr.
red = '\033[01;31m'
native = '\033[m'


def err_msg(txt):
    """Return *txt* wrapped in ANSI codes so it renders red on a terminal."""
    return red + txt + native


def num_char(filename):
    """Return the character count of *filename* as a float, via ``wc -m``."""
    # argv is passed as a list (shell=False), so the filename is never
    # interpreted by a shell.
    output = subprocess.Popen(["wc", "-m", filename],
                              stdout=subprocess.PIPE).stdout.read()
    return float(output.split()[0])


def gacha_mean(sourcefile, targetfile):
    """
    Counts the global character mean between source and target language as
    in Gale-Church (1993): c = #chars(sourcefile) / #chars(targetfile).

    Progress is reported on stderr so STDOUT stays clean for the filter
    output.
    """
    sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
    c = num_char(sourcefile) / num_char(targetfile)
    sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
    sys.stderr.write(err_msg('Filtering starts ...\n'))
    return c


def main(sourcefile, targetfile, threshold=0.2):
    """
    Print to STDOUT (tab-separated) every sentence pair whose length ratio
    len(source_line)/len(target_line) lies strictly between
    (1 - threshold) * c and (1 + threshold) * c, where c is the global
    character mean of the two files.
    """
    # Calculates Gacha mean.
    c = gacha_mean(sourcefile, targetfile)
    # Calculates lower and upper bound for filtering.
    threshold = float(threshold)
    lowerbound = (1 - threshold) * c
    upperbound = (1 + threshold) * c

    # Start filtering sentences. NOTE: zip() stops at the shorter file, so
    # unequal line counts are silently truncated -- inputs are assumed to be
    # parallel corpora with one sentence per line.
    with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
            io.open(targetfile, 'r', encoding='utf8') as trgfin:
        for s, t in zip(srcfin, trgfin):
            # Lines keep their trailing newline, so len(t) is never zero
            # for lines read from the file.
            if lowerbound < len(s) / float(len(t)) < upperbound:
                print(u"{}\t{}".format(s.strip(), t.strip()))


if __name__ == '__main__':
    # Accept exactly 2 or 3 positional arguments (threshold is optional).
    if len(sys.argv) not in range(3, 5):
        usage_msg = err_msg('Usage: python3 %s srcfile trgfile (threshold)\n'
                            % sys.argv[0])
        example_msg = err_msg('Example: python3 %s ~/Europarl.de-en.de '
                              '~/Europarl.de-en.en 0.4\n' % sys.argv[0])
        sys.stderr.write(usage_msg)
        sys.stderr.write(example_msg)
        sys.exit(1)

    main(*sys.argv[1:])