scripts/tokenizer/pre_tokenize_cleaning.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

#!/usr/bin/env python -*- coding: utf-8 -*-
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""
The Gacha filter cleans out sentence pairs that have global character mean
lower than a certain threshold.

Use this cleaner to produce low quantity of high quality sentence pairs.

It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)

This is inspired by the global character mean that is used in the Gale-Church
algorithm (Gale aand Church, 1993), the c variable in:

    delta = (l2-l1*c)/math.sqrt(l1*s2)

where:
 - l1 = len(source_sentence)
 - l2 = len(target_sentence)
 - c = global mean, i.e. #char in source corpus / #char in target corpus
 - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1)

(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
"""

import io
import subprocess


red = '\033[01;31m'
native = '\033[m'


def err_msg(txt):
    return red + txt + native


def num_char(filename):
    process = subprocess.Popen(
        ["wc", "-m", filename], stdout=subprocess.PIPE)
    # TODO: Was this meant to call communicate()?
    return float(process.stdout.read().split()[0])


def gacha_mean(sourcefile, targetfile):
    """
    Counts the global character mean between source and target language as
    in Gale-Church (1993)
    """
    sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
    c = num_char(sourcefile) / num_char(targetfile)
    sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
    sys.stderr.write(err_msg('Filtering starts ...\n'))
    return c


def io_open(path):
    """Open file `path` for reading, as a UTF-8 text file."""
    return io.open(path, 'r', encoding='utf8')


def main(sourcefile, targetfile, threshold=0.2):
    # Calculates Gacha mean.
    c = gacha_mean(sourcefile, targetfile)
    # Calculates lower and upperbound for filtering
    threshold = float(threshold)
    lowerbound = (1 - threshold) * c
    upperbound = (1 + threshold) * c

    # Start filtering sentences.
    with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
        for s, t in zip(srcfin, trgfin):
            if lowerbound < len(s) / float(len(t)) < upperbound:
                print(u"{}\t{}\n".format(s.strip(), t.strip()))


if __name__ == '__main__':
    import sys
    if len(sys.argv) not in range(3, 5):
        usage_msg = err_msg(
            "Usage: python %s srcfile trgfile (threshold)\n"
            % sys.argv[0])

        example_msg = err_msg(
            "Example: "
            "gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n"
            % sys.argv[0])
        sys.stderr.write(usage_msg)
        sys.stderr.write(example_msg)
        sys.exit(1)

    main(*sys.argv[1:])