1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
from collections import Counter
import logging
import sys
# Module-level logger for progress and summary messages.
LOG = logging.getLogger(__name__)
# Sentence-boundary and unknown-word markers, following the usual
# n-gram language-model conventions.
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"
def replace_tags(tokens, tags, vocab):
    """Replace out-of-vocabulary tokens with their tags, in place.

    tokens -- list of surface tokens; modified in place
    tags   -- list of tags expected to align 1:1 with tokens
    vocab  -- container supporting `in` tests for known tokens

    Each token not found in vocab is overwritten by the tag at the same
    index.  If the tag sequence is shorter than the token sequence the
    token becomes UNK and a diagnostic is emitted.
    """
    for i, t in enumerate(tokens):
        if t not in vocab:
            if i < len(tags):
                tokens[i] = tags[i]
            else:
                # Tag/token length mismatch: report on stderr (not
                # stdout, so diagnostics cannot be mistaken for normal
                # output) and fall back to the unknown-word marker.
                sys.stderr.write("Error: missing tags for index i: %d\n" % i)
                sys.stderr.write(' '.join(tokens) + '\n')
                sys.stderr.write(' '.join(tags) + '\n')
                tokens[i] = UNK
def replace_unks(tokens, vocab):
    """Overwrite every out-of-vocabulary token with the UNK marker, in place.

    tokens -- list of surface tokens; modified in place
    vocab  -- container supporting `in` tests for known tokens
    """
    for idx in range(len(tokens)):
        if tokens[idx] not in vocab:
            tokens[idx] = UNK
def numberize(line, m, n, svocab, tvocab):
    """Convert one extracted ngram line from words to vocabulary ids.

    The first 2*m+1 whitespace-separated fields are source-side words
    (looked up in svocab); the last n fields are target-side words
    (looked up in tvocab).  Returns the space-joined id string with a
    trailing newline.
    """
    words = line.split()
    source_ids = (str(svocab[w]) for w in words[:2 * m + 1])
    target_ids = (str(tvocab[w]) for w in words[-n:])
    return ' '.join(source_ids) + ' ' + ' '.join(target_ids) + '\n'
def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,
               tlang, m, n, ofh):
    """
    Extract source-window/target-history ngram examples for an
    NNJM-style joint model (Devlin et al. 2014).

    corpus_stem - path stem of the parallel corpus; "." + slang/tlang
                  is appended to form the two file names
    align_file  - word alignments, one line per sentence pair of
                  "spos-tpos" tokens (source -> target)
    tagged_stem - optional path stem of tagged corpora; when set, OOV
                  tokens are replaced by their tags, otherwise by UNK
    svocab, tvocab - source/target vocabularies (containment checks)
    m - source context (window of m words either side of the affiliated
        source word)
    n - target context (history of n-1 words plus the predicted word)
    ofh - output file handle; one example per line; closed on return
    returns set of tags used
    """
    tags = Counter()
    # NOTE(review): file handles are opened without a with-block and only
    # ofh is explicitly closed below; the input handles are left to be
    # reclaimed by the interpreter.
    sfh = open(corpus_stem + "." + slang)
    tfh = open(corpus_stem + "." + tlang)
    afh = open(align_file)
    fhs = [sfh, tfh, afh]
    if tagged_stem:
        fhs.append(open(tagged_stem + "." + slang))
        fhs.append(open(tagged_stem + "." + tlang))
    count = 0
    ngrams = 0
    LOG.info("Extracting ngrams")
    # Iterate the corpora, alignments and (optionally) tag files in
    # lockstep, one sentence per iteration.
    for lines in zip(*fhs):
        # [:-1] strips the trailing newline from each raw line.
        stokens = lines[0][:-1].split()
        ttokens = lines[1][:-1].split()
        # Append an explicit end-of-sentence token to both sides.
        stokens.append(EOS)
        ttokens.append(EOS)
        if tagged_stem:
            stags = lines[3][:-1].split()
            ttags = lines[4][:-1].split()
            stags.append(EOS)
            ttags.append(EOS)
            # Record which tags were used so the caller can extend the
            # vocabularies with them.
            tags.update(stags)
            tags.update(ttags)
            # OOV tokens are replaced by their tags (falling back to UNK
            # on tag/token length mismatch).
            replace_tags(stokens, stags, svocab)
            replace_tags(ttokens, ttags, tvocab)
        else:
            replace_unks(stokens, svocab)
            replace_unks(ttokens, tvocab)
        # List aligns for each target.
        # Note: align specifies source -> target
        target_aligns = [[] for t in range(len(ttokens))]
        for atoken in lines[2][:-1].split():
            spos, tpos = atoken.split("-")
            spos, tpos = int(spos), int(tpos)
            target_aligns[tpos].append(spos)
        # EOS alignment: the appended target EOS is always aligned to
        # the appended source EOS.
        target_aligns[-1] = [len(stokens) - 1]
        for tpos, spos_list in enumerate(target_aligns):
            # Affiliation heuristics - see Devlin et al. (2014), p.1371.
            if not spos_list:
                # tpos has no alignment, look right, then left, then
                # right-right, then left-left etc.
                rpos = tpos + 1
                lpos = tpos - 1
                while rpos < len(ttokens) or lpos >= 0:
                    if rpos < len(ttokens) and target_aligns[rpos]:
                        spos_list = target_aligns[rpos]
                        break
                    if lpos >= 0 and target_aligns[lpos]:
                        spos_list = target_aligns[lpos]
                        break
                    rpos += 1
                    lpos -= 1
            if not spos_list:
                # Unreachable in practice because the EOS alignment set
                # above guarantees at least one aligned target position.
                raise Exception(
                    "No alignments in sentence \nSRC: " +
                    lines[0][:-1] + "\nTGT: " + lines[1][:-1])
            # Affiliated source word: the middle of the sorted alignment
            # list.  NOTE: Python 2 integer division; under Python 3 this
            # `/` would yield a float and break the indexing below.
            midpos = (len(spos_list) - 1) / 2
            spos = sorted(spos_list)[midpos]
            # source-context, target-context, predicted word
            # The trailing commas on the Python 2 print statements below
            # suppress the newline so one example is built up as a single
            # space-separated output line.
            # Pad with BOS where the source window overruns the left edge.
            for i in range(max(0, m - spos)):
                print>>ofh, BOS,
            # print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
            print>>ofh, " ".join(
                [s for s in stokens[max(0, spos - m):spos + m + 1]]),
            # Pad with EOS where the source window overruns the right edge.
            for i in range(max(0, spos + m + 1 - len(stokens))):
                print>>ofh, EOS,
            # Pad with BOS where the target history overruns the left edge.
            for i in range(max(0, n - (tpos + 1))):
                print>>ofh, BOS,
            # Target history ending at (and including) the predicted word.
            print>>ofh, " ".join(
                [t for t in ttokens[max(0, tpos + 1 - n):tpos + 1]]),
            # Bare print terminates the example line.
            print>>ofh
            ngrams += 1
        count += 1
        # Lightweight progress indicator on stderr.
        if count % 1000 == 0:
            sys.stderr.write(".")
            if count % 50000 == 0:
                sys.stderr.write(" [%d]\n" % count)
    ofh.close()
    sys.stderr.write("\n")
    LOG.info("Extracted %d ngrams" % ngrams)
    return tags
|