python/word_alignment.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

import pyvw

# the dataset is triples of E, A, F where A[i] = list of words E_i
# aligned to, or [] for null-aligned
my_dataset = [
    ( "the blue house".split(),
      ([0], [2], [1]),
      "la maison bleue".split() ),
    ( "the house".split(),
      ([0], [1]),
      "la maison".split() ),
    ( "the flower".split(),
      ([0], [1]),
      "la fleur".split() )
    ]

my_dataset2 = [
    ( "mary did not slap    the green witch".split(),
      ([0], [], [1],[2,3,4],[6],[8],  [7]),
      "maria no dio una bofetada a la bruja verde".split() ) ]
      #  0   1   2   3     4     5  6   7     8

def alignmentError(true, sys):
    t = set(true)
    s = set(sys)
    if len(t | s) == 0: return 0.
    return 1. - float(len(t & s)) / float(len(t | s))
    
class WordAligner(pyvw.SearchTask):
    def __init__(self, vw, sch, num_actions):
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        sch.set_options( sch.AUTO_HAMMING_LOSS | sch.IS_LDF | sch.AUTO_CONDITION_FEATURES )

    def makeExample(self, E, F, i, j0, l):
        f  = 'Null' if j0 is None else [ F[j0+k] for k in range(l+1) ]
        ex = self.vw.example( { 'e': E[i],
                                'f': f,
                                'p': '_'.join(f),
                                'l': str(l),
                                'o': [str(i-j0), str(i-j0-l)] if j0 is not None else [] },
                              labelType = self.vw.lCostSensitive )
        lab = 'Null' if j0 is None else str(j0+l)
        ex.set_label_string(lab + ':0')
        return ex
        
    def _run(self, alignedSentence):
        E,A,F = alignedSentence

        # for each E word, we pick a F span
        covered = {}  # which F words have been covered so far?
        output  = []
        
        for i in range(len(E)):
            examples = []  # contains vw examples
            spans    = []  # contains triples (alignment error, index in examples, [range])
            
            # empty span:
            examples.append( self.makeExample(E, F, i, None, None) )
            spans.append( (alignmentError(A[i], []), 0, []) )

            # non-empty spans
            for j0 in range(len(F)):
                for l in range(3):   # max phrase length of 3
                    if j0+l >= len(F): break
                    if covered.has_key(j0+l): break

                    id = len(examples)
                    examples.append( self.makeExample(E, F, i, j0, l) )
                    spans.append( (alignmentError(A[i], range(j0,j0+l+1)), id, range(j0,j0+l+1)) )

            sortedSpans = []
            for s in spans: sortedSpans.append(s)
            sortedSpans.sort()
            oracle = []
            for id in range(len(sortedSpans)):
                if sortedSpans[id][0] > sortedSpans[0][0]: break
                oracle.append( sortedSpans[id][1] )
                
            pred = self.sch.predict(examples  = examples,
                                    my_tag    = i+1,
                                    oracle    = oracle,
                                    condition = [ (i, 'p'), (i-1, 'q') ] )

            for ex in examples: ex.finish()

            output.append( spans[pred][2] )
            for j in spans[pred][2]:
                covered[j] = True

        return output
    

print 'training LDF'
vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet -q ef -q ep")
task = vw.init_search_task(WordAligner)
for p in range(10):
    task.learn(my_dataset.__iter__)
print '====== test ======'
print task.predict( ("the blue flower".split(), ([],[],[]), "la fleur bleue".split()) )
print 'should have printed [[0], [2], [1]]'