// moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h

// -*- c++ -*-
// lexical phrase scorer, version 2
// written by Ulrich Germann

// Is the +1 in computing the lexical probabilities taken from the original phrase-scoring code?

#ifndef __ug_lexical_phrase_scorer_h
#define __ug_lexical_phrase_scorer_h

#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "tpt_tokenindex.h"
#include <string>
#include <boost/unordered_map.hpp>
#include "tpt_pickler.h"
#include "ug_mm_2d_table.h"
#include "util/exception.hh"
using namespace std;
namespace ugdiss
{

  // Lexical phrase scorer built on a 2-d count table (COOC).
  //
  // TKN is the token type of the sentences being scored; the member
  // definitions in this header only require that it offers an id() accessor
  // whose result can be passed as an id_type.
  //
  // Both score() overloads write log-domain scores into fwd_score and
  // bwd_score; they differ only in how the word alignment is supplied.
  template<typename TKN>
  class LexicalPhraseScorer2
  {
    // Not referenced by any member function defined in this header.
    std::vector<std::string> ftag;

  public:
    typedef mm2dTable<id_type, id_type, uint32_t, uint32_t> table_t;

    // Count table: indexed as COOC[s][t], with marginals COOC.m1(s) and
    // COOC.m2(t). Presumably s is a source-side and t a target-side word id
    // -- confirm against the code that builds the table.
    table_t COOC;

    // Open the count table stored under fname.
    void open(std::string const& fname);

    // Score the span pair [s1,e1) of snt1 / [s2,e2) of snt2.
    // aln holds the alignment as a flat sequence of position pairs
    // (position in snt1, position in snt2, position in snt1, ...).
    // alpha is the additive smoothing constant handed to plup_fwd/plup_bwd.
    template<typename someint>
    void
    score(TKN const* snt1, size_t const s1, size_t const e1,
          TKN const* snt2, size_t const s2, size_t const e2,
          std::vector<someint> const& aln, float const alpha,
          float& fwd_score, float& bwd_score) const;

    // Same as above, but the alignment is read from the byte range
    // [aln_start,aln_end) with binread(), two values per link.
    void
    score(TKN const* snt1, size_t const s1, size_t const e1,
          TKN const* snt2, size_t const s2, size_t const e2,
          char const* const aln_start, char const* const aln_end,
          float const alpha, float& fwd_score, float& bwd_score) const;

    // plup: permissive lookup.
    // Both return 1.0 when either marginal count is zero; otherwise
    // (COOC[s][t] + alpha) divided by (marginal + alpha), where the marginal
    // is m1(s) for plup_fwd and m2(t) for plup_bwd.
    float plup_fwd(id_type const s, id_type const t, float const alpha) const;
    float plup_bwd(id_type const s, id_type const t, float const alpha) const;

    // To be done:
    // - on-the-fly smoothing?
    // - better (than permissive lookup) treatment of unknown combinations;
    //   permissive lookup is currently used for compatibility reasons
    // - Zens-Ney smoothed scoring via noisy-or combination
  };
  
  // Open the count table: forwards fname unchanged to COOC.open().
  template<typename TKN>
  void
  LexicalPhraseScorer2<TKN>::
  open(string const& fname)
  {
    this->COOC.open(fname);
  }

  // Compute forward and backward lexical scores (in log domain) for the
  // span pair [s1,e1) of snt1 and [s2,e2) of snt2.
  //
  // aln is a flat list of alignment links: element 2k is a position in snt1,
  // element 2k+1 the linked position in snt2. Links with either end outside
  // the respective span are ignored. A trailing unpaired element (odd-length
  // aln) is ignored as well instead of being read past the end of the vector.
  //
  // For every position in [s1,e1), fwd_score accumulates the log of the
  // average plup_fwd() value over that position's links; a position without
  // any link contributes log(plup_fwd(id, 0, alpha)) instead. bwd_score is
  // computed the same way over [s2,e2) using plup_bwd().
  //
  // alpha is passed through to plup_fwd()/plup_bwd(); fwd_score and bwd_score
  // are overwritten, not added to.
  template<typename TKN>
  template<typename someint>
  void
  LexicalPhraseScorer2<TKN>::
  score(TKN const* snt1, size_t const s1, size_t const e1,
	TKN const* snt2, size_t const s2, size_t const e2,
	vector<someint> const & aln, float const alpha,
	float & fwd_score, float& bwd_score) const
  {
    // Indexed by absolute sentence position, hence sized e1 / e2.
    vector<float> p1(e1,0), p2(e2,0);
    vector<int>   c1(e1,0), c2(e2,0);

    // Consume the alignment two values at a time. The k+1 bound keeps the
    // second read inside the vector even when aln.size() is odd.
    for (size_t k = 0; k + 1 < aln.size(); k += 2)
      {
        size_t const i1 = aln[k];
        size_t const i2 = aln[k+1];
        if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
        p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha);
        ++c1[i1];
        p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha);
        ++c2[i2];
      }

    fwd_score = 0;
    for (size_t i = s1; i < e1; ++i)
      {
        if (c1[i] == 1) fwd_score += log(p1[i]);
        else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]); // log of the mean
        else            fwd_score += log(plup_fwd(snt1[i].id(),0,alpha));
      }

    bwd_score = 0;
    for (size_t i = s2; i < e2; ++i)
      {
        if (c2[i] == 1) bwd_score += log(p2[i]);
        else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]); // log of the mean
        else            bwd_score += log(plup_bwd(0,snt2[i].id(),alpha));
      }
  }

  // Permissive forward lookup.
  // Returns 1.0 if either marginal count (m1 of s, m2 of t) is zero;
  // otherwise (COOC[s][t] + alpha) / (COOC.m1(s) + alpha).
  // A negative alpha is rejected via UTIL_THROW_IF2, but only once both
  // marginals are known to be non-zero.
  template<typename TKN>
  float
  LexicalPhraseScorer2<TKN>::
  plup_fwd(id_type const s, id_type const t, float const alpha) const
  {
    if (COOC.m1(s) == 0)
      return 1.0;
    if (COOC.m2(t) == 0)
      return 1.0;

    UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
		   << ": alpha parameter must be >= 0");

    return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha);
  }
  
  // Permissive backward lookup.
  // Returns 1.0 if either marginal count (m1 of s, m2 of t) is zero;
  // otherwise (COOC[s][t] + alpha) / (COOC.m2(t) + alpha).
  // A negative alpha is rejected via UTIL_THROW_IF2, but only once both
  // marginals are known to be non-zero.
  template<typename TKN>
  float
  LexicalPhraseScorer2<TKN>::
  plup_bwd(id_type const s, id_type const t,float const alpha) const
  {
    if (COOC.m1(s) == 0)
      return 1.0;
    if (COOC.m2(t) == 0)
      return 1.0;

    UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
		   << ": alpha parameter must be >= 0");

    return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha);
  }
  
  // Compute forward and backward lexical scores (in log domain) for the
  // span pair [s1,e1) of snt1 and [s2,e2) of snt2, with the alignment given
  // as the byte range [aln_start,aln_end).
  //
  // Links are decoded with binread(), two values per link: first the
  // position in snt1, then the position in snt2. Links with either end
  // outside the respective span are skipped.
  //
  // Each position in [s1,e1) adds to fwd_score the log of its average
  // plup_fwd() value over its links, or log(plup_fwd(id, 0, alpha)) when it
  // has none; bwd_score is the mirror image over [s2,e2) with plup_bwd().
  // Both output scores are overwritten.
  template<typename TKN>
  void
  LexicalPhraseScorer2<TKN>::
  score(TKN const* snt1, size_t const s1, size_t const e1,
	TKN const* snt2, size_t const s2, size_t const e2,
	char const* const aln_start, char const* const aln_end,
	float const alpha, float & fwd_score, float& bwd_score) const
  {
    // Accumulators are indexed by absolute sentence position.
    vector<float> fwd_sum(e1,0), bwd_sum(e2,0);
    vector<int>   fwd_cnt(e1,0), bwd_cnt(e2,0);

    size_t src = 0, trg = 0;
    char const* cursor = aln_start;
    while (cursor < aln_end)
      {
        cursor = binread(binread(cursor,src),trg);
        bool const inside = (s1 <= src && src < e1 && s2 <= trg && trg < e2);
        if (inside)
          {
            id_type const sid = snt1[src].id();
            id_type const tid = snt2[trg].id();
            fwd_sum[src] += plup_fwd(sid,tid,alpha);
            ++fwd_cnt[src];
            bwd_sum[trg] += plup_bwd(sid,tid,alpha);
            ++bwd_cnt[trg];
          }
      }

    fwd_score = 0;
    for (size_t pos = s1; pos < e1; ++pos)
      {
        switch (fwd_cnt[pos])
          {
          case 0: // no link: score against id 0
            fwd_score += log(plup_fwd(snt1[pos].id(),0,alpha));
            break;
          case 1:
            fwd_score += log(fwd_sum[pos]);
            break;
          default: // several links: log of the mean
            fwd_score += log(fwd_sum[pos])-log(fwd_cnt[pos]);
            break;
          }
      }

    bwd_score = 0;
    for (size_t pos = s2; pos < e2; ++pos)
      {
        switch (bwd_cnt[pos])
          {
          case 0: // no link: score against id 0
            bwd_score += log(plup_bwd(0,snt2[pos].id(),alpha));
            break;
          case 1:
            bwd_score += log(bwd_sum[pos]);
            break;
          default: // several links: log of the mean
            bwd_score += log(bwd_sum[pos])-log(bwd_cnt[pos]);
            break;
          }
      }
  }
}
#endif