Welcome to mirror list, hosted at ThFree Co, Russian Federation.

ForestRescore.h « mert - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 900275b747d1cb39e13dd719e426a2006fe84344 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2014- University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
***********************************************************************/
#ifndef MERT_FOREST_RESCORE_H
#define MERT_FOREST_RESCORE_H

#include <valarray>
#include <vector>

#include <boost/unordered_set.hpp>

#include "BleuScorer.h"
#include "Hypergraph.h"

namespace MosesTuning {

/**
  * Stream-insertion overload for a WordVec.
  * Only declared in this header; the definition (and therefore the exact
  * output format) is not visible here.
**/
std::ostream& operator<<(std::ostream& out, const WordVec& wordVec);

struct NgramHash : public std::unary_function<const WordVec&, std::size_t> {
  std::size_t operator()(const WordVec& ngram) const {
    return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type));
  }
};

struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> {
  bool operator()(const WordVec& first, const WordVec& second) const {
    if (first.size() != second.size()) return false;
    return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0;
  }
};

/**
  * Hash map keyed by n-gram (WordVec) with a size_t value, using NgramHash and
  * NgramEquals for hashing and key comparison.
  * Presumably the value is an occurrence count, as the name suggests.
  * NOTE(review): this header includes only <boost/unordered_set.hpp> directly;
  * boost::unordered_map must be reaching it through another include - confirm.
**/
typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter;


/**
  * Holds per-sentence reference data: a map of n-gram counts and a length,
  * both stored in vectors. Length() indexes lengths_ by sentence id;
  * ngramCounts_ is presumably indexed the same way.
  * Most members are only declared here; their definitions are not visible in
  * this header.
**/
class ReferenceSet {


public:
  
  // Presumably registers `line` as reference text for sentence `sentenceId`,
  // using `vocab` (taken by non-const reference) to map words - definition not
  // visible here.
  void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab);

  // Presumably populates the set from the given reference files - definition
  // not visible here.
  void Load(const std::vector<std::string>& files, Vocab& vocab);

  // Returns a size_t for the given n-gram and sentence. Presumably `clip`
  // selects between the clipped and unclipped counts stored in NgramMap.
  size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const;

  // Returns lengths_[sentenceId]. The index is not bounds-checked, so the
  // caller must pass a valid sentence id.
  size_t Length(size_t sentenceId) const {return lengths_[sentenceId];}

private:
  // Maps each n-gram to a pair of (clipped, unclipped) counts.
  typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap;
  // One NgramMap per entry; presumably indexed by sentence id like lengths_.
  std::vector<NgramMap> ngramCounts_;
  // Indexed by sentence id (see Length()).
  std::vector<size_t> lengths_;

};

/**
  * State record kept by HgBleuScorer, which holds a vector of these sized to
  * the number of vertices in the graph.
**/
struct VertexState {
  // Defined out of line; initial field values are not visible in this header.
  VertexState();

  // BLEU statistics associated with this state (layout not visible here).
  std::vector<FeatureStatsType> bleuStats;
  // Presumably the boundary words on the left edge of the vertex's target
  // string, kept for matching n-grams across edges - TODO confirm.
  WordVec leftContext;
  // Presumably the boundary words on the right edge - TODO confirm.
  WordVec rightContext;
  // Presumably the target-side length for this vertex - TODO confirm.
  size_t targetLength;
};

/**
  * Scores a rule (i.e. an edge) at the moment it is applied.
  *
  * Holds one VertexState per vertex of the graph, together with the background
  * BLEU statistics supplied at construction time.
**/
class HgBleuScorer {
public:
  /**
    * @param references     reference set; stored by reference, so it must
    *                       outlive this scorer.
    * @param graph          hypergraph; stored by reference, so it must outlive
    *                       this scorer. Its last vertex (index VertexSize()-1)
    *                       supplies the total source length via SourceCovered().
    * @param sentenceId     id of the sentence being scored.
    * @param backgroundBleu background BLEU statistics; copied. Must have more
    *                       than kBleuNgramOrder*2 elements, because the element
    *                       at that index is read as the background reference
    *                       length.
  **/
  HgBleuScorer(const ReferenceSet& references,
               const Graph& graph,
               size_t sentenceId,
               const std::vector<FeatureStatsType>& backgroundBleu)
    : references_(references),
      vertexStates_(graph.VertexSize()),
      sentenceId_(sentenceId),
      totalSourceLength_(graph.GetVertex(graph.VertexSize()-1).SourceCovered()),
      graph_(graph),
      backgroundBleu_(backgroundBleu),
      backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2])
  {}

  // Returns a score for `edge` with head vertex `head`. `bleuStats` is a
  // non-const reference, presumably filled with the statistics of the scored
  // edge - definition not visible here.
  FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats);

  // Presumably records `bleuStats` and context for vertex `vertexId` once
  // `winnerEdge` has been chosen for it - definition not visible here.
  void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);


private:
  const ReferenceSet& references_;
  // Sized to graph.VertexSize() by the constructor.
  std::vector<VertexState> vertexStates_;
  size_t sentenceId_;
  // SourceCovered() of the graph's last vertex.
  size_t totalSourceLength_;
  const Graph& graph_;
  // Copy of the constructor's backgroundBleu argument.
  std::vector<FeatureStatsType> backgroundBleu_;
  // backgroundBleu[kBleuNgramOrder*2], captured at construction.
  FeatureStatsType backgroundRefLength_;

  // Helpers defined out of line; behaviour not visible in this header.
  void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
  size_t GetTargetLength(const Edge& edge) const;
};

/**
  * A single hypothesis: its sparse feature vector, its words, and its BLEU
  * statistics. Viterbi() takes a pointer to one of these as its last argument.
**/
struct HgHypothesis {
  // Sparse feature values of the hypothesis.
  SparseVector featureVector;
  // Word sequence of the hypothesis.
  WordVec text;
  // BLEU statistics of the hypothesis (layout not visible here).
  std::vector<FeatureStatsType> bleuStats;
};

/**
  * Declared here, defined elsewhere. Judging by the name and signature, this
  * presumably runs a Viterbi search over `graph` under `weights`, with a BLEU
  * term weighted by `bleuWeight` and computed against `references` for
  * sentence `sentenceId` and `backgroundBleu`, and writes the best hypothesis
  * to `*bestHypo` - confirm against the definition.
  * `bestHypo` is the only non-const argument.
**/
void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo);

};

#endif