github.com/moses-smt/mosesdecoder.git
author     Barry Haddow <barry.haddow@gmail.com>  2014-07-21 14:04:43 +0400
committer  Barry Haddow <barry.haddow@gmail.com>  2014-07-21 14:04:43 +0400
commit     efee2695c31e1086af783c1b092fc842fb7bb1a4 (patch)
tree       e8324ea35cc92f0737c93f6b8fa8e23898e4d78a /mert/ForestRescore.h
parent     c83c5a3ee6f3ef7480e7a782d2023af9e99c1711 (diff)
Merge 08811deb17337356cd8dae9c59c0160590679a35 from joshua
Diffstat (limited to 'mert/ForestRescore.h')
-rw-r--r--  mert/ForestRescore.h  120
1 file changed, 120 insertions, 0 deletions
diff --git a/mert/ForestRescore.h b/mert/ForestRescore.h
new file mode 100644
index 000000000..900275b74
--- /dev/null
+++ b/mert/ForestRescore.h
@@ -0,0 +1,120 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+#ifndef MERT_FOREST_RESCORE_H
+#define MERT_FOREST_RESCORE_H
+
+#include <cstring>
+#include <iosfwd>
+#include <valarray>
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+
+#include "BleuScorer.h"
+#include "Hypergraph.h"
+
+namespace MosesTuning {
+
+std::ostream& operator<<(std::ostream& out, const WordVec& wordVec);
+
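+// Hash and equality functors so that a WordVec (an ngram) can be used as the key
+// of a boost::unordered_map; both operate on the raw bytes of the vector's elements.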
+struct NgramHash : public std::unary_function<const WordVec&, std::size_t> {
+ std::size_t operator()(const WordVec& ngram) const {
+ return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type));
+ }
+};
+
+struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> {
+ bool operator()(const WordVec& first, const WordVec& second) const {
+ if (first.size() != second.size()) return false;
+ return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0;
+ }
+};
+
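+// Maps each ngram (WordVec) to the number of times it has been seen.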
+typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter;
+
+
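+/**
+ * Holds the reference translations for each sentence as ngram counts plus
+ * reference lengths, for computing BLEU statistics against hypotheses.
+**/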
+class ReferenceSet {
+
+public:
+
+ void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab);
+
+ void Load(const std::vector<std::string>& files, Vocab& vocab);
+
+ size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const;
+
+ size_t Length(size_t sentenceId) const {return lengths_[sentenceId];}
+
+private:
+ //ngrams to (clipped,unclipped) counts
+ typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap;
+ std::vector<NgramMap> ngramCounts_;
+ std::vector<size_t> lengths_;
+
+};
+
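+/**
+ * BLEU-related state cached at each hypergraph vertex: BLEU sufficient statistics,
+ * left and right target-side context words, and the target length so far.
+**/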
+struct VertexState {
+ VertexState();
+
+ std::vector<FeatureStatsType> bleuStats;
+ WordVec leftContext;
+ WordVec rightContext;
+ size_t targetLength;
+};
+
+/**
+ * Used to score a rule (i.e. an edge) when we are applying it.
+**/
+class HgBleuScorer {
+ public:
+ HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu):
+ references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu),
+ backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) {
+ vertexStates_.resize(graph.VertexSize());
+ totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered();
+ }
+
+ FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats);
+
+ void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);
+
+ private:
+ const ReferenceSet& references_;
+ std::vector<VertexState> vertexStates_;
+ size_t sentenceId_;
+ size_t totalSourceLength_;
+ const Graph& graph_;
+ std::vector<FeatureStatsType> backgroundBleu_;
+ FeatureStatsType backgroundRefLength_;
+
+ void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
+ size_t GetTargetLength(const Edge& edge) const;
+};
+
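+/**
+ * A complete hypothesis read off the hypergraph: its feature vector, target-side
+ * text and BLEU sufficient statistics.
+**/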
+struct HgHypothesis {
+ SparseVector featureVector;
+ WordVec text;
+ std::vector<FeatureStatsType> bleuStats;
+};
+
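+/**
+ * Viterbi search over the hypergraph, combining the model score (weights) with
+ * bleuWeight times an approximate BLEU computed from the references and the
+ * background BLEU statistics; the best hypothesis is written into *bestHypo.
+**/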
+void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo);
+
+} // namespace MosesTuning
+
+#endif
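
For orientation, a minimal usage sketch of the interface added in this commit (not part of the patch). It assumes the Graph, Vocab, SparseVector, FeatureStatsType types and the kBleuNgramOrder constant come in via Hypergraph.h/BleuScorer.h, and that the hypergraph, model weights and references have been populated elsewhere; the helper name RescoreOneSentence is hypothetical.

#include <vector>

#include "ForestRescore.h"

using namespace MosesTuning;

// Hypothetical helper, for illustration only: find the BLEU-interpolated best
// hypothesis of one sentence's hypergraph. The ReferenceSet would have been
// filled beforehand via ReferenceSet::Load(files, vocab).
HgHypothesis RescoreOneSentence(const Graph& graph,
                                const SparseVector& weights,
                                const ReferenceSet& references,
                                size_t sentenceId) {
  // One (matched, total) count pair per ngram order, plus the reference length,
  // matching the backgroundBleu[kBleuNgramOrder*2] access in HgBleuScorer. A real
  // caller would typically carry these statistics over from previous sentences.
  std::vector<FeatureStatsType> backgroundBleu(kBleuNgramOrder * 2 + 1, 0);
  HgHypothesis bestHypo;
  Viterbi(graph, weights, /*bleuWeight=*/1.0f, references, sentenceId,
          backgroundBleu, &bestHypo);
  return bestHypo;
}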