diff options
-rw-r--r-- | mert/BleuScorer.cpp | 4 | ||||
-rw-r--r-- | mert/BleuScorer.h | 2 | ||||
-rw-r--r-- | mert/FeatureStats.cpp | 30 | ||||
-rw-r--r-- | mert/FeatureStats.h | 7 | ||||
-rw-r--r-- | mert/ForestRescore.cpp | 422 | ||||
-rw-r--r-- | mert/ForestRescore.h | 120 | ||||
-rw-r--r-- | mert/ForestRescoreTest.cpp | 246 | ||||
-rw-r--r-- | mert/HopeFearDecoder.cpp | 328 | ||||
-rw-r--r-- | mert/HopeFearDecoder.h | 151 | ||||
-rw-r--r-- | mert/Hypergraph.cpp | 286 | ||||
-rw-r--r-- | mert/Hypergraph.h | 251 | ||||
-rw-r--r-- | mert/HypergraphTest.cpp | 151 | ||||
-rw-r--r-- | mert/Jamfile | 7 | ||||
-rw-r--r-- | mert/MiraFeatureVector.cpp | 40 | ||||
-rw-r--r-- | mert/MiraFeatureVector.h | 8 | ||||
-rw-r--r-- | mert/MiraWeightVector.cpp | 17 | ||||
-rw-r--r-- | mert/MiraWeightVector.h | 8 | ||||
-rw-r--r-- | mert/kbmira.cpp | 187 |
18 files changed, 2154 insertions, 111 deletions
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp index 467855d9b..f6ada2aa8 100644 --- a/mert/BleuScorer.cpp +++ b/mert/BleuScorer.cpp @@ -266,12 +266,12 @@ float smoothedSentenceBleu float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg) { // Sum sent and background - std::vector<float> stats; UTIL_THROW_IF(sent.size()!=bg.size(), util::Exception, "Error"); UTIL_THROW_IF(sent.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error"); + std::vector<float> stats(sent.size()); for(size_t i=0; i<sent.size(); i++) - stats.push_back(sent[i]+bg[i]); + stats[i] = sent[i]+bg[i]; // Calculate BLEU float logbleu = 0.0; diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 8be567574..affa37fbf 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -13,7 +13,7 @@ namespace MosesTuning { -const int kBleuNgramOrder = 4; +const size_t kBleuNgramOrder = 4; class NgramCounts; class Reference; diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index 5a12be70a..a0c6a6ebc 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -14,6 +14,8 @@ #include <boost/functional/hash.hpp> +#include "util/murmur_hash.hh" + #include "Util.h" using namespace std; @@ -59,6 +61,11 @@ void SparseVector::set(const string& name, FeatureStatsType value) m_fvector[id] = value; } +void SparseVector::set(size_t id, FeatureStatsType value) { + assert(m_id_to_name.size() > id); + m_fvector[id] = value; +} + void SparseVector::write(ostream& out, const string& sep) const { for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) { @@ -91,6 +98,16 @@ void SparseVector::load(const string& file) } } +SparseVector& SparseVector::operator+=(const SparseVector& rhs) +{ + + for (fvector_t::const_iterator i = rhs.m_fvector.begin(); + i != rhs.m_fvector.end(); ++i) { + m_fvector[i->first] = get(i->first) + (i->second); + } + return *this; +} + SparseVector& SparseVector::operator-=(const SparseVector& rhs) { @@ -162,12 +179,18 @@ bool operator==(SparseVector const& item1, SparseVector const& item2) return item1.m_fvector==item2.m_fvector; } + std::size_t hash_value(SparseVector const& item) { - boost::hash<SparseVector::fvector_t> hasher; - return hasher(item.m_fvector); + size_t seed = 0; + for (SparseVector::fvector_t::const_iterator i = item.m_fvector.begin(); i != item.m_fvector.end(); ++i) { + seed = util::MurmurHashNative(&(i->first), sizeof(i->first), seed); + seed = util::MurmurHashNative(&(i->second), sizeof(i->second), seed); + } + return seed; } + FeatureStats::FeatureStats() : m_available_size(kAvailableSize), m_entries(0), m_array(new FeatureStatsType[m_available_size]) {} @@ -181,8 +204,7 @@ FeatureStats::FeatureStats(const size_t size) FeatureStats::~FeatureStats() { - delete [] m_array; - m_array = NULL; + delete [] m_array; } void FeatureStats::Copy(const FeatureStats &stats) diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h index a882e7358..2ccbac50c 100644 --- a/mert/FeatureStats.h +++ b/mert/FeatureStats.h @@ -14,6 +14,11 @@ #include <map> #include <string> #include <vector> + +#include <boost/unordered_map.hpp> + +#include <util/string_piece.hh> + #include "Types.h" namespace MosesTuning @@ -31,6 +36,7 @@ public: FeatureStatsType get(const std::string& name) const; FeatureStatsType get(std::size_t id) const; void set(const std::string& name, FeatureStatsType value); + void set(size_t id, FeatureStatsType value); void clear(); void load(const std::string& file); std::size_t size() const { @@ -40,6 +46,7 @@ public: void write(std::ostream& out, const std::string& sep = " ") const; SparseVector& operator-=(const SparseVector& rhs); + SparseVector& operator+=(const SparseVector& rhs); FeatureStatsType inner_product(const SparseVector& rhs) const; // Added by cherryc diff --git a/mert/ForestRescore.cpp b/mert/ForestRescore.cpp new file mode 100644 index 000000000..c88b58e4c --- /dev/null +++ b/mert/ForestRescore.cpp @@ -0,0 +1,422 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include <cmath> +#include <limits> +#include <list> + +#include <boost/unordered_set.hpp> + +#include "util/file_piece.hh" +#include "util/tokenize_piece.hh" + +#include "BleuScorer.h" +#include "ForestRescore.h" + +using namespace std; + +namespace MosesTuning { + +std::ostream& operator<<(std::ostream& out, const WordVec& wordVec) { + out << "["; + for (size_t i = 0; i < wordVec.size(); ++i) { + out << wordVec[i]->first; + if (i+1< wordVec.size()) out << " "; + } + out << "]"; + return out; +} + + +void ReferenceSet::Load(const vector<string>& files, Vocab& vocab) { + for (size_t i = 0; i < files.size(); ++i) { + util::FilePiece fh(files[i].c_str()); + size_t sentenceId = 0; + while(true) { + StringPiece line; + try { + line = fh.ReadLine(); + } catch (util::EndOfFileException &e) { + break; + } + AddLine(sentenceId, line, vocab); + ++sentenceId; + } + } + +} + +void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab) { + //cerr << line << endl; + NgramCounter ngramCounts; + list<WordVec> openNgrams; + size_t length = 0; + //tokenize & count + for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) { + const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j)); + ++length; + openNgrams.push_front(WordVec()); + for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) { + k->push_back(nextTok); + ++ngramCounts[*k]; + } + if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back(); + } + + //merge into overall ngram map + for (NgramCounter::const_iterator ni = ngramCounts.begin(); + ni != ngramCounts.end(); ++ni) { + size_t count = ni->second; + //cerr << *ni << " " << count << endl; + if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1); + NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first); + if (totalsIter == ngramCounts_[sentenceId].end()) { + ngramCounts_[sentenceId][ni->first] = pair<size_t,size_t>(count,count); + } else { + ngramCounts_[sentenceId][ni->first].first = max(count, ngramCounts_[sentenceId][ni->first].first); //clip + ngramCounts_[sentenceId][ni->first].second += count; //no clip + } + } + //length + if (lengths_.size() <= sentenceId) lengths_.resize(sentenceId+1); + //TODO - length strategy - this is MIN + if (!lengths_[sentenceId]) { + lengths_[sentenceId] = length; + } else { + lengths_[sentenceId] = min(length,lengths_[sentenceId]); + } + //cerr << endl; + +} + +size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const { + const NgramMap& ngramCounts = ngramCounts_.at(sentenceId); + NgramMap::const_iterator ngi = ngramCounts.find(ngram); + if (ngi == ngramCounts.end()) return 0; + return clip ? ngi->second.first : ngi->second.second; +} + +VertexState::VertexState(): bleuStats(kBleuNgramOrder), targetLength(0) {} + +void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats ) const { + for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) { + //cerr << "Checking: " << *ngi << " matches " << references_.NgramMatches(sentenceId_,*ngi,false) << endl; + size_t order = ngi->first.size(); + size_t count = ngi->second; + bleuStats[(order-1)*2 + 1] += count; + bleuStats[(order-1) * 2] += min(count, references_.NgramMatches(sentenceId_,ngi->first,false)); + } +} + +size_t HgBleuScorer::GetTargetLength(const Edge& edge) const { + size_t targetLength = 0; + for (size_t i = 0; i < edge.Words().size(); ++i) { + const Vocab::Entry* word = edge.Words()[i]; + if (word) ++targetLength; + } + for (size_t i = 0; i < edge.Children().size(); ++i) { + const VertexState& state = vertexStates_[edge.Children()[i]]; + targetLength += state.targetLength; + } + return targetLength; +} + +FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats) { + NgramCounter ngramCounts; + size_t childId = 0; + size_t wordId = 0; + size_t contextId = 0; //position within left or right context + const VertexState* vertexState = NULL; + bool inLeftContext = false; + bool inRightContext = false; + list<WordVec> openNgrams; + const Vocab::Entry* currentWord = NULL; + while (wordId < edge.Words().size()) { + currentWord = edge.Words()[wordId]; + if (currentWord != NULL) { + ++wordId; + } else { + if (!inLeftContext && !inRightContext) { + //entering a vertex + assert(!vertexState); + vertexState = &(vertexStates_[edge.Children()[childId]]); + ++childId; + if (vertexState->leftContext.size()) { + inLeftContext = true; + contextId = 0; + currentWord = vertexState->leftContext[contextId]; + } else { + //empty context + vertexState = NULL; + ++wordId; + continue; + } + } else { + //already in a vertex + ++contextId; + if (inLeftContext && contextId < vertexState->leftContext.size()) { + //still in left context + currentWord = vertexState->leftContext[contextId]; + } else if (inLeftContext) { + //at end of left context + if (vertexState->leftContext.size() == kBleuNgramOrder-1) { + //full size context, jump to right state + openNgrams.clear(); + inLeftContext = false; + inRightContext = true; + contextId = 0; + currentWord = vertexState->rightContext[contextId]; + } else { + //short context, just ignore right context + inLeftContext = false; + vertexState = NULL; + ++wordId; + continue; + } + } else { + //in right context + if (contextId < vertexState->rightContext.size()) { + currentWord = vertexState->rightContext[contextId]; + } else { + //leaving vertex + inRightContext = false; + vertexState = NULL; + ++wordId; + continue; + } + } + } + } + assert(currentWord); + if (graph_.IsBoundary(currentWord)) continue; + openNgrams.push_front(WordVec()); + openNgrams.front().reserve(kBleuNgramOrder); + for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) { + k->push_back(currentWord); + //Only insert ngrams that cross boundaries + if (!vertexState || (inLeftContext && k->size() > contextId+1)) ++ngramCounts[*k]; + } + if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back(); + } + + //Collect matches + //This edge + //cerr << "edge ngrams" << endl; + UpdateMatches(ngramCounts, bleuStats); + + //Child vertexes + for (size_t i = 0; i < edge.Children().size(); ++i) { + //cerr << "vertex ngrams " << edge.Children()[i] << endl; + for (size_t j = 0; j < bleuStats.size(); ++j) { + bleuStats[j] += vertexStates_[edge.Children()[i]].bleuStats[j]; + } + } + + + FeatureStatsType sourceLength = head.SourceCovered(); + size_t referenceLength = references_.Length(sentenceId_); + FeatureStatsType effectiveReferenceLength = + sourceLength / totalSourceLength_ * referenceLength; + + bleuStats[bleuStats.size()-1] = effectiveReferenceLength; + //backgroundBleu_[backgroundBleu_.size()-1] = + // backgroundRefLength_ * sourceLength / totalSourceLength_; + FeatureStatsType bleu = sentenceLevelBackgroundBleu(bleuStats, backgroundBleu_); + + return bleu; +} + +void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats) { + //TODO: Maybe more efficient to absorb into the Score() method + VertexState& vertexState = vertexStates_[vertexId]; + //cerr << "Updating state for " << vertexId << endl; + + //leftContext + int wi = 0; + const VertexState* childState = NULL; + int contexti = 0; //index within child context + int childi = 0; + while (vertexState.leftContext.size() < (kBleuNgramOrder-1)) { + if ((size_t)wi >= winnerEdge.Words().size()) break; + const Vocab::Entry* word = winnerEdge.Words()[wi]; + if (word != NULL) { + vertexState.leftContext.push_back(word); + ++wi; + } else { + if (childState == NULL) { + //start of child state + childState = &(vertexStates_[winnerEdge.Children()[childi++]]); + contexti = 0; + } + if ((size_t)contexti < childState->leftContext.size()) { + vertexState.leftContext.push_back(childState->leftContext[contexti++]); + } else { + //end of child context + childState = NULL; + ++wi; + } + } + } + + //rightContext + wi = winnerEdge.Words().size() - 1; + childState = NULL; + childi = winnerEdge.Children().size() - 1; + while (vertexState.rightContext.size() < (kBleuNgramOrder-1)) { + if (wi < 0) break; + const Vocab::Entry* word = winnerEdge.Words()[wi]; + if (word != NULL) { + vertexState.rightContext.push_back(word); + --wi; + } else { + if (childState == NULL) { + //start (ie rhs) of child state + childState = &(vertexStates_[winnerEdge.Children()[childi--]]); + contexti = childState->rightContext.size()-1; + } + if (contexti >= 0) { + vertexState.rightContext.push_back(childState->rightContext[contexti--]); + } else { + //end (ie lhs) of child context + childState = NULL; + --wi; + } + } + } + reverse(vertexState.rightContext.begin(), vertexState.rightContext.end()); + + //length + counts + vertexState.targetLength = GetTargetLength(winnerEdge); + vertexState.bleuStats = bleuStats; +} + + +typedef pair<const Edge*,FeatureStatsType> BackPointer; + + +/** + * Recurse through back pointers + **/ +static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<BackPointer>& bps, + HgHypothesis* bestHypo) { + //cerr << "Expanding " << vertexId << endl; + //UTIL_THROW_IF(bps[vertexId].second == kMinScore+1, HypergraphException, "Landed at vertex " << vertexId << " which is a dead end"); + if (!bps[vertexId].first) return; + const Edge* prevEdge = bps[vertexId].first; + bestHypo->featureVector += *(prevEdge->Features().get()); + size_t childId = 0; + for (size_t i = 0; i < prevEdge->Words().size(); ++i) { + if (prevEdge->Words()[i] != NULL) { + bestHypo->text.push_back(prevEdge->Words()[i]); + } else { + size_t childVertexId = prevEdge->Children()[childId++]; + HgHypothesis childHypo; + GetBestHypothesis(childVertexId,graph,bps,&childHypo); + bestHypo->text.insert(bestHypo->text.end(), childHypo.text.begin(), childHypo.text.end()); + bestHypo->featureVector += childHypo.featureVector; + } + } +} + +void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references , size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo) +{ + BackPointer init(NULL,kMinScore); + vector<BackPointer> backPointers(graph.VertexSize(),init); + HgBleuScorer bleuScorer(references, graph, sentenceId, backgroundBleu); + vector<FeatureStatsType> winnerStats(kBleuNgramOrder*2+1); + for (size_t vi = 0; vi < graph.VertexSize(); ++vi) { + //cerr << "vertex id " << vi << endl; + FeatureStatsType winnerScore = kMinScore; + const Vertex& vertex = graph.GetVertex(vi); + const vector<const Edge*>& incoming = vertex.GetIncoming(); + if (!incoming.size()) { + //UTIL_THROW(HypergraphException, "Vertex " << vi << " has no incoming edges"); + //If no incoming edges, vertex is a dead end + backPointers[vi].first = NULL; + backPointers[vi].second = kMinScore/2; + } else { + //cerr << "\nVertex: " << vi << endl; + for (size_t ei = 0; ei < incoming.size(); ++ei) { + //cerr << "edge id " << ei << endl; + FeatureStatsType incomingScore = incoming[ei]->GetScore(weights); + for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) { + size_t childId = incoming[ei]->Children()[i]; + UTIL_THROW_IF(backPointers[childId].second == kMinScore, + HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); + incomingScore += backPointers[childId].second; + } + vector<FeatureStatsType> bleuStats(kBleuNgramOrder*2+1); + // cerr << "Score: " << incomingScore << " Bleu: "; + // if (incomingScore > nonbleuscore) {nonbleuscore = incomingScore; nonbleuid = ei;} + FeatureStatsType totalScore = incomingScore; + if (bleuWeight) { + FeatureStatsType bleuScore = bleuScorer.Score(*(incoming[ei]), vertex, bleuStats); + UTIL_THROW_IF(isnan(bleuScore), util::Exception, "Bleu score undefined, smoothing problem?"); + totalScore += bleuWeight * bleuScore; + // cerr << bleuScore << " Total: " << incomingScore << endl << endl; + //cerr << "is " << incomingScore << " bs " << bleuScore << endl; + } + if (totalScore >= winnerScore) { + //We only store the feature score (not the bleu score) with the vertex, + //since the bleu score is always cumulative, ie from counts for the whole span. + winnerScore = totalScore; + backPointers[vi].first = incoming[ei]; + backPointers[vi].second = incomingScore; + winnerStats = bleuStats; + } + } + //update with winner + //if (bleuWeight) { + //TODO: Not sure if we need this when computing max-model solution + bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats); + + } + } + + //expand back pointers + GetBestHypothesis(graph.VertexSize()-1, graph, backPointers, bestHypo); + + //bleu stats and fv + + //Need the actual (clipped) stats + //TODO: This repeats code in bleu scorer - factor out + bestHypo->bleuStats.resize(kBleuNgramOrder*2+1); + NgramCounter counts; + list<WordVec> openNgrams; + for (size_t i = 0; i < bestHypo->text.size(); ++i) { + const Vocab::Entry* entry = bestHypo->text[i]; + if (graph.IsBoundary(entry)) continue; + openNgrams.push_front(WordVec()); + for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) { + k->push_back(entry); + ++counts[*k]; + } + if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back(); + } + for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) { + size_t order = ngi->first.size(); + size_t count = ngi->second; + bestHypo->bleuStats[(order-1)*2 + 1] += count; + bestHypo->bleuStats[(order-1) * 2] += min(count, references.NgramMatches(sentenceId,ngi->first,true)); + } + bestHypo->bleuStats[kBleuNgramOrder*2] = references.Length(sentenceId); +} + + +}; diff --git a/mert/ForestRescore.h b/mert/ForestRescore.h new file mode 100644 index 000000000..900275b74 --- /dev/null +++ b/mert/ForestRescore.h @@ -0,0 +1,120 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ +#ifndef MERT_FOREST_RESCORE_H +#define MERT_FOREST_RESCORE_H + +#include <valarray> +#include <vector> + +#include <boost/unordered_set.hpp> + +#include "BleuScorer.h" +#include "Hypergraph.h" + +namespace MosesTuning { + +std::ostream& operator<<(std::ostream& out, const WordVec& wordVec); + +struct NgramHash : public std::unary_function<const WordVec&, std::size_t> { + std::size_t operator()(const WordVec& ngram) const { + return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type)); + } +}; + +struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> { + bool operator()(const WordVec& first, const WordVec& second) const { + if (first.size() != second.size()) return false; + return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0; + } +}; + +typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter; + + +class ReferenceSet { + + +public: + + void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab); + + void Load(const std::vector<std::string>& files, Vocab& vocab); + + size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const; + + size_t Length(size_t sentenceId) const {return lengths_[sentenceId];} + +private: + //ngrams to (clipped,unclipped) counts + typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap; + std::vector<NgramMap> ngramCounts_; + std::vector<size_t> lengths_; + +}; + +struct VertexState { + VertexState(); + + std::vector<FeatureStatsType> bleuStats; + WordVec leftContext; + WordVec rightContext; + size_t targetLength; +}; + +/** + * Used to score an rule (ie edge) when we are applying it. +**/ +class HgBleuScorer { + public: + HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu): + references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu), + backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) { + vertexStates_.resize(graph.VertexSize()); + totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered(); + } + + FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ; + + void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats); + + + private: + const ReferenceSet& references_; + std::vector<VertexState> vertexStates_; + size_t sentenceId_; + size_t totalSourceLength_; + const Graph& graph_; + std::vector<FeatureStatsType> backgroundBleu_; + FeatureStatsType backgroundRefLength_; + + void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const; + size_t GetTargetLength(const Edge& edge) const; +}; + +struct HgHypothesis { + SparseVector featureVector; + WordVec text; + std::vector<FeatureStatsType> bleuStats; +}; + +void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo); + +}; + +#endif diff --git a/mert/ForestRescoreTest.cpp b/mert/ForestRescoreTest.cpp new file mode 100644 index 000000000..86975d3a5 --- /dev/null +++ b/mert/ForestRescoreTest.cpp @@ -0,0 +1,246 @@ +#include <iostream> + +#include "ForestRescore.h" + +#define BOOST_TEST_MODULE MertForestRescore +#include <boost/test/unit_test.hpp> + + + +using namespace std; +using namespace MosesTuning; + +BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) +{ + Vocab vocab; + WordVec words; + string wordStrings[] = + {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g"}; + for (size_t i = 0; i < 9; ++i) { + words.push_back(&(vocab.FindOrAdd((wordStrings[i])))); + } + + const string f1 = "foo"; + const string f2 = "bar"; + Graph graph(vocab); + graph.SetCounts(5,5); + + Edge* e0 = graph.NewEdge(); + e0->AddWord(words[0]); + e0->AddFeature(f1, 2.0); + + Vertex* v0 = graph.NewVertex(); + v0->AddEdge(e0); + + Edge* e1 = graph.NewEdge(); + e1->AddWord(NULL); + e1->AddChild(0); + e1->AddWord(words[2]); + e1->AddWord(words[3]); + e1->AddFeature(f1, 1.0); + e1->AddFeature(f2, 3.0); + + Vertex* v1 = graph.NewVertex(); + v1->AddEdge(e1); + + Edge* e2 = graph.NewEdge(); + e2->AddWord(NULL); + e2->AddChild(1); + e2->AddWord(words[4]); + e2->AddWord(words[5]); + e2->AddFeature(f2, 2.5); + + Vertex* v2 = graph.NewVertex(); + v2->AddEdge(e2); + + Edge* e3 = graph.NewEdge(); + e3->AddWord(NULL); + e3->AddChild(2); + e3->AddWord(words[6]); + e3->AddWord(words[7]); + e3->AddWord(words[8]); + e3->AddFeature(f1, -1); + + Vertex* v3 = graph.NewVertex(); + v3->AddEdge(e3); + + Edge* e4 = graph.NewEdge(); + e4->AddWord(NULL); + e4->AddChild(3); + e4->AddWord(words[1]); + e3->AddFeature(f2, 0.5); + + Vertex* v4 = graph.NewVertex(); + v4->AddEdge(e4); + + ReferenceSet references; + references.AddLine(0, "a b c k e f o", vocab); + HgHypothesis modelHypo; + vector<FeatureStatsType> bg(kBleuNgramOrder*2+1); + SparseVector weights; + weights.set(f1,2); + weights.set(f2,1); + Viterbi(graph, weights, 0, references, 0, bg, &modelHypo); + BOOST_CHECK_CLOSE(2.0,modelHypo.featureVector.get(f1), 0.0001); + BOOST_CHECK_CLOSE(6.0,modelHypo.featureVector.get(f2), 0.0001); + + BOOST_CHECK_EQUAL(words[0]->first, modelHypo.text[0]->first); + BOOST_CHECK_EQUAL(words[2]->first, modelHypo.text[1]->first); + BOOST_CHECK_EQUAL(words[3]->first, modelHypo.text[2]->first); + BOOST_CHECK_EQUAL(words[4]->first, modelHypo.text[3]->first); + BOOST_CHECK_EQUAL(words[5]->first, modelHypo.text[4]->first); + BOOST_CHECK_EQUAL(words[6]->first, modelHypo.text[5]->first); + BOOST_CHECK_EQUAL(words[7]->first, modelHypo.text[6]->first); + BOOST_CHECK_EQUAL(words[8]->first, modelHypo.text[7]->first); + BOOST_CHECK_EQUAL(words[1]->first, modelHypo.text[8]->first); +} + + + +BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice) +{ + Vocab vocab; + WordVec words; + string wordStrings[] = + {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; + for (size_t i = 0; i < 13; ++i) { + words.push_back(&(vocab.FindOrAdd((wordStrings[i])))); + } + + const string f1 = "foo"; + const string f2 = "bar"; + Graph graph(vocab); + graph.SetCounts(5,8); + + Edge* e0 = graph.NewEdge(); + e0->AddWord(words[0]); + + Vertex* v0 = graph.NewVertex(); + v0->AddEdge(e0); + + Edge* e1 = graph.NewEdge(); + e1->AddWord(NULL); + e1->AddChild(0); + e1->AddWord(words[2]); + e1->AddWord(words[3]); + e1->AddFeature(f1,1); + e1->AddFeature(f2,1); + Edge* e5 = graph.NewEdge(); + e5->AddWord(NULL); + e5->AddChild(0); + e5->AddWord(words[9]); + e5->AddWord(words[10]); + e5->AddFeature(f1,2); + e5->AddFeature(f2,-2); + + Vertex* v1 = graph.NewVertex(); + v1->AddEdge(e1); + v1->AddEdge(e5); + v1->SetSourceCovered(1); + + Edge* e2 = graph.NewEdge(); + e2->AddWord(NULL); + e2->AddChild(1); + e2->AddWord(words[4]); + e2->AddWord(words[5]); + e2->AddFeature(f2,3); + + Vertex* v2 = graph.NewVertex(); + v2->AddEdge(e2); + v2->SetSourceCovered(3); + + Edge* e3 = graph.NewEdge(); + e3->AddWord(NULL); + e3->AddChild(2); + e3->AddWord(words[6]); + e3->AddWord(words[7]); + e3->AddWord(words[8]); + e3->AddFeature(f1,1); + Edge* e6 = graph.NewEdge(); + e6->AddWord(NULL); + e6->AddChild(2); + e6->AddWord(words[9]); + e6->AddWord(words[12]); + e6->AddFeature(f2,1); + Edge* e7 = graph.NewEdge(); + e7->AddWord(NULL); + e7->AddChild(1); + e7->AddWord(words[11]); + e7->AddWord(words[12]); + e7->AddFeature(f1,2); + e7->AddFeature(f2,3); + + Vertex* v3 = graph.NewVertex(); + v3->AddEdge(e3); + v3->AddEdge(e6); + v3->AddEdge(e7); + v3->SetSourceCovered(5); + + Edge* e4 = graph.NewEdge(); + e4->AddWord(NULL); + e4->AddChild(3); + e4->AddWord(words[1]); + + Vertex* v4 = graph.NewVertex(); + v4->AddEdge(e4); + v4->SetSourceCovered(6); + + /*Paths || foo || bar || s(2,1) + ab cd hk || 1 || 5 || 7 + hi cd hk || 2 || 2 || 6 + ab jk || 3 || 4 || 10 + hi jk || 4 || 1 || 9 + ab cd efg || 2 || 4 || 8 + hi cd efg || 3 || 1 || 7 + */ + + ReferenceSet references; + references.AddLine(0, "a b c d h k", vocab); + HgHypothesis modelHypo; + vector<FeatureStatsType> bg(kBleuNgramOrder*2+1, 0.1); + SparseVector weights; + weights.set(f1,2); + weights.set(f2,1); + Viterbi(graph, weights, 0, references, 0, bg, &modelHypo); + BOOST_CHECK_CLOSE(3.0,modelHypo.featureVector.get(f1), 0.0001); + BOOST_CHECK_CLOSE(4.0,modelHypo.featureVector.get(f2), 0.0001); + + BOOST_CHECK_EQUAL(6, modelHypo.text.size()); + + //expect ab jk + BOOST_CHECK_EQUAL(words[0]->first, modelHypo.text[0]->first); + BOOST_CHECK_EQUAL(words[2]->first, modelHypo.text[1]->first); + BOOST_CHECK_EQUAL(words[3]->first, modelHypo.text[2]->first); + BOOST_CHECK_EQUAL(words[11]->first, modelHypo.text[3]->first); + BOOST_CHECK_EQUAL(words[12]->first, modelHypo.text[4]->first); + BOOST_CHECK_EQUAL(words[1]->first, modelHypo.text[5]->first); + + + HgHypothesis hopeHypo; + Viterbi(graph, weights, 1, references, 0, bg, &hopeHypo); + //expect abcdhk + BOOST_CHECK_EQUAL(8, hopeHypo.text.size()); + + BOOST_CHECK_EQUAL(words[0]->first, hopeHypo.text[0]->first); + BOOST_CHECK_EQUAL(words[2]->first, hopeHypo.text[1]->first); + BOOST_CHECK_EQUAL(words[3]->first, hopeHypo.text[2]->first); + BOOST_CHECK_EQUAL(words[4]->first, hopeHypo.text[3]->first); + BOOST_CHECK_EQUAL(words[5]->first, hopeHypo.text[4]->first); + BOOST_CHECK_EQUAL(words[9]->first, hopeHypo.text[5]->first); + BOOST_CHECK_EQUAL(words[12]->first, hopeHypo.text[6]->first); + BOOST_CHECK_EQUAL(words[1]->first, hopeHypo.text[7]->first); + + BOOST_CHECK_EQUAL(kBleuNgramOrder*2+1, hopeHypo.bleuStats.size()); + BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[0]); + BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[1]); + BOOST_CHECK_EQUAL(5, hopeHypo.bleuStats[2]); + BOOST_CHECK_EQUAL(5, hopeHypo.bleuStats[3]); + BOOST_CHECK_EQUAL(4, hopeHypo.bleuStats[4]); + BOOST_CHECK_EQUAL(4, hopeHypo.bleuStats[5]); + BOOST_CHECK_EQUAL(3, hopeHypo.bleuStats[6]); + BOOST_CHECK_EQUAL(3, hopeHypo.bleuStats[7]); + BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]); +} + + + diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp new file mode 100644 index 000000000..5f5c59966 --- /dev/null +++ b/mert/HopeFearDecoder.cpp @@ -0,0 +1,328 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include <cmath> +#include <iterator> + +#include <boost/filesystem.hpp> +#include <boost/lexical_cast.hpp> + +#include "util/exception.hh" +#include "util/file_piece.hh" + +#include "BleuScorer.h" +#include "HopeFearDecoder.h" + +using namespace std; +namespace fs = boost::filesystem; + +namespace MosesTuning { + +static const ValType BLEU_RATIO = 5; + +ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv) { + vector<ValType> stats(kBleuNgramOrder*2+1,0); + for(reset(); !finished(); next()) { + vector<ValType> sent; + MaxModel(wv,&sent); + for(size_t i=0; i<sent.size(); i++) { + stats[i]+=sent[i]; + } + } + return unsmoothedBleu(stats); +} + +NbestHopeFearDecoder::NbestHopeFearDecoder( + const vector<string>& featureFiles, + const vector<string>& scoreFiles, + bool streaming, + bool no_shuffle, + bool safe_hope + ) : safe_hope_(safe_hope) { + if (streaming) { + train_.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles)); + } else { + train_.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle)); + } +} + + +void NbestHopeFearDecoder::next() { + train_->next(); +} + +bool NbestHopeFearDecoder::finished() { + return train_->finished(); +} + +void NbestHopeFearDecoder::reset() { + train_->reset(); +} + +void NbestHopeFearDecoder::HopeFear( + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ) { + + + // Hope / fear decode + ValType hope_scale = 1.0; + size_t hope_index=0, fear_index=0, model_index=0; + ValType hope_score=0, fear_score=0, model_score=0; + for(size_t safe_loop=0; safe_loop<2; safe_loop++) { + ValType hope_bleu, hope_model; + for(size_t i=0; i< train_->cur_size(); i++) { + const MiraFeatureVector& vec=train_->featuresAt(i); + ValType score = wv.score(vec); + ValType bleu = sentenceLevelBackgroundBleu(train_->scoresAt(i),backgroundBleu); + // Hope + if(i==0 || (hope_scale*score + bleu) > hope_score) { + hope_score = hope_scale*score + bleu; + hope_index = i; + hope_bleu = bleu; + hope_model = score; + } + // Fear + if(i==0 || (score - bleu) > fear_score) { + fear_score = score - bleu; + fear_index = i; + } + // Model + if(i==0 || score > model_score) { + model_score = score; + model_index = i; + } + } + // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases + // where model score is having far more influence than BLEU + hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU + if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale) + hope_scale = abs(hope_bleu) / abs(hope_model); + else break; + } + hopeFear->modelFeatures = train_->featuresAt(model_index); + hopeFear->hopeFeatures = train_->featuresAt(hope_index); + hopeFear->fearFeatures = train_->featuresAt(fear_index); + + hopeFear->hopeStats = train_->scoresAt(hope_index); + hopeFear->hopeBleu = sentenceLevelBackgroundBleu(hopeFear->hopeStats, backgroundBleu); + const vector<float>& fear_stats = train_->scoresAt(fear_index); + hopeFear->fearBleu = sentenceLevelBackgroundBleu(fear_stats, backgroundBleu); + + hopeFear->modelStats = train_->scoresAt(model_index); + hopeFear->hopeFearEqual = (hope_index == fear_index); +} + +void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) { + // Find max model + size_t max_index=0; + ValType max_score=0; + for(size_t i=0; i<train_->cur_size(); i++) { + MiraFeatureVector vec(train_->featuresAt(i)); + ValType score = wv.score(vec); + if(i==0 || score > max_score) { + max_index = i; + max_score = score; + } + } + *stats = train_->scoresAt(max_index); +} + + + +HypergraphHopeFearDecoder::HypergraphHopeFearDecoder + ( + const string& hypergraphDir, + const vector<string>& referenceFiles, + size_t num_dense, + bool streaming, + bool no_shuffle, + bool safe_hope, + size_t hg_pruning, + const MiraWeightVector& wv + ) : + num_dense_(num_dense) { + + UTIL_THROW_IF(streaming, util::Exception, "Streaming not currently supported for hypergraphs"); + UTIL_THROW_IF(!fs::exists(hypergraphDir), HypergraphException, "Directory '" << hypergraphDir << "' does not exist"); + UTIL_THROW_IF(!referenceFiles.size(), util::Exception, "No reference files supplied"); + references_.Load(referenceFiles, vocab_); + + SparseVector weights; + wv.ToSparse(&weights); + + static const string kWeights = "weights"; + fs::directory_iterator dend; + size_t fileCount = 0; + cerr << "Reading hypergraphs" << endl; + for (fs::directory_iterator di(hypergraphDir); di != dend; ++di) { + if (di->path().filename() == kWeights) continue; + Graph graph(vocab_); + size_t id = boost::lexical_cast<size_t>(di->path().stem().string()); + util::FilePiece file(di->path().string().c_str()); + ReadGraph(file,graph); + + //cerr << "ref length " << references_.Length(id) << endl; + size_t edgeCount = hg_pruning * references_.Length(id); + boost::shared_ptr<Graph> prunedGraph; + prunedGraph.reset(new Graph(vocab_)); + graph.Prune(prunedGraph.get(), weights, edgeCount); + graphs_[id] = prunedGraph; + //cerr << "Pruning to v=" << graphs_[id]->VertexSize() << " e=" << graphs_[id]->EdgeSize() << endl; + ++fileCount; + if (fileCount % 10 == 0) cerr << "."; + if (fileCount % 400 == 0) cerr << " [count=" << fileCount << "]\n"; + } + cerr << endl << "Done" << endl; + + +} + +void HypergraphHopeFearDecoder::reset() { + graphIter_ = graphs_.begin(); +} + +void HypergraphHopeFearDecoder::next() { + ++graphIter_; +} + +bool HypergraphHopeFearDecoder::finished() { + return graphIter_ == graphs_.end(); +} + +void HypergraphHopeFearDecoder::HopeFear( + const vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ) { + size_t sentenceId = graphIter_->first; + SparseVector weights; + wv.ToSparse(&weights); + const Graph& graph = *(graphIter_->second); + + ValType hope_scale = 1.0; + HgHypothesis hopeHypo, fearHypo, modelHypo; + for(size_t safe_loop=0; safe_loop<2; safe_loop++) { + + //hope decode + Viterbi(graph, weights, 1, references_, sentenceId, backgroundBleu, &hopeHypo); + + //fear decode + Viterbi(graph, weights, -1, references_, sentenceId, backgroundBleu, &fearHypo); + + //Model decode + Viterbi(graph, weights, 0, references_, sentenceId, backgroundBleu, &modelHypo); + + + // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases + // where model score is having far more influence than BLEU + // hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU + // if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale) + // hope_scale = abs(hope_bleu) / abs(hope_model); + // else break; + //TODO: Don't currently get model and bleu so commented this out for now. + break; + } + //modelFeatures, hopeFeatures and fearFeatures + hopeFear->modelFeatures = MiraFeatureVector(modelHypo.featureVector, num_dense_); + hopeFear->hopeFeatures = MiraFeatureVector(hopeHypo.featureVector, num_dense_); + hopeFear->fearFeatures = MiraFeatureVector(fearHypo.featureVector, num_dense_); + + //Need to know which are to be mapped to dense features! + + //Only C++11 + //hopeFear->modelStats.assign(std::begin(modelHypo.bleuStats), std::end(modelHypo.bleuStats)); + vector<ValType> fearStats(kBleuNgramOrder*2+1); + hopeFear->hopeStats.reserve(kBleuNgramOrder*2+1); + hopeFear->modelStats.reserve(kBleuNgramOrder*2+1); + for (size_t i = 0; i < fearStats.size(); ++i) { + hopeFear->modelStats.push_back(modelHypo.bleuStats[i]); + hopeFear->hopeStats.push_back(hopeHypo.bleuStats[i]); + + fearStats[i] = fearHypo.bleuStats[i]; + } + /* + cerr << "hope" << endl;; + for (size_t i = 0; i < hopeHypo.text.size(); ++i) { + cerr << hopeHypo.text[i]->first << " "; + } + cerr << endl; + for (size_t i = 0; i < fearStats.size(); ++i) { + cerr << hopeHypo.bleuStats[i] << " "; + } + cerr << endl; + cerr << "fear"; + for (size_t i = 0; i < fearHypo.text.size(); ++i) { + cerr << fearHypo.text[i]->first << " "; + } + cerr << endl; + for (size_t i = 0; i < fearStats.size(); ++i) { + cerr << fearHypo.bleuStats[i] << " "; + } + cerr << endl; + cerr << "model"; + for (size_t i = 0; i < modelHypo.text.size(); ++i) { + cerr << modelHypo.text[i]->first << " "; + } + cerr << endl; + for (size_t i = 0; i < fearStats.size(); ++i) { + cerr << modelHypo.bleuStats[i] << " "; + } + cerr << endl; + */ + hopeFear->hopeBleu = sentenceLevelBackgroundBleu(hopeFear->hopeStats, backgroundBleu); + hopeFear->fearBleu = sentenceLevelBackgroundBleu(fearStats, backgroundBleu); + + //If fv and bleu stats are equal, then assume equal + hopeFear->hopeFearEqual = true; //(hopeFear->hopeBleu - hopeFear->fearBleu) >= 1e-8; + if (hopeFear->hopeFearEqual) { + for (size_t i = 0; i < fearStats.size(); ++i) { + if (fearStats[i] != hopeFear->hopeStats[i]) { + hopeFear->hopeFearEqual = false; + break; + } + } + } + hopeFear->hopeFearEqual = hopeFear->hopeFearEqual && (hopeFear->fearFeatures == hopeFear->hopeFeatures); +} + +void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats) { + assert(!finished()); + HgHypothesis bestHypo; + size_t sentenceId = graphIter_->first; + SparseVector weights; + wv.ToSparse(&weights); + vector<ValType> bg(kBleuNgramOrder*2+1); + Viterbi(*(graphIter_->second), weights, 0, references_, sentenceId, bg, &bestHypo); + stats->resize(bestHypo.bleuStats.size()); + /* + for (size_t i = 0; i < bestHypo.text.size(); ++i) { + cerr << bestHypo.text[i]->first << " "; + } + cerr << endl; + */ + for (size_t i = 0; i < bestHypo.bleuStats.size(); ++i) { + (*stats)[i] = bestHypo.bleuStats[i]; + } +} + + + +}; diff --git a/mert/HopeFearDecoder.h b/mert/HopeFearDecoder.h new file mode 100644 index 000000000..e8323fc76 --- /dev/null +++ b/mert/HopeFearDecoder.h @@ -0,0 +1,151 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ +#ifndef MERT_HOPEFEARDECODER_H +#define MERT_HOPEFEARDECODER_H + +#include <vector> + +#include <boost/scoped_ptr.hpp> +#include <boost/shared_ptr.hpp> + +#include "ForestRescore.h" +#include "Hypergraph.h" +#include "HypPackEnumerator.h" +#include "MiraFeatureVector.h" +#include "MiraWeightVector.h" + +// +// Used by batch mira to get the hope, fear and model hypothesis. This wraps +// the n-best list and lattice/hypergraph implementations +// + +namespace MosesTuning { + +/** To be filled in by the decoder */ +struct HopeFearData { + MiraFeatureVector modelFeatures; + MiraFeatureVector hopeFeatures; + MiraFeatureVector fearFeatures; + + std::vector<float> modelStats; + std::vector<float> hopeStats; + + ValType hopeBleu; + ValType fearBleu; + + bool hopeFearEqual; +}; + +//Abstract base class +class HopeFearDecoder { +public: + //iterator methods + virtual void reset() = 0; + virtual void next() = 0; + virtual bool finished() = 0; + + /** + * Calculate hope, fear and model hypotheses + **/ + virtual void HopeFear( + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ) = 0; + + /** Max score decoding */ + virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) + = 0; + + /** Calculate bleu on training set */ + ValType Evaluate(const AvgWeightVector& wv); + +}; + + +/** Gets hope-fear from nbest lists */ +class NbestHopeFearDecoder : public virtual HopeFearDecoder { +public: + NbestHopeFearDecoder(const std::vector<std::string>& featureFiles, + const std::vector<std::string>& scoreFiles, + bool streaming, + bool no_shuffle, + bool safe_hope + ); + + virtual void reset(); + virtual void next(); + virtual bool finished(); + + virtual void HopeFear( + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ); + + virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats); + +private: + boost::scoped_ptr<HypPackEnumerator> train_; + bool safe_hope_; + +}; + + + +/** Gets hope-fear from hypergraphs */ +class HypergraphHopeFearDecoder : public virtual HopeFearDecoder { +public: + HypergraphHopeFearDecoder( + const std::string& hypergraphDir, + const std::vector<std::string>& referenceFiles, + size_t num_dense, + bool streaming, + bool no_shuffle, + bool safe_hope, + size_t hg_pruning, + const MiraWeightVector& wv + ); + + virtual void reset(); + virtual void next(); + virtual bool finished(); + + virtual void HopeFear( + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ); + + virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats); + +private: + size_t num_dense_; + //maps sentence Id to graph ptr + typedef std::map<size_t, boost::shared_ptr<Graph> > GraphColl; + GraphColl graphs_; + GraphColl::const_iterator graphIter_; + ReferenceSet references_; + Vocab vocab_; +}; + +}; + +#endif + diff --git a/mert/Hypergraph.cpp b/mert/Hypergraph.cpp new file mode 100644 index 000000000..d80a5c12c --- /dev/null +++ b/mert/Hypergraph.cpp @@ -0,0 +1,286 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ +#include <iostream> +#include <set> + +#include <boost/lexical_cast.hpp> + +#include "util/double-conversion/double-conversion.h" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" + +#include "Hypergraph.h" + +using namespace std; +static const string kBOS = "<s>"; +static const string kEOS = "</s>"; + +namespace MosesTuning { + +StringPiece NextLine(util::FilePiece& from) { + StringPiece line; + while ((line = from.ReadLine()).starts_with("#")); + return line; +} + +Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){ +} + +const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) { +#if BOOST_VERSION >= 104200 + Map::const_iterator i= map_.find(str, Hash(), Equals()); +#else + std::string copied_str(str.data(), str.size()); + Map::const_iterator i = map_.find(copied_str.c_str()); +#endif + if (i != map_.end()) return *i; + char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1)); + memcpy(copied, str.data(), str.size()); + copied[str.size()] = 0; + return *map_.insert(Entry(copied, map_.size())).first; +} + +double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); + + +/** + * Reads an incoming edge. Returns edge and source words covered. +**/ +static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) { + Edge* edge = graph.NewEdge(); + StringPiece line = NextLine(from); + util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| ")); + //Target + for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { + StringPiece got = *i; + if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) { + // non-terminal + char *end_ptr; + unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10); + UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got); + UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?"); + edge->AddWord(NULL); + edge->AddChild(child); + } else { + const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got); + edge->AddWord(&found); + } + } + + //Features + ++pipes; + for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { + StringPiece fv = *i; + if (!fv.size()) break; + size_t equals = fv.find_last_of("="); + UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'"); + StringPiece name = fv.substr(0,equals); + StringPiece value = fv.substr(equals+1); + int processed; + float score = converter.StringToFloat(value.data(), value.length(), &processed); + UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'"); + edge->AddFeature(name,score); + } + //Covered words + ++pipes; + size_t sourceCovered = boost::lexical_cast<size_t>(*pipes); + return pair<Edge*,size_t>(edge,sourceCovered); +} + +void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const { + + Graph& newGraph = *pNewGraph; + //TODO: Optimise case where no pruning required + + //For debug + + /* + map<const Edge*, string> edgeIds; + for (size_t i = 0; i < edges_.Size(); ++i) { + stringstream str; + for (size_t j = 0; j < edges_[i].Words().size(); ++j) { + if (edges_[i].Words()[j]) str << edges_[i].Words()[j]->first; + } + edgeIds[&(edges_[i])] = str.str(); + }*/ + + //end For debug + + map<const Edge*, FeatureStatsType> edgeBackwardScores; + map<const Edge*, size_t> edgeHeads; + vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore); + vector<vector<const Edge*> > outgoing(vertices_.Size()); + + //Compute backward scores + for (size_t vi = 0; vi < vertices_.Size(); ++vi) { + //cerr << "Vertex " << vi << endl; + const Vertex& vertex = vertices_[vi]; + const vector<const Edge*>& incoming = vertex.GetIncoming(); + if (!incoming.size()) { + vertexBackwardScores[vi] = 0; + } else { + for (size_t ei = 0; ei < incoming.size(); ++ei) { + //cerr << "Edge " << edgeIds[incoming[ei]] << endl; + edgeHeads[incoming[ei]]= vi; + FeatureStatsType incomingScore = incoming[ei]->GetScore(weights); + for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) { + // cerr << "\tChild " << incoming[ei]->Children()[i] << endl; + size_t childId = incoming[ei]->Children()[i]; + UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore, + HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); + outgoing[childId].push_back(incoming[ei]); + incomingScore += vertexBackwardScores[childId]; + } + edgeBackwardScores[incoming[ei]]= incomingScore; + // cerr << "Backward score: " << incomingScore << endl; + if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore; + } + } + } + + //Compute forward scores + vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore); + map<const Edge*, FeatureStatsType> edgeForwardScores; + for (size_t i = 1; i <= vertices_.Size(); ++i) { + size_t vi = vertices_.Size() - i; + //cerr << "Vertex " << vi << endl; + if (!outgoing[vi].size()) { + vertexForwardScores[vi] = 0; + } else { + for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) { + //cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl; + FeatureStatsType outgoingScore = 0; + //sum scores of siblings + for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) { + size_t siblingId = outgoing[vi][ei]->Children()[i]; + if (siblingId != vi) { + // cerr << "\tSibling " << siblingId << endl; + outgoingScore += vertexBackwardScores[siblingId]; + } + } + //add score of head + outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]]; + //cerr << "Forward score " << outgoingScore << endl; + edgeForwardScores[outgoing[vi][ei]] = outgoingScore; + outgoingScore += outgoing[vi][ei]->GetScore(weights); + if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore; + } + } + } + + + + multimap<FeatureStatsType, const Edge*> edgeScores; + for (size_t i = 0; i < edges_.Size(); ++i) { + const Edge* edge = &(edges_[i]); + if (edgeForwardScores.find(edge) == edgeForwardScores.end()) { + //This edge has no children, so didn't get a forward score. Its forward score + //is that of its head + edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]]; + } + FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge]; + edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge)); + // cerr << edgeIds[edge] << " " << score << endl; + } + + + + multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin(); + size_t edgeCount = 1; + while(edgeCount < minEdgeCount && ei != edgeScores.rend()) { + ++ei; + ++edgeCount; + } + multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin(); + if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first); + + //cerr << "Retained edges" << endl; + set<size_t> retainedVertices; + set<const Edge*> retainedEdges; + for (; lowest != edgeScores.end(); ++lowest) { + //cerr << lowest->first << " " << edgeIds[lowest->second] << endl; + retainedEdges.insert(lowest->second); + retainedVertices.insert(edgeHeads[lowest->second]); + for (size_t i = 0; i < lowest->second->Children().size(); ++i) { + retainedVertices.insert(lowest->second->Children()[i]); + } + } + newGraph.SetCounts(retainedVertices.size(), retainedEdges.size()); + + //cerr << "Retained vertices" << endl; + map<size_t,size_t> oldIdToNew; + size_t vi = 0; + for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) { + //cerr << *i << endl; + oldIdToNew[*i] = vi; + Vertex* vertex = newGraph.NewVertex(); + vertex->SetSourceCovered(vertices_[*i].SourceCovered()); + } + + for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) { + Edge* newEdge = newGraph.NewEdge(); + const Edge* oldEdge = *i; + for (size_t j = 0; j < oldEdge->Words().size(); ++j) { + newEdge->AddWord(oldEdge->Words()[j]); + } + for (size_t j = 0; j < oldEdge->Children().size(); ++j) { + newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]); + } + newEdge->SetFeatures(oldEdge->Features()); + Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]]; + newHead.AddEdge(newEdge); + } + + + + +} + +/** + * Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments) +**/ +void ReadGraph(util::FilePiece &from, Graph &graph) { + + //First line should contain field names + StringPiece line = from.ReadLine(); + UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'"); + line = NextLine(from); + + //Then expect numbers of vertices + util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' ')); + unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i); + ++i; + unsigned long int edges = boost::lexical_cast<unsigned long int>(*i); + graph.SetCounts(vertices, edges); + //cerr << "vertices: " << vertices << "; edges: " << edges << endl; + for (size_t i = 0; i < vertices; ++i) { + line = NextLine(from); + unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line); + Vertex* vertex = graph.NewVertex(); + for (unsigned long int e = 0; e < edge_count; ++e) { + pair<Edge*,size_t> edge = ReadEdge(from, graph); + vertex->AddEdge(edge.first); + //Note: the file format attaches this to the edge, but it's really a property + //of the vertex. + if (!e) {vertex->SetSourceCovered(edge.second);} + } + } +} + +}; diff --git a/mert/Hypergraph.h b/mert/Hypergraph.h new file mode 100644 index 000000000..b6ee6c3f8 --- /dev/null +++ b/mert/Hypergraph.h @@ -0,0 +1,251 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef MERT_HYPERGRAPH_H +#define MERT_HYPERGRAPH_H + +#include <string> + +#include <boost/noncopyable.hpp> +#include <boost/scoped_array.hpp> +#include <boost/shared_ptr.hpp> +#include <boost/functional/hash/hash.hpp> +#include <boost/unordered_map.hpp> + + +#include "util/exception.hh" +#include "util/file_piece.hh" +#include "util/murmur_hash.hh" +#include "util/pool.hh" +#include "util/string_piece.hh" + +#include "FeatureStats.h" + +namespace MosesTuning { + +typedef unsigned int WordIndex; +const WordIndex kMaxWordIndex = UINT_MAX; +const FeatureStatsType kMinScore = -std::numeric_limits<FeatureStatsType>::max(); + +template <class T> class FixedAllocator : boost::noncopyable { + public: + FixedAllocator() : current_(NULL), end_(NULL) {} + + void Init(std::size_t count) { + assert(!current_); + array_.reset(new T[count]); + current_ = array_.get(); + end_ = current_ + count; + } + + T &operator[](std::size_t idx) { + return array_.get()[idx]; + } + const T &operator[](std::size_t idx) const { + return array_.get()[idx]; + } + + T *New() { + T *ret = current_++; + UTIL_THROW_IF(ret >= end_, util::Exception, "Allocating past end"); + return ret; + } + + std::size_t Capacity() const { + return end_ - array_.get(); + } + + std::size_t Size() const { + return current_ - array_.get(); + } + + private: + boost::scoped_array<T> array_; + T *current_, *end_; +}; + + +class Vocab { + public: + Vocab(); + + typedef std::pair<const char *const, WordIndex> Entry; + + const Entry &FindOrAdd(const StringPiece &str); + + const Entry& Bos() const {return bos_;} + + const Entry& Eos() const {return eos_;} + + private: + util::Pool piece_backing_; + + struct Hash : public std::unary_function<const char *, std::size_t> { + std::size_t operator()(StringPiece str) const { + return util::MurmurHashNative(str.data(), str.size()); + } + }; + + struct Equals : public std::binary_function<const char *, const char *, bool> { + bool operator()(StringPiece first, StringPiece second) const { + return first == second; + } + }; + + typedef boost::unordered_map<const char *, WordIndex, Hash, Equals> Map; + Map map_; + Entry eos_; + Entry bos_; + +}; + +typedef std::vector<const Vocab::Entry*> WordVec; + +class Vertex; + +//Use shared pointer to save copying when we prune +typedef boost::shared_ptr<SparseVector> FeaturePtr; + +/** + * An edge has 1 head vertex, 0..n child (tail) vertices, a list of words and a feature vector. +**/ +class Edge { + public: + Edge() {features_.reset(new SparseVector());} + + void AddWord(const Vocab::Entry *word) { + words_.push_back(word); + } + + void AddChild(size_t child) { + children_.push_back(child); + } + + void AddFeature(const StringPiece& name, FeatureStatsType value) { + //TODO StringPiece interface + features_->set(name.as_string(),value); + } + + + const WordVec &Words() const { + return words_; + } + + const FeaturePtr& Features() const { + return features_; + } + + void SetFeatures(const FeaturePtr& features) { + features_ = features; + } + + const std::vector<size_t>& Children() const { + return children_; + } + + FeatureStatsType GetScore(const SparseVector& weights) const { + return inner_product(*(features_.get()), weights); + } + + private: + // NULL for non-terminals. + std::vector<const Vocab::Entry*> words_; + std::vector<size_t> children_; + boost::shared_ptr<SparseVector> features_; +}; + +/* + * A vertex has 0..n incoming edges + **/ +class Vertex { + public: + Vertex() : sourceCovered_(0) {} + + void AddEdge(const Edge* edge) {incoming_.push_back(edge);} + + void SetSourceCovered(size_t sourceCovered) {sourceCovered_ = sourceCovered;} + + const std::vector<const Edge*>& GetIncoming() const {return incoming_;} + + size_t SourceCovered() const {return sourceCovered_;} + + private: + std::vector<const Edge*> incoming_; + size_t sourceCovered_; +}; + + +class Graph : boost::noncopyable { + public: + Graph(Vocab& vocab) : vocab_(vocab) {} + + void SetCounts(std::size_t vertices, std::size_t edges) { + vertices_.Init(vertices); + edges_.Init(edges); + } + + Vocab &MutableVocab() { return vocab_; } + + Edge *NewEdge() { + return edges_.New(); + } + + Vertex *NewVertex() { + return vertices_.New(); + } + + const Vertex &GetVertex(std::size_t index) const { + return vertices_[index]; + } + + Edge &GetEdge(std::size_t index) { + return edges_[index]; + } + + /* Created a pruned copy of this graph with minEdgeCount edges. Uses + the scores in the max-product semiring to rank edges, as suggested by + Colin Cherry */ + void Prune(Graph* newGraph, const SparseVector& weights, size_t minEdgeCount) const; + + std::size_t VertexSize() const { return vertices_.Size(); } + std::size_t EdgeSize() const { return edges_.Size(); } + + bool IsBoundary(const Vocab::Entry* word) const { + return word->second == vocab_.Bos().second || word->second == vocab_.Eos().second; + } + + private: + FixedAllocator<Edge> edges_; + FixedAllocator<Vertex> vertices_; + Vocab& vocab_; +}; + +class HypergraphException : public util::Exception { + public: + HypergraphException() {} + ~HypergraphException() throw() {} +}; + + +void ReadGraph(util::FilePiece &from, Graph &graph); + + +}; + +#endif diff --git a/mert/HypergraphTest.cpp b/mert/HypergraphTest.cpp new file mode 100644 index 000000000..345a445f0 --- /dev/null +++ b/mert/HypergraphTest.cpp @@ -0,0 +1,151 @@ +#include <iostream> + +#define BOOST_TEST_MODULE MertForestRescore +#include <boost/test/unit_test.hpp> + +#include "Hypergraph.h" + +using namespace std; +using namespace MosesTuning; + +BOOST_AUTO_TEST_CASE(prune) +{ + Vocab vocab; + WordVec words; + string wordStrings[] = + {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; + for (size_t i = 0; i < 13; ++i) { + words.push_back(&(vocab.FindOrAdd((wordStrings[i])))); + } + + const string f1 = "foo"; + const string f2 = "bar"; + Graph graph(vocab); + graph.SetCounts(5,8); + + Edge* e0 = graph.NewEdge(); + e0->AddWord(words[0]); + + Vertex* v0 = graph.NewVertex(); + v0->AddEdge(e0); + + Edge* e1 = graph.NewEdge(); + e1->AddWord(NULL); + e1->AddChild(0); + e1->AddWord(words[2]); + e1->AddWord(words[3]); + e1->AddFeature(f1,1); + e1->AddFeature(f2,1); + Edge* e5 = graph.NewEdge(); + e5->AddWord(NULL); + e5->AddChild(0); + e5->AddWord(words[9]); + e5->AddWord(words[10]); + e5->AddFeature(f1,2); + e5->AddFeature(f2,-2); + + Vertex* v1 = graph.NewVertex(); + v1->AddEdge(e1); + v1->AddEdge(e5); + v1->SetSourceCovered(1); + + Edge* e2 = graph.NewEdge(); + e2->AddWord(NULL); + e2->AddChild(1); + e2->AddWord(words[4]); + e2->AddWord(words[5]); + e2->AddFeature(f2,3); + + Vertex* v2 = graph.NewVertex(); + v2->AddEdge(e2); + v2->SetSourceCovered(3); + + Edge* e3 = graph.NewEdge(); + e3->AddWord(NULL); + e3->AddChild(2); + e3->AddWord(words[6]); + e3->AddWord(words[7]); + e3->AddWord(words[8]); + e3->AddFeature(f1,1); + Edge* e6 = graph.NewEdge(); + e6->AddWord(NULL); + e6->AddChild(2); + e6->AddWord(words[9]); + e6->AddWord(words[12]); + e6->AddFeature(f2,1); + Edge* e7 = graph.NewEdge(); + e7->AddWord(NULL); + e7->AddChild(1); + e7->AddWord(words[11]); + e7->AddWord(words[12]); + e7->AddFeature(f1,2); + e7->AddFeature(f2,3); + + Vertex* v3 = graph.NewVertex(); + v3->AddEdge(e3); + v3->AddEdge(e6); + v3->AddEdge(e7); + v3->SetSourceCovered(5); + + Edge* e4 = graph.NewEdge(); + e4->AddWord(NULL); + e4->AddChild(3); + e4->AddWord(words[1]); + + Vertex* v4 = graph.NewVertex(); + v4->AddEdge(e4); + v4->SetSourceCovered(6); + + SparseVector weights; + weights.set(f1,2); + weights.set(f2,1); + + Graph pruned(vocab); + graph.Prune(&pruned, weights, 5); + + BOOST_CHECK_EQUAL(5, pruned.EdgeSize()); + BOOST_CHECK_EQUAL(4, pruned.VertexSize()); + + //edges retained should be best path (<s> ab jk </s>) and hi + BOOST_CHECK_EQUAL(1, pruned.GetVertex(0).GetIncoming().size()); + BOOST_CHECK_EQUAL(2, pruned.GetVertex(1).GetIncoming().size()); + BOOST_CHECK_EQUAL(1, pruned.GetVertex(2).GetIncoming().size()); + BOOST_CHECK_EQUAL(1, pruned.GetVertex(3).GetIncoming().size()); + + const Edge* edge; + + edge = pruned.GetVertex(0).GetIncoming()[0]; + BOOST_CHECK_EQUAL(1, edge->Words().size()); + BOOST_CHECK_EQUAL(words[0], edge->Words()[0]); + + edge = pruned.GetVertex(1).GetIncoming()[0]; + BOOST_CHECK_EQUAL(3, edge->Words().size()); + BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); + BOOST_CHECK_EQUAL(words[2]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[3]->first, edge->Words()[2]->first); + + edge = pruned.GetVertex(1).GetIncoming()[1]; + BOOST_CHECK_EQUAL(3, edge->Words().size()); + BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); + BOOST_CHECK_EQUAL(words[9]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[10]->first, edge->Words()[2]->first); + + edge = pruned.GetVertex(2).GetIncoming()[0]; + BOOST_CHECK_EQUAL(3, edge->Words().size()); + BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); + BOOST_CHECK_EQUAL(words[11]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[12]->first, edge->Words()[2]->first); + + edge = pruned.GetVertex(3).GetIncoming()[0]; + BOOST_CHECK_EQUAL(2, edge->Words().size()); + BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); + BOOST_CHECK_EQUAL(words[1]->first, edge->Words()[1]->first); + + + + + +// BOOST_CHECK_EQUAL(words[0], pruned.GetVertex(0).GetIncoming()[0].Words()[0]); + + +} diff --git a/mert/Jamfile b/mert/Jamfile index 34c640b06..d848c258f 100644 --- a/mert/Jamfile +++ b/mert/Jamfile @@ -15,6 +15,9 @@ FeatureStats.cpp FeatureArray.cpp FeatureData.cpp FeatureDataIterator.cpp +ForestRescore.cpp +HopeFearDecoder.cpp +Hypergraph.cpp MiraFeatureVector.cpp MiraWeightVector.cpp HypPackEnumerator.cpp @@ -62,13 +65,15 @@ exe sentence-bleu : sentence-bleu.cpp mert_lib ; exe pro : pro.cpp mert_lib ..//boost_program_options ; -exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ; +exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ..//boost_filesystem ; alias programs : mert extractor evaluator pro kbmira sentence-bleu ; unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ; unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ; unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ; +unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ; +unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ; unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ; unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ; unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ; diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp index dea9b9b83..347ad488e 100644 --- a/mert/MiraFeatureVector.cpp +++ b/mert/MiraFeatureVector.cpp @@ -9,18 +9,17 @@ namespace MosesTuning { -MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) - : m_dense(vec.dense) -{ - vector<size_t> sparseFeats = vec.sparse.feats(); +void MiraFeatureVector::InitSparse(const SparseVector& sparse, size_t ignoreLimit) { + vector<size_t> sparseFeats = sparse.feats(); bool bFirst = true; size_t lastFeat = 0; m_sparseFeats.reserve(sparseFeats.size()); m_sparseVals.reserve(sparseFeats.size()); for(size_t i=0; i<sparseFeats.size(); i++) { + if (sparseFeats[i] < ignoreLimit) continue; size_t feat = m_dense.size() + sparseFeats[i]; m_sparseFeats.push_back(feat); - m_sparseVals.push_back(vec.sparse.get(sparseFeats[i])); + m_sparseVals.push_back(sparse.get(sparseFeats[i])); // Check ordered property if(bFirst) { @@ -35,6 +34,21 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) } } +MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) + : m_dense(vec.dense) +{ + InitSparse(vec.sparse); +} + +MiraFeatureVector::MiraFeatureVector(const SparseVector& sparse, size_t num_dense) { + m_dense.resize(num_dense); + //Assume that features with id [0,num_dense) are the dense features + for (size_t id = 0; id < num_dense; ++id) { + m_dense[id] = sparse.get(id); + } + InitSparse(sparse,num_dense); +} + MiraFeatureVector::MiraFeatureVector(const MiraFeatureVector& other) : m_dense(other.m_dense), m_sparseFeats(other.m_sparseFeats), @@ -148,6 +162,22 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& return MiraFeatureVector(dense,sparseFeats,sparseVals); } +bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b) { + ValType eps = 1e-8; + //dense features + if (a.m_dense.size() != b.m_dense.size()) return false; + for (size_t i = 0; i < a.m_dense.size(); ++i) { + if (fabs(a.m_dense[i]-b.m_dense[i]) < eps) return false; + } + if (a.m_sparseFeats.size() != b.m_sparseFeats.size()) return false; + for (size_t i = 0; i < a.m_sparseFeats.size(); ++i) { + if (a.m_sparseFeats[i] != b.m_sparseFeats[i]) return false; + if (fabs(a.m_sparseVals[i] != b.m_sparseVals[i])) return false; + } + return true; + +} + ostream& operator<<(ostream& o, const MiraFeatureVector& e) { for(size_t i=0; i<e.size(); i++) { diff --git a/mert/MiraFeatureVector.h b/mert/MiraFeatureVector.h index cb2b1c87d..48aa496b5 100644 --- a/mert/MiraFeatureVector.h +++ b/mert/MiraFeatureVector.h @@ -26,7 +26,10 @@ typedef FeatureStatsType ValType; class MiraFeatureVector { public: + MiraFeatureVector() {} MiraFeatureVector(const FeatureDataItem& vec); + //Assumes that features in sparse with id < num_dense are dense features + MiraFeatureVector(const SparseVector& sparse, size_t num_dense); MiraFeatureVector(const MiraFeatureVector& other); MiraFeatureVector(const std::vector<ValType>& dense, const std::vector<std::size_t>& sparseFeats, @@ -42,7 +45,12 @@ public: friend std::ostream& operator<<(std::ostream& o, const MiraFeatureVector& e); + friend bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b); + private: + //Ignore any sparse features with id < ignoreLimit + void InitSparse(const SparseVector& sparse, size_t ignoreLimit = 0); + std::vector<ValType> m_dense; std::vector<std::size_t> m_sparseFeats; std::vector<ValType> m_sparseVals; diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp index e23804cbf..3b7b1780c 100644 --- a/mert/MiraWeightVector.cpp +++ b/mert/MiraWeightVector.cpp @@ -93,6 +93,14 @@ void MiraWeightVector::update(size_t index, ValType delta) m_lastUpdated[index] = m_numUpdates; } +void MiraWeightVector::ToSparse(SparseVector* sparse) const { + for (size_t i = 0; i < m_weights.size(); ++i) { + if(abs(m_weights[i])>1e-8) { + sparse->set(i,m_weights[i]); + } + } +} + /** * Make sure everyone's total is up-to-date */ @@ -163,6 +171,15 @@ size_t AvgWeightVector::size() const return m_wv.m_weights.size(); } +void AvgWeightVector::ToSparse(SparseVector* sparse) const { + for (size_t i = 0; i < size(); ++i) { + ValType w = weight(i); + if(abs(w)>1e-8) { + sparse->set(i,w); + } + } +} + // --Emacs trickery-- // Local Variables: // mode:c++ diff --git a/mert/MiraWeightVector.h b/mert/MiraWeightVector.h index eb27e8a6d..bbc28704b 100644 --- a/mert/MiraWeightVector.h +++ b/mert/MiraWeightVector.h @@ -63,6 +63,11 @@ public: */ AvgWeightVector avg(); + /** + * Convert to sparse vector, interpreting all features as sparse. + **/ + void ToSparse(SparseVector* sparse) const; + friend class AvgWeightVector; friend std::ostream& operator<<(std::ostream& o, const MiraWeightVector& e); @@ -99,12 +104,12 @@ public: ValType score(const MiraFeatureVector& fv) const; ValType weight(std::size_t index) const; std::size_t size() const; + void ToSparse(SparseVector* sparse) const; private: const MiraWeightVector& m_wv; }; -#endif // MERT_WEIGHT_VECTOR_H // --Emacs trickery-- // Local Variables: @@ -113,3 +118,4 @@ private: // End: } +#endif // MERT_WEIGHT_VECTOR_H diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp index a2665ac13..da552fa36 100644 --- a/mert/kbmira.cpp +++ b/mert/kbmira.cpp @@ -39,8 +39,10 @@ de recherches du Canada #include <boost/program_options.hpp> #include <boost/scoped_ptr.hpp> +#include "util/exception.hh" + #include "BleuScorer.h" -#include "HypPackEnumerator.h" +#include "HopeFearDecoder.h" #include "MiraFeatureVector.h" #include "MiraWeightVector.h" @@ -49,38 +51,16 @@ using namespace MosesTuning; namespace po = boost::program_options; -ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) -{ - vector<ValType> stats(kBleuNgramOrder*2+1,0); - for(train->reset(); !train->finished(); train->next()) { - // Find max model - size_t max_index=0; - ValType max_score=0; - for(size_t i=0; i<train->cur_size(); i++) { - MiraFeatureVector vec(train->featuresAt(i)); - ValType score = wv.score(vec); - if(i==0 || score > max_score) { - max_index = i; - max_score = score; - } - } - // Update stats - const vector<float>& sent = train->scoresAt(max_index); - for(size_t i=0; i<sent.size(); i++) { - stats[i]+=sent[i]; - } - } - return unsmoothedBleu(stats); -} - int main(int argc, char** argv) { - const ValType BLEU_RATIO = 5; bool help; string denseInitFile; string sparseInitFile; + string type = "nbest"; vector<string> scoreFiles; vector<string> featureFiles; + vector<string> referenceFiles; //for hg mira + string hgDir; int seed; string outputFile; float c = 0.01; // Step-size cap C @@ -91,25 +71,30 @@ int main(int argc, char** argv) bool model_bg = false; // Use model for background corpus bool verbose = false; // Verbose updates bool safe_hope = false; // Model score cannot have more than BLEU_RATIO times more influence than BLEU + size_t hgPruning = 50; //prune hypergraphs to have this many edges per reference word // Command-line processing follows pro.cpp po::options_description desc("Allowed options"); desc.add_options() ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") + ("type,t", po::value<string>(&type), "Either nbest or hypergraph") ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files") ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files") + ("hgdir,H", po::value<string> (&hgDir), "Directory containing hypergraphs") + ("reference,R", po::value<vector<string> > (&referenceFiles), "Reference files, only required for hypergraph mira") ("random-seed,r", po::value<int>(&seed), "Seed for random number generation") ("output-file,o", po::value<string>(&outputFile), "Output file") ("cparam,C", po::value<float>(&c), "MIRA C-parameter, lower for more regularization (default 0.01)") ("decay,D", po::value<float>(&decay), "BLEU background corpus decay rate (default 0.999)") ("iters,J", po::value<int>(&n_iters), "Number of MIRA iterations to run (default 60)") - ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features") + ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features. This should have 'name= value' on each line, or (legacy) should be the Moses mert 'init.opt' format.") ("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features") ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle") ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch") ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background") ("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates") ("safe-hope", po::value(&safe_hope)->zero_tokens()->default_value(false), "Mode score's influence on hope decoding is limited") + ("hg-prune", po::value<size_t>(&hgPruning), "Prune hypergraphs to have this many edges per reference word") ; po::options_description cmdline_options; @@ -145,12 +130,56 @@ int main(int argc, char** argv) cerr << "could not open dense initfile: " << denseInitFile << endl; exit(3); } + if (verbose) cerr << "Reading dense features:" << endl; parameter_t val; getline(opt,buffer); - istringstream strstrm(buffer); - while(strstrm >> val) { - initParams.push_back(val); + if (buffer.find_first_of("=") == buffer.npos) { + UTIL_THROW_IF(type == "hypergraph", util::Exception, "For hypergraph version, require dense features in 'name= value' format"); + cerr << "WARN: dense features in deprecated Moses mert format. Prefer 'name= value' format." << endl; + istringstream strstrm(buffer); + while(strstrm >> val) { + initParams.push_back(val); + if(verbose) cerr << val << endl; + } + } else { + vector<string> names; + string last_name = ""; + size_t feature_ctr = 0; + do { + size_t equals = buffer.find_last_of("="); + UTIL_THROW_IF(equals == buffer.npos, util::Exception, "Incorrect format in dense feature file: '" + << buffer << "'"); + string name = buffer.substr(0,equals); + names.push_back(name); + initParams.push_back(boost::lexical_cast<ValType>(buffer.substr(equals+2))); + + //Names for features with several values need to have their id added + if (name != last_name) feature_ctr = 0; + last_name = name; + if (feature_ctr) { + stringstream namestr; + namestr << names.back() << feature_ctr; + names[names.size()-1] = namestr.str(); + if (feature_ctr == 1) { + stringstream namestr; + namestr << names[names.size()-2] << (feature_ctr-1); + names[names.size()-2] = namestr.str(); + } + } + ++feature_ctr; + + } while(getline(opt,buffer)); + + + //Make sure that SparseVector encodes dense feature names as 0..n-1. + for (size_t i = 0; i < names.size(); ++i) { + size_t id = SparseVector::encode(names[i]); + assert(id == i); + if (verbose) cerr << names[i] << " " << initParams[i] << endl; + } + } + opt.close(); } size_t initDenseSize = initParams.size(); @@ -188,82 +217,45 @@ int main(int argc, char** argv) } bg.push_back(kBleuNgramOrder); + boost::scoped_ptr<HopeFearDecoder> decoder; + if (type == "nbest") { + decoder.reset(new NbestHopeFearDecoder(featureFiles, scoreFiles, streaming, no_shuffle, safe_hope)); + } else if (type == "hypergraph") { + decoder.reset(new HypergraphHopeFearDecoder(hgDir, referenceFiles, initDenseSize, streaming, no_shuffle, safe_hope, hgPruning, wv)); + } else { + UTIL_THROW(util::Exception, "Unknown batch mira type: '" << type << "'"); + } + // Training loop - boost::scoped_ptr<HypPackEnumerator> train; - if(streaming) - train.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles)); - else - train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle)); - cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl; + cerr << "Initial BLEU = " << decoder->Evaluate(wv.avg()) << endl; ValType bestBleu = 0; for(int j=0; j<n_iters; j++) { // MIRA train for one epoch - int iNumHyps = 0; int iNumExamples = 0; int iNumUpdates = 0; ValType totalLoss = 0.0; - for(train->reset(); !train->finished(); train->next()) { - // Hope / fear decode - ValType hope_scale = 1.0; - size_t hope_index=0, fear_index=0, model_index=0; - ValType hope_score=0, fear_score=0, model_score=0; - int iNumHypsBackup = iNumHyps; - for(size_t safe_loop=0; safe_loop<2; safe_loop++) { - iNumHyps = iNumHypsBackup; - ValType hope_bleu, hope_model; - for(size_t i=0; i< train->cur_size(); i++) { - const MiraFeatureVector& vec=train->featuresAt(i); - ValType score = wv.score(vec); - ValType bleu = sentenceLevelBackgroundBleu(train->scoresAt(i),bg); - // Hope - if(i==0 || (hope_scale*score + bleu) > hope_score) { - hope_score = hope_scale*score + bleu; - hope_index = i; - hope_bleu = bleu; - hope_model = score; - } - // Fear - if(i==0 || (score - bleu) > fear_score) { - fear_score = score - bleu; - fear_index = i; - } - // Model - if(i==0 || score > model_score) { - model_score = score; - model_index = i; - } - iNumHyps++; - } - // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases - // where model score is having far more influence than BLEU - hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU - if(safe_hope && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale) - hope_scale = abs(hope_bleu) / abs(hope_model); - else break; - } + size_t sentenceIndex = 0; + for(decoder->reset();!decoder->finished(); decoder->next()) { + HopeFearData hfd; + decoder->HopeFear(bg,wv,&hfd); + // Update weights - if(hope_index!=fear_index) { + if (!hfd.hopeFearEqual && hfd.hopeBleu > hfd.fearBleu) { // Vector difference - const MiraFeatureVector& hope=train->featuresAt(hope_index); - const MiraFeatureVector& fear=train->featuresAt(fear_index); - MiraFeatureVector diff = hope - fear; + MiraFeatureVector diff = hfd.hopeFeatures - hfd.fearFeatures; // Bleu difference - const vector<float>& hope_stats = train->scoresAt(hope_index); - ValType hopeBleu = sentenceLevelBackgroundBleu(hope_stats, bg); - const vector<float>& fear_stats = train->scoresAt(fear_index); - ValType fearBleu = sentenceLevelBackgroundBleu(fear_stats, bg); - assert(hopeBleu + 1e-8 >= fearBleu); - ValType delta = hopeBleu - fearBleu; + //assert(hfd.hopeBleu + 1e-8 >= hfd.fearBleu); + ValType delta = hfd.hopeBleu - hfd.fearBleu; // Loss and update ValType diff_score = wv.score(diff); ValType loss = delta - diff_score; if(verbose) { - cerr << "Updating sent " << train->cur_id() << endl; + cerr << "Updating sent " << sentenceIndex << endl; cerr << "Wght: " << wv << endl; - cerr << "Hope: " << hope << " BLEU:" << hopeBleu << " Score:" << wv.score(hope) << endl; - cerr << "Fear: " << fear << " BLEU:" << fearBleu << " Score:" << wv.score(fear) << endl; + cerr << "Hope: " << hfd.hopeFeatures << " BLEU:" << hfd.hopeBleu << " Score:" << wv.score(hfd.hopeFeatures) << endl; + cerr << "Fear: " << hfd.fearFeatures << " BLEU:" << hfd.fearBleu << " Score:" << wv.score(hfd.fearFeatures) << endl; cerr << "Diff: " << diff << " BLEU:" << delta << " Score:" << diff_score << endl; - cerr << "Loss: " << loss << " Scale: " << hope_scale << endl; + cerr << "Loss: " << loss << " Scale: " << 1 << endl; cerr << endl; } if(loss > 0) { @@ -273,16 +265,16 @@ int main(int argc, char** argv) iNumUpdates++; } // Update BLEU statistics - const vector<float>& model_stats = train->scoresAt(model_index); for(size_t k=0; k<bg.size(); k++) { bg[k]*=decay; if(model_bg) - bg[k]+=model_stats[k]; + bg[k]+=hfd.modelStats[k]; else - bg[k]+=hope_stats[k]; + bg[k]+=hfd.hopeStats[k]; } } iNumExamples++; + ++sentenceIndex; } // Training Epoch summary cerr << iNumUpdates << "/" << iNumExamples << " updates" @@ -291,15 +283,16 @@ int main(int argc, char** argv) // Evaluate current average weights AvgWeightVector avg = wv.avg(); - ValType bleu = evaluate(train.get(), avg); + ValType bleu = decoder->Evaluate(avg); cerr << ", BLEU = " << bleu << endl; if(bleu > bestBleu) { + /* size_t num_dense = train->num_dense(); if(initDenseSize>0 && initDenseSize!=num_dense) { cerr << "Error: Initial dense feature count and dense feature count from n-best do not match: " << initDenseSize << "!=" << num_dense << endl; exit(1); - } + }*/ // Write to a file ostream* out; ofstream outFile; @@ -314,11 +307,11 @@ int main(int argc, char** argv) out = &cout; } for(size_t i=0; i<avg.size(); i++) { - if(i<num_dense) + if(i<initDenseSize) *out << "F" << i << " " << avg.weight(i) << endl; else { if(abs(avg.weight(i))>1e-8) - *out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl; + *out << SparseVector::decode(i-initDenseSize) << " " << avg.weight(i) << endl; } } outFile.close(); |