Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBarry Haddow <barry.haddow@gmail.com>2014-07-21 14:04:43 +0400
committerBarry Haddow <barry.haddow@gmail.com>2014-07-21 14:04:43 +0400
commitefee2695c31e1086af783c1b092fc842fb7bb1a4 (patch)
treee8324ea35cc92f0737c93f6b8fa8e23898e4d78a /mert/Hypergraph.cpp
parentc83c5a3ee6f3ef7480e7a782d2023af9e99c1711 (diff)
Merge 08811deb17337356cd8dae9c59c0160590679a35 from joshua
Diffstat (limited to 'mert/Hypergraph.cpp')
-rw-r--r--mert/Hypergraph.cpp286
1 files changed, 286 insertions, 0 deletions
diff --git a/mert/Hypergraph.cpp b/mert/Hypergraph.cpp
new file mode 100644
index 000000000..d80a5c12c
--- /dev/null
+++ b/mert/Hypergraph.cpp
@@ -0,0 +1,286 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+#include <iostream>
+#include <set>
+
+#include <boost/lexical_cast.hpp>
+
+#include "util/double-conversion/double-conversion.h"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "Hypergraph.h"
+
+using namespace std;
+static const string kBOS = "<s>";
+static const string kEOS = "</s>";
+
+namespace MosesTuning {
+
+StringPiece NextLine(util::FilePiece& from) {
+ StringPiece line;
+ while ((line = from.ReadLine()).starts_with("#"));
+ return line;
+}
+
+Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){
+}
+
+const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) {
+#if BOOST_VERSION >= 104200
+ Map::const_iterator i= map_.find(str, Hash(), Equals());
+#else
+ std::string copied_str(str.data(), str.size());
+ Map::const_iterator i = map_.find(copied_str.c_str());
+#endif
+ if (i != map_.end()) return *i;
+ char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1));
+ memcpy(copied, str.data(), str.size());
+ copied[str.size()] = 0;
+ return *map_.insert(Entry(copied, map_.size())).first;
+}
+
+double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+
+/**
+ * Reads an incoming edge. Returns edge and source words covered.
+**/
+static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) {
+ Edge* edge = graph.NewEdge();
+ StringPiece line = NextLine(from);
+ util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
+ //Target
+ for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
+ StringPiece got = *i;
+ if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) {
+ // non-terminal
+ char *end_ptr;
+ unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10);
+ UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got);
+ UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?");
+ edge->AddWord(NULL);
+ edge->AddChild(child);
+ } else {
+ const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got);
+ edge->AddWord(&found);
+ }
+ }
+
+ //Features
+ ++pipes;
+ for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
+ StringPiece fv = *i;
+ if (!fv.size()) break;
+ size_t equals = fv.find_last_of("=");
+ UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'");
+ StringPiece name = fv.substr(0,equals);
+ StringPiece value = fv.substr(equals+1);
+ int processed;
+ float score = converter.StringToFloat(value.data(), value.length(), &processed);
+ UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'");
+ edge->AddFeature(name,score);
+ }
+ //Covered words
+ ++pipes;
+ size_t sourceCovered = boost::lexical_cast<size_t>(*pipes);
+ return pair<Edge*,size_t>(edge,sourceCovered);
+}
+
+void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const {
+
+ Graph& newGraph = *pNewGraph;
+ //TODO: Optimise case where no pruning required
+
+ //For debug
+
+ /*
+ map<const Edge*, string> edgeIds;
+ for (size_t i = 0; i < edges_.Size(); ++i) {
+ stringstream str;
+ for (size_t j = 0; j < edges_[i].Words().size(); ++j) {
+ if (edges_[i].Words()[j]) str << edges_[i].Words()[j]->first;
+ }
+ edgeIds[&(edges_[i])] = str.str();
+ }*/
+
+ //end For debug
+
+ map<const Edge*, FeatureStatsType> edgeBackwardScores;
+ map<const Edge*, size_t> edgeHeads;
+ vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore);
+ vector<vector<const Edge*> > outgoing(vertices_.Size());
+
+ //Compute backward scores
+ for (size_t vi = 0; vi < vertices_.Size(); ++vi) {
+ //cerr << "Vertex " << vi << endl;
+ const Vertex& vertex = vertices_[vi];
+ const vector<const Edge*>& incoming = vertex.GetIncoming();
+ if (!incoming.size()) {
+ vertexBackwardScores[vi] = 0;
+ } else {
+ for (size_t ei = 0; ei < incoming.size(); ++ei) {
+ //cerr << "Edge " << edgeIds[incoming[ei]] << endl;
+ edgeHeads[incoming[ei]]= vi;
+ FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
+ for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
+ // cerr << "\tChild " << incoming[ei]->Children()[i] << endl;
+ size_t childId = incoming[ei]->Children()[i];
+ UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore,
+ HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
+ outgoing[childId].push_back(incoming[ei]);
+ incomingScore += vertexBackwardScores[childId];
+ }
+ edgeBackwardScores[incoming[ei]]= incomingScore;
+ // cerr << "Backward score: " << incomingScore << endl;
+ if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore;
+ }
+ }
+ }
+
+ //Compute forward scores
+ vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore);
+ map<const Edge*, FeatureStatsType> edgeForwardScores;
+ for (size_t i = 1; i <= vertices_.Size(); ++i) {
+ size_t vi = vertices_.Size() - i;
+ //cerr << "Vertex " << vi << endl;
+ if (!outgoing[vi].size()) {
+ vertexForwardScores[vi] = 0;
+ } else {
+ for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) {
+ //cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl;
+ FeatureStatsType outgoingScore = 0;
+ //sum scores of siblings
+ for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) {
+ size_t siblingId = outgoing[vi][ei]->Children()[i];
+ if (siblingId != vi) {
+ // cerr << "\tSibling " << siblingId << endl;
+ outgoingScore += vertexBackwardScores[siblingId];
+ }
+ }
+ //add score of head
+ outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]];
+ //cerr << "Forward score " << outgoingScore << endl;
+ edgeForwardScores[outgoing[vi][ei]] = outgoingScore;
+ outgoingScore += outgoing[vi][ei]->GetScore(weights);
+ if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore;
+ }
+ }
+ }
+
+
+
+ multimap<FeatureStatsType, const Edge*> edgeScores;
+ for (size_t i = 0; i < edges_.Size(); ++i) {
+ const Edge* edge = &(edges_[i]);
+ if (edgeForwardScores.find(edge) == edgeForwardScores.end()) {
+ //This edge has no children, so didn't get a forward score. Its forward score
+ //is that of its head
+ edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]];
+ }
+ FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge];
+ edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge));
+ // cerr << edgeIds[edge] << " " << score << endl;
+ }
+
+
+
+ multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin();
+ size_t edgeCount = 1;
+ while(edgeCount < minEdgeCount && ei != edgeScores.rend()) {
+ ++ei;
+ ++edgeCount;
+ }
+ multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin();
+ if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first);
+
+ //cerr << "Retained edges" << endl;
+ set<size_t> retainedVertices;
+ set<const Edge*> retainedEdges;
+ for (; lowest != edgeScores.end(); ++lowest) {
+ //cerr << lowest->first << " " << edgeIds[lowest->second] << endl;
+ retainedEdges.insert(lowest->second);
+ retainedVertices.insert(edgeHeads[lowest->second]);
+ for (size_t i = 0; i < lowest->second->Children().size(); ++i) {
+ retainedVertices.insert(lowest->second->Children()[i]);
+ }
+ }
+ newGraph.SetCounts(retainedVertices.size(), retainedEdges.size());
+
+ //cerr << "Retained vertices" << endl;
+ map<size_t,size_t> oldIdToNew;
+ size_t vi = 0;
+ for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) {
+ //cerr << *i << endl;
+ oldIdToNew[*i] = vi;
+ Vertex* vertex = newGraph.NewVertex();
+ vertex->SetSourceCovered(vertices_[*i].SourceCovered());
+ }
+
+ for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) {
+ Edge* newEdge = newGraph.NewEdge();
+ const Edge* oldEdge = *i;
+ for (size_t j = 0; j < oldEdge->Words().size(); ++j) {
+ newEdge->AddWord(oldEdge->Words()[j]);
+ }
+ for (size_t j = 0; j < oldEdge->Children().size(); ++j) {
+ newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]);
+ }
+ newEdge->SetFeatures(oldEdge->Features());
+ Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]];
+ newHead.AddEdge(newEdge);
+ }
+
+
+
+
+}
+
+/**
+ * Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments)
+**/
+void ReadGraph(util::FilePiece &from, Graph &graph) {
+
+ //First line should contain field names
+ StringPiece line = from.ReadLine();
+ UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'");
+ line = NextLine(from);
+
+ //Then expect numbers of vertices
+ util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' '));
+ unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i);
+ ++i;
+ unsigned long int edges = boost::lexical_cast<unsigned long int>(*i);
+ graph.SetCounts(vertices, edges);
+ //cerr << "vertices: " << vertices << "; edges: " << edges << endl;
+ for (size_t i = 0; i < vertices; ++i) {
+ line = NextLine(from);
+ unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line);
+ Vertex* vertex = graph.NewVertex();
+ for (unsigned long int e = 0; e < edge_count; ++e) {
+ pair<Edge*,size_t> edge = ReadEdge(from, graph);
+ vertex->AddEdge(edge.first);
+ //Note: the file format attaches this to the edge, but it's really a property
+ //of the vertex.
+ if (!e) {vertex->SetSourceCovered(edge.second);}
+ }
+ }
+}
+
+};