diff options
author | Barry Haddow <barry.haddow@gmail.com> | 2014-07-21 14:04:43 +0400 |
---|---|---|
committer | Barry Haddow <barry.haddow@gmail.com> | 2014-07-21 14:04:43 +0400 |
commit | efee2695c31e1086af783c1b092fc842fb7bb1a4 (patch) | |
tree | e8324ea35cc92f0737c93f6b8fa8e23898e4d78a /mert/Hypergraph.cpp | |
parent | c83c5a3ee6f3ef7480e7a782d2023af9e99c1711 (diff) |
Merge 08811deb17337356cd8dae9c59c0160590679a35 from joshua
Diffstat (limited to 'mert/Hypergraph.cpp')
-rw-r--r-- | mert/Hypergraph.cpp | 286 |
1 files changed, 286 insertions, 0 deletions
diff --git a/mert/Hypergraph.cpp b/mert/Hypergraph.cpp new file mode 100644 index 000000000..d80a5c12c --- /dev/null +++ b/mert/Hypergraph.cpp @@ -0,0 +1,286 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2014- University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ +#include <iostream> +#include <set> + +#include <boost/lexical_cast.hpp> + +#include "util/double-conversion/double-conversion.h" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" + +#include "Hypergraph.h" + +using namespace std; +static const string kBOS = "<s>"; +static const string kEOS = "</s>"; + +namespace MosesTuning { + +StringPiece NextLine(util::FilePiece& from) { + StringPiece line; + while ((line = from.ReadLine()).starts_with("#")); + return line; +} + +Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){ +} + +const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) { +#if BOOST_VERSION >= 104200 + Map::const_iterator i= map_.find(str, Hash(), Equals()); +#else + std::string copied_str(str.data(), str.size()); + Map::const_iterator i = map_.find(copied_str.c_str()); +#endif + if (i != map_.end()) return *i; + char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1)); + memcpy(copied, str.data(), str.size()); + copied[str.size()] = 0; + return *map_.insert(Entry(copied, map_.size())).first; +} + +double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); + + +/** + * Reads an incoming edge. Returns edge and source words covered. +**/ +static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) { + Edge* edge = graph.NewEdge(); + StringPiece line = NextLine(from); + util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| ")); + //Target + for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { + StringPiece got = *i; + if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) { + // non-terminal + char *end_ptr; + unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10); + UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got); + UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?"); + edge->AddWord(NULL); + edge->AddChild(child); + } else { + const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got); + edge->AddWord(&found); + } + } + + //Features + ++pipes; + for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { + StringPiece fv = *i; + if (!fv.size()) break; + size_t equals = fv.find_last_of("="); + UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'"); + StringPiece name = fv.substr(0,equals); + StringPiece value = fv.substr(equals+1); + int processed; + float score = converter.StringToFloat(value.data(), value.length(), &processed); + UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'"); + edge->AddFeature(name,score); + } + //Covered words + ++pipes; + size_t sourceCovered = boost::lexical_cast<size_t>(*pipes); + return pair<Edge*,size_t>(edge,sourceCovered); +} + +void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const { + + Graph& newGraph = *pNewGraph; + //TODO: Optimise case where no pruning required + + //For debug + + /* + map<const Edge*, string> edgeIds; + for (size_t i = 0; i < edges_.Size(); ++i) { + stringstream str; + for (size_t j = 0; j < edges_[i].Words().size(); ++j) { + if (edges_[i].Words()[j]) str << edges_[i].Words()[j]->first; + } + edgeIds[&(edges_[i])] = str.str(); + }*/ + + //end For debug + + map<const Edge*, FeatureStatsType> edgeBackwardScores; + map<const Edge*, size_t> edgeHeads; + vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore); + vector<vector<const Edge*> > outgoing(vertices_.Size()); + + //Compute backward scores + for (size_t vi = 0; vi < vertices_.Size(); ++vi) { + //cerr << "Vertex " << vi << endl; + const Vertex& vertex = vertices_[vi]; + const vector<const Edge*>& incoming = vertex.GetIncoming(); + if (!incoming.size()) { + vertexBackwardScores[vi] = 0; + } else { + for (size_t ei = 0; ei < incoming.size(); ++ei) { + //cerr << "Edge " << edgeIds[incoming[ei]] << endl; + edgeHeads[incoming[ei]]= vi; + FeatureStatsType incomingScore = incoming[ei]->GetScore(weights); + for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) { + // cerr << "\tChild " << incoming[ei]->Children()[i] << endl; + size_t childId = incoming[ei]->Children()[i]; + UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore, + HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); + outgoing[childId].push_back(incoming[ei]); + incomingScore += vertexBackwardScores[childId]; + } + edgeBackwardScores[incoming[ei]]= incomingScore; + // cerr << "Backward score: " << incomingScore << endl; + if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore; + } + } + } + + //Compute forward scores + vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore); + map<const Edge*, FeatureStatsType> edgeForwardScores; + for (size_t i = 1; i <= vertices_.Size(); ++i) { + size_t vi = vertices_.Size() - i; + //cerr << "Vertex " << vi << endl; + if (!outgoing[vi].size()) { + vertexForwardScores[vi] = 0; + } else { + for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) { + //cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl; + FeatureStatsType outgoingScore = 0; + //sum scores of siblings + for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) { + size_t siblingId = outgoing[vi][ei]->Children()[i]; + if (siblingId != vi) { + // cerr << "\tSibling " << siblingId << endl; + outgoingScore += vertexBackwardScores[siblingId]; + } + } + //add score of head + outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]]; + //cerr << "Forward score " << outgoingScore << endl; + edgeForwardScores[outgoing[vi][ei]] = outgoingScore; + outgoingScore += outgoing[vi][ei]->GetScore(weights); + if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore; + } + } + } + + + + multimap<FeatureStatsType, const Edge*> edgeScores; + for (size_t i = 0; i < edges_.Size(); ++i) { + const Edge* edge = &(edges_[i]); + if (edgeForwardScores.find(edge) == edgeForwardScores.end()) { + //This edge has no children, so didn't get a forward score. Its forward score + //is that of its head + edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]]; + } + FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge]; + edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge)); + // cerr << edgeIds[edge] << " " << score << endl; + } + + + + multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin(); + size_t edgeCount = 1; + while(edgeCount < minEdgeCount && ei != edgeScores.rend()) { + ++ei; + ++edgeCount; + } + multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin(); + if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first); + + //cerr << "Retained edges" << endl; + set<size_t> retainedVertices; + set<const Edge*> retainedEdges; + for (; lowest != edgeScores.end(); ++lowest) { + //cerr << lowest->first << " " << edgeIds[lowest->second] << endl; + retainedEdges.insert(lowest->second); + retainedVertices.insert(edgeHeads[lowest->second]); + for (size_t i = 0; i < lowest->second->Children().size(); ++i) { + retainedVertices.insert(lowest->second->Children()[i]); + } + } + newGraph.SetCounts(retainedVertices.size(), retainedEdges.size()); + + //cerr << "Retained vertices" << endl; + map<size_t,size_t> oldIdToNew; + size_t vi = 0; + for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) { + //cerr << *i << endl; + oldIdToNew[*i] = vi; + Vertex* vertex = newGraph.NewVertex(); + vertex->SetSourceCovered(vertices_[*i].SourceCovered()); + } + + for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) { + Edge* newEdge = newGraph.NewEdge(); + const Edge* oldEdge = *i; + for (size_t j = 0; j < oldEdge->Words().size(); ++j) { + newEdge->AddWord(oldEdge->Words()[j]); + } + for (size_t j = 0; j < oldEdge->Children().size(); ++j) { + newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]); + } + newEdge->SetFeatures(oldEdge->Features()); + Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]]; + newHead.AddEdge(newEdge); + } + + + + +} + +/** + * Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments) +**/ +void ReadGraph(util::FilePiece &from, Graph &graph) { + + //First line should contain field names + StringPiece line = from.ReadLine(); + UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'"); + line = NextLine(from); + + //Then expect numbers of vertices + util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' ')); + unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i); + ++i; + unsigned long int edges = boost::lexical_cast<unsigned long int>(*i); + graph.SetCounts(vertices, edges); + //cerr << "vertices: " << vertices << "; edges: " << edges << endl; + for (size_t i = 0; i < vertices; ++i) { + line = NextLine(from); + unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line); + Vertex* vertex = graph.NewVertex(); + for (unsigned long int e = 0; e < edge_count; ++e) { + pair<Edge*,size_t> edge = ReadEdge(from, graph); + vertex->AddEdge(edge.first); + //Note: the file format attaches this to the edge, but it's really a property + //of the vertex. + if (!e) {vertex->SetSourceCovered(edge.second);} + } + } +} + +}; |