diff options
author | Phil Williams <philip.williams@mac.com> | 2015-06-02 17:23:41 +0300 |
---|---|---|
committer | Phil Williams <philip.williams@mac.com> | 2015-06-02 17:23:41 +0300 |
commit | 2f04d4a56ebab78a97b9fa9ecf4b50ef845a1bdb (patch) | |
tree | a5e60d8efce2aece1c850ccb40adb5021387d1c4 /phrase-extract/syntax-common | |
parent | 5ece895ab4d7fafe32d76cb2dd7bd7995cd06c7c (diff) |
Ongoing moses/phrase-extract refactoring
Diffstat (limited to 'phrase-extract/syntax-common')
-rw-r--r-- | phrase-extract/syntax-common/xml_tree_parser.cc | 69 | ||||
-rw-r--r-- | phrase-extract/syntax-common/xml_tree_parser.h | 28 |
2 files changed, 58 insertions, 39 deletions
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 2f8a904fa..bf3c6d87e 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -1,17 +1,27 @@ #include "xml_tree_parser.h" +#include <cassert> +#include <vector> + +#include "util/tokenize.hh" + +#include "SyntaxTree.h" #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" -#include "util/tokenize.hh" - -#include <cassert> -#include <vector> namespace MosesTraining { namespace Syntax { -StringTree *XmlTreeParser::Parse(const std::string &line) { +XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet, + std::map<std::string, int> &topLabelSet) + : label_set_(labelSet) + , top_label_set_(topLabelSet) +{ +} + +std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line) +{ line_ = line; node_collection_.Clear(); try { @@ -22,38 +32,37 @@ StringTree *XmlTreeParser::Parse(const std::string &line) { } catch (const XmlException &e) { throw Exception(e.getMsg()); } - node_collection_.ConnectNodes(); - SyntaxNode *root = node_collection_.GetTop(); - assert(root); + std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree(); words_ = util::tokenize(line_); - return ConvertTree(*root, words_); + AttachWords(words_, *root); + return root; } -// Converts a SyntaxNode tree to a StringTree. -StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree, - const std::vector<std::string> &words) { - StringTree *root = new StringTree(tree.GetLabel()); - const std::vector<SyntaxNode*> &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { +void XmlTreeParser::AttachWords(const std::vector<std::string> &words, + SyntaxTree &root) +{ + std::vector<SyntaxTree*> leaves; + leaves.reserve(words.size()); + for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { + leaves.push_back(&*p); + } + + std::vector<std::string>::const_iterator q = words.begin(); + for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end(); + ++p) { + SyntaxTree *leaf = *p; + const int start = leaf->value().GetStart(); + const int end = leaf->value().GetEnd(); + if (start != end) { std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; + msg << "leaf node covers multiple words (" << start << "-" << end + << "): this is currently unsupported"; throw Exception(msg.str()); } - StringTree *leaf = new StringTree(words[tree.GetStart()]); - leaf->parent() = root; - root->children().push_back(leaf); - } else { - for (std::vector<SyntaxNode*>::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - StringTree *child = ConvertTree(**p, words); - child->parent() = root; - root->children().push_back(child); - } + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + leaf->children().push_back(newLeaf); + newLeaf->parent() = leaf; } - return root; } } // namespace Syntax diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index c84ea25ec..e0b75c830 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -1,34 +1,44 @@ #pragma once #include <map> +#include <memory> #include <set> #include <string> #include <vector> #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" #include "exception.h" -#include "string_tree.h" namespace MosesTraining { namespace Syntax { -// Parses a string in Moses' XML parse tree format and returns a StringTree +// Parses a string in Moses' XML parse tree format and returns a SyntaxTree // object. This is a wrapper around the ProcessAndStripXMLTags function. class XmlTreeParser { public: - StringTree *Parse(const std::string &); + XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &); - private: - static StringTree *ConvertTree(const MosesTraining::SyntaxNode &, - const std::vector<std::string> &); + std::auto_ptr<SyntaxTree> Parse(const std::string &); + + const std::vector<std::string>& GetWords() { + return words_; + } + + const SyntaxNodeCollection &GetNodeCollection() const { + return node_collection_; + } - std::set<std::string> label_set_; - std::map<std::string, int> top_label_set_; + private: + std::set<std::string> &label_set_; + std::map<std::string, int> &top_label_set_; std::string line_; - MosesTraining::SyntaxNodeCollection node_collection_; + SyntaxNodeCollection node_collection_; std::vector<std::string> words_; + + void AttachWords(const std::vector<std::string> &, SyntaxTree &); }; } // namespace Syntax |