diff options
author | Phil Williams <philip.williams@mac.com> | 2015-06-09 18:50:27 +0300 |
---|---|---|
committer | Phil Williams <philip.williams@mac.com> | 2015-06-09 18:50:27 +0300 |
commit | fa51da28c5f21881b716026b69b07b0fd2e3a015 (patch) | |
tree | 9bcb6e6b3e38473a3d511948656942616ec3376c /phrase-extract | |
parent | b76194a16b3e2c070522751ff40762c3f8870bce (diff) |
moses/phrase-extract refactoring
Final commit in this round of refactoring (which started with commit
2f735998...). The main changes are:
- a general storage mechanism for attribute/value pairs in XML-style
tree / lattice input, e.g. the "pcfg-score" and "semantic-role"
attributes in:
`<tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree>`
- consolidation of the various near-duplicate Tree / XmlTreeParser classes
that have accumulated over the years (my fault)
- miscellaneous de-crufting
Diffstat (limited to 'phrase-extract')
-rw-r--r-- | phrase-extract/SyntaxNode.h | 3 |
-rw-r--r-- | phrase-extract/SyntaxNodeCollection.h | 8 |
-rw-r--r-- | phrase-extract/XmlTree.cpp | 27 |
-rw-r--r-- | phrase-extract/syntax-common/xml_tree_parser.h | 2 |
4 files changed, 21 insertions, 19 deletions
diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 49e2eb695..25a75b784 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -25,6 +25,9 @@ namespace MosesTraining { +/*! A node in a syntactic structure (tree, lattice, etc.). SyntaxNodes have a + * label and a span plus an arbitrary set of name/value attributes. + */ struct SyntaxNode { typedef std::map<std::string, std::string> AttributeMap; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 405a77c5f..da0e1eca3 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -55,11 +55,13 @@ public: return m_nodes; }; - size_t GetNumWords() const { - return m_numWords; - } + //! Get the number of words (defined as 1 + the max end pos of any node). + std::size_t GetNumWords() const { return m_numWords; } + + //! Clear the container (this deletes the SyntaxNodes). void Clear(); + //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree). std::auto_ptr<SyntaxTree> ExtractTree(); private: diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d8b77b6e6..d88c78c0b 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName) return tag.substr(contentsStart,contentsEnd-contentsStart); } -// TODO Special handling of "label" attribute // s should be a sequence of name=attribute pairs separated by whitespace. // e.g. 
"label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\"" void ParseXmlTagAttributes(const std::string &s, @@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s, throw XmlException("invalid tag content"); } } - // TODO unescape \" - attributes[name] = s.substr(begin+1, pos-begin-1); + if (name != "label" && name != "span") { + attributes[name] = s.substr(begin+1, pos-begin-1); + } begin = pos+1; } } @@ -245,20 +245,17 @@ vector<string> TokenizeXml(const string& str) } /** - * Process a sentence with xml annotation - * Xml tags may specifiy additional/replacing translation options - * and reordering constraints + * Process a sentence with XML-style annotation of syntactic nodes. * - * \param line in: sentence, out: sentence without the xml - * \param res vector with translation options specified by xml - * \param reorderingConstraint reordering constraint zones specified by xml - * \param walls reordering constraint walls specified by xml + * \param line[in,out] in: sentence, out: sentence without the XML + * \param nodeCollection[out] the collection of SyntaxNode objects for this + * sentence + * \param labelCollection[out] label values are inserted into this set + * \param topLabelCollection[out] top labels (key) and their counts (value) + * are inserted into this map + * \param unescapeSpecialChars flag indicating whether XML special characters + * should be unescaped */ -/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector - is so we can link things up afterwards. We can't create TranslationOptions as we - parse because we don't have the completed source parsed until after this function - removes all the markup from it (CreateFromString in Sentence::Read). 
-*/ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, set< string > &labelCollection, map< string, int > &topLabelCollection, diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index 48ea056b8..04ad74e24 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -16,7 +16,7 @@ namespace Syntax { * converts them to SyntaxTree objects. * * This is a thin wrapper around the ProcessAndStripXMLTags function. After - * calling Parse(), the output of the ProcessAndStripXMLTags function (the + * calling Parse(), the output from the ProcessAndStripXMLTags call (the * sentence, node collection, label set, and top label set) are available via * accessors. */ |