Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Phil Williams <philip.williams@mac.com>, 2015-06-09 18:50:27 +0300
committer: Phil Williams <philip.williams@mac.com>, 2015-06-09 18:50:27 +0300
commit: fa51da28c5f21881b716026b69b07b0fd2e3a015 (patch)
tree: 9bcb6e6b3e38473a3d511948656942616ec3376c /phrase-extract
parent: b76194a16b3e2c070522751ff40762c3f8870bce (diff)
moses/phrase-extract refactoring
Final commit in this round of refactoring (which started with commit 2f735998...). The main changes are:
- a general storage mechanism for attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: <tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree>
- consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault)
- miscellaneous de-crufting
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/SyntaxNode.h | 3
-rw-r--r-- phrase-extract/SyntaxNodeCollection.h | 8
-rw-r--r-- phrase-extract/XmlTree.cpp | 27
-rw-r--r-- phrase-extract/syntax-common/xml_tree_parser.h | 2
4 files changed, 21 insertions, 19 deletions
diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h
index 49e2eb695..25a75b784 100644
--- a/phrase-extract/SyntaxNode.h
+++ b/phrase-extract/SyntaxNode.h
@@ -25,6 +25,9 @@
namespace MosesTraining
{
+/*! A node in a syntactic structure (tree, lattice, etc.). SyntaxNodes have a
+ * label and a span plus an arbitrary set of name/value attributes.
+ */
struct SyntaxNode {
typedef std::map<std::string, std::string> AttributeMap;
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
index 405a77c5f..da0e1eca3 100644
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -55,11 +55,13 @@ public:
return m_nodes;
};
- size_t GetNumWords() const {
- return m_numWords;
- }
+ //! Get the number of words (defined as 1 + the max end pos of any node).
+ std::size_t GetNumWords() const { return m_numWords; }
+
+ //! Clear the container (this deletes the SyntaxNodes).
void Clear();
+ //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
std::auto_ptr<SyntaxTree> ExtractTree();
private:
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
index d8b77b6e6..d88c78c0b 100644
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName)
return tag.substr(contentsStart,contentsEnd-contentsStart);
}
-// TODO Special handling of "label" attribute
// s should be a sequence of name=attribute pairs separated by whitespace.
// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
void ParseXmlTagAttributes(const std::string &s,
@@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s,
throw XmlException("invalid tag content");
}
}
- // TODO unescape \"
- attributes[name] = s.substr(begin+1, pos-begin-1);
+ if (name != "label" && name != "span") {
+ attributes[name] = s.substr(begin+1, pos-begin-1);
+ }
begin = pos+1;
}
}
@@ -245,20 +245,17 @@ vector<string> TokenizeXml(const string& str)
}
/**
- * Process a sentence with xml annotation
- * Xml tags may specifiy additional/replacing translation options
- * and reordering constraints
+ * Process a sentence with XML-style annotation of syntactic nodes.
*
- * \param line in: sentence, out: sentence without the xml
- * \param res vector with translation options specified by xml
- * \param reorderingConstraint reordering constraint zones specified by xml
- * \param walls reordering constraint walls specified by xml
+ * \param line[in,out] in: sentence, out: sentence without the XML
+ * \param nodeCollection[out] the collection of SyntaxNode objects for this
+ * sentence
+ * \param labelCollection[out] label values are inserted into this set
+ * \param topLabelCollection[out] top labels (key) and their counts (value)
+ * are inserted into this map
+ * \param unescapeSpecialChars flag indicating whether XML special characters
+ * should be unescaped
*/
-/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
- is so we can link things up afterwards. We can't create TranslationOptions as we
- parse because we don't have the completed source parsed until after this function
- removes all the markup from it (CreateFromString in Sentence::Read).
-*/
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
set< string > &labelCollection,
map< string, int > &topLabelCollection,
diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h
index 48ea056b8..04ad74e24 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.h
+++ b/phrase-extract/syntax-common/xml_tree_parser.h
@@ -16,7 +16,7 @@ namespace Syntax {
* converts them to SyntaxTree objects.
*
* This is a thin wrapper around the ProcessAndStripXMLTags function. After
- * calling Parse(), the output of the ProcessAndStripXMLTags function (the
+ * calling Parse(), the output from the ProcessAndStripXMLTags call (the
* sentence, node collection, label set, and top label set) are available via
* accessors.
*/