1 files changed, 344 insertions, 0 deletions
diff --git a/scripts/training/filter-pt.2/XmlTree.cpp b/scripts/training/filter-pt.2/XmlTree.cpp
new file mode 100644
index 000000000..4f300d57c
--- /dev/null
+++ b/scripts/training/filter-pt.2/XmlTree.cpp
@@ -0,0 +1,344 @@
+// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2006 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#include <vector>
+#include <string>
+#include <set>
+#include <iostream>
+#include <stdlib.h>
+#include "XmlTree.h"
+
+using namespace std;
+
+
+inline std::vector<std::string> Tokenize(const std::string& str,
+                                         const std::string& delimiters = " \t")
+{
+	// Splits str at any character of delimiters. Runs of delimiters count as
+	// one separator, so no empty tokens are ever produced.
+	typedef std::string::size_type Pos;
+	std::vector<std::string> pieces;
+
+	// tokenBegin is the first character of the next token (npos: none left)
+	Pos tokenBegin = str.find_first_not_of(delimiters, 0);
+	while (tokenBegin != std::string::npos)
+	{
+		// tokenEnd is the delimiter behind the token (npos: token runs to the end)
+		const Pos tokenEnd = str.find_first_of(delimiters, tokenBegin);
+		pieces.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
+		// move on to the start of the following token
+		tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
+	}
+
+	return pieces;
+}
+
+const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
+{
+	// str minus leading/trailing dropChars; empty if str holds nothing else
+	const std::string::size_type first = str.find_first_not_of(dropChars);
+	return first == std::string::npos ? std::string() : str.substr(first, str.find_last_not_of(dropChars) - first + 1);
+}
+
+// Returns the value of attributeName="..." in tag; "" if the attribute is absent
+// or unterminated. Escaped quotes (\") stay in the value as-is (TODO: unescape).
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
+	const std::string tagOpen = attributeName + "=\"";
+	std::string::size_type contentsStart = tag.find(tagOpen);
+	if (contentsStart == std::string::npos) return "";
+	contentsStart += tagOpen.size();
+	// start at contentsStart itself, so an empty value (attr="") is closed right here
+	std::string::size_type contentsEnd = tag.find('"', contentsStart);
+	if (contentsEnd == std::string::npos) {
+		std::cerr << "Malformed XML attribute: " << tag << std::endl;
+		return "";
+	}
+	std::string::size_type possibleEnd; // a quote right after a backslash is escaped: it does not end the value
+	while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find('"', contentsEnd+1)) != std::string::npos) contentsEnd = possibleEnd;
+	return tag.substr(contentsStart, contentsEnd - contentsStart);
+}
+
+/**
+ * Strip the enclosing angle brackets from an XML token.
+ *
+ * \param str token to strip, e.g. "<b>" or "</b>"
+ * \return the text between the leading "<" and the trailing ">"; str itself,
+ *         unchanged, when it is not wrapped in both brackets
+ */
+std::string TrimXml(const std::string& str)
+{
+	const std::string::size_type len = str.size();
+
+	// only a token of the form "<...>" (at least two characters) gets stripped
+	const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
+	if (!bracketed)
+		return str;
+
+	return str.substr(1, len - 2);
+}
+
+/**
+ * Check if the token is an XML tag rather than plain text.
+ * \param tag token to be checked
+ * \return true iff tag starts with "<" (false for an empty token)
+ */
+bool isXmlTag(const std::string& tag)
+{
+	return !tag.empty() && tag[0] == '<';
+}
+
+/**
+ * Split up the input character string into tokens made up of
+ * either one complete XML tag ("<" up to the next ">") or the text
+ * between two tags. Whitespace inside text tokens is kept.
+ * example: this <b> is a </b> test .
+ *       => (this ), (<b>), ( is a ), (</b>), ( test .)
+ *
+ * If a "<" is never closed by a ">", an error is written to cerr and only
+ * the tokens collected so far are returned; the text directly in front of
+ * the unclosed "<" is not among them.
+ *
+ * \param str input string
+ * \return the tokens in order of appearance
+ */
+inline std::vector<std::string> TokenizeXml(const std::string& str)
+{
+	typedef std::string::size_type Pos;
+	std::vector<std::string> pieces; // vector of tokens to be returned
+
+	// 'from' is the first character that has not been tokenized yet
+	for (Pos from = 0; from != str.size(); )
+	{
+		const Pos tagBegin = str.find('<', from);
+
+		// no tag ahead: everything that is left is one text token
+		if (tagBegin == std::string::npos)
+		{
+			pieces.push_back(str.substr(from));
+			break;
+		}
+
+		// sanity check: the tag has to be terminated by a ">"
+		const Pos tagEnd = str.find('>', tagBegin);
+		if (tagEnd == std::string::npos)
+		{
+			std::cerr << "ERROR: malformed XML: " << str << std::endl;
+			return pieces;
+		}
+
+		// text in front of the tag, if there is any, is a token of its own
+		if (tagBegin != from)
+			pieces.push_back(str.substr(from, tagBegin - from));
+
+		// the tag itself, brackets included
+		pieces.push_back(str.substr(tagBegin, tagEnd - tagBegin + 1));
+		from = tagEnd + 1;
+	}
+
+	return pieces;
+}
+
+/**
+ * Process a sentence with xml annotation.
+ * Every completed xml tag adds one node to the syntax tree; the markup
+ * itself is removed from the sentence.
+ *
+ * A node covers the words between its opening and its closing tag, unless
+ * the opening tag has a span="i-j" (or span="i") attribute, which overrides
+ * these positions. The node label is the value of the label="..." attribute.
+ *
+ * \param line in: sentence, out: sentence without the xml
+ *             (only written when the function succeeds)
+ * \param tree receives a node (first word, last word, label) for each tag
+ * \param labelCollection receives the label of each tag
+ * \param topLabelCollection incremented, per label, for every node that
+ *        tree.GetNodes() returns for the span from word 0 to the last word
+ * \return true on success, also when line contains no "<" at all (nothing is
+ *         changed then); false on broken markup, after an error was written
+ *         to cerr. On failure tree and labelCollection may already contain
+ *         the tags processed before the error.
+ */
+bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection ) {
+	// no xml tag? we're done.
+	if (line.find('<') == std::string::npos) { return true; }
+
+	// break up input into a vector of xml tags and text
+	// example: (this), (<b>), (is a), (</b>), (test .)
+	const std::vector<std::string> xmlTokens = TokenizeXml(line);
+
+	// TokenizeXml stops at a "<" that has no ">" (it reports the error itself)
+	// and returns only the tokens in front of it. The tokens of a well-formed
+	// line cover every character, so a shortfall means the line was cut off.
+	std::string::size_type coveredChars = 0;
+	for (size_t i = 0; i < xmlTokens.size(); i++)
+		coveredChars += xmlTokens[i].size();
+	if (coveredChars != line.size())
+		return false;
+
+	// we need to store opened tags, until they are closed
+	// tags are stored as triples (tagname, startpos, contents)
+	typedef std::pair< std::string, std::pair< size_t, std::string > > OpenedTag;
+	std::vector< OpenedTag > tagStack; // stack that contains active opened tags
+
+	std::string cleanLine; // return string (text without xml)
+	size_t wordPos = 0; // position in sentence (in terms of number of words)
+
+	// loop through the tokens
+	for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
+	{
+		const std::string &token = xmlTokens[xmlTokenPos];
+
+		// not a xml tag, but regular text (may contain many words)
+		if (!isXmlTag(token))
+		{
+			// add a space at boundary, if necessary
+			if (cleanLine.size()>0 &&
+			    cleanLine[cleanLine.size() - 1] != ' ' &&
+			    token[0] != ' ')
+			{
+				cleanLine += " ";
+			}
+			cleanLine += token; // add to output
+			// count the words of the new text only: the boundary handling above
+			// guarantees a space between two pieces of text, so no word spans
+			// both and the sum equals the word count of the whole cleanLine
+			wordPos += Tokenize(token).size();
+			continue;
+		}
+
+		// *** get essential information about tag ***
+		// strip extra boundary spaces and "<" and ">"
+		std::string tag = Trim(TrimXml(token));
+		if (tag.size() == 0)
+		{
+			std::cerr << "ERROR: empty tag name: " << line << std::endl;
+			return false;
+		}
+
+		// check if unary (e.g., "<wall/>")
+		const bool isUnary = ( tag[tag.size() - 1] == '/' );
+
+		// check if closing tag (e.g. "</a>", not "<a>")
+		const bool isClosed = ( tag[0] == '/' );
+		const bool isOpen = !isClosed;
+
+		if (isClosed && isUnary)
+		{
+			std::cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << std::endl;
+			return false;
+		}
+
+		if (isClosed)
+			tag = tag.substr(1); // remove "/" at the beginning
+		if (isUnary)
+			tag = tag.substr(0,tag.size()-1); // remove "/" at the end
+
+		// find the tag name and contents
+		const std::string::size_type endOfName = tag.find(' ');
+		std::string tagName = tag;
+		std::string tagContent = "";
+		if (endOfName != std::string::npos) {
+			tagName = tag.substr(0,endOfName);
+			tagContent = tag.substr(endOfName+1);
+		}
+
+		// *** process new tag ***
+		if (isOpen || isUnary)
+		{
+			// put the tag on the tag stack
+			tagStack.push_back( std::make_pair( tagName, std::make_pair( wordPos, tagContent ) ) );
+		}
+
+		// *** process completed tag ***
+		if (isClosed || isUnary)
+		{
+			// pop last opened tag from stack;
+			if (tagStack.size() == 0)
+			{
+				std::cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << std::endl;
+				return false;
+			}
+			const OpenedTag openedTag = tagStack.back();
+			tagStack.pop_back();
+
+			// tag names have to match
+			if (openedTag.first != tagName)
+			{
+				std::cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << std::endl;
+				return false;
+			}
+
+			// assemble remaining information about tag; the attributes are
+			// those of the opening tag, not of the closing one
+			size_t startPos = openedTag.second.first;
+			const std::string &openedContent = openedTag.second.second;
+			size_t endPos = wordPos;
+
+			// span attribute overwrites position
+			const std::string span = ParseXmlTagAttribute(openedContent,"span");
+			if (! span.empty())
+			{
+				const std::vector<std::string> ij = Tokenize(span, "-");
+				if (ij.size() != 1 && ij.size() != 2) {
+					std::cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << std::endl;
+					return false;
+				}
+				startPos = atoi(ij[0].c_str());
+				if (ij.size() == 1) endPos = startPos + 1;
+				else endPos = atoi(ij[1].c_str()) + 1;
+			}
+
+			if (startPos >= endPos)
+			{
+				std::cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << std::endl;
+				return false;
+			}
+
+			const std::string label = ParseXmlTagAttribute(openedContent,"label");
+			labelCollection.insert( label );
+			tree.AddNode( startPos, endPos-1, label );
+		}
+	}
+
+	// we are done. check if there are tags that are still open
+	if (tagStack.size() > 0)
+	{
+		std::cerr << "ERROR: some opened tags were never closed: " << line << std::endl;
+		return false;
+	}
+
+	// collect top labels
+	// (skipped for a sentence without words: wordPos-1 would wrap around)
+	if (wordPos > 0)
+	{
+		const std::vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 );
+		for( std::vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ )
+		{
+			const std::string &label = (*node)->GetLabel();
+			topLabelCollection[ label ]++; // operator[] starts a new label at 0
+		}
+	}
+
+	// return de-xml'ed sentence in line
+	line = cleanLine;
+	return true;
+}