Add support for unary XML options. Refer to regression-testing/tests/xml-markup/to-translate.txt for examples.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1508 1f5c12ca-751b-0410-a591-d2e778427230
author: redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230> 2007-11-10 07:40:18 +0300
committer: redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230> 2007-11-10 07:40:18 +0300
commit: 180d9bac5d9f9884f06ad64a63b466760396bb74 (patch)
tree: f7989a3e7163785000380d252184d91fa0b877a6 /moses
parent: 3b008f67a9bc00bb27ce1e9692baa59298a6429e (diff)
4 files changed, 186 insertions, 186 deletions
diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am
index 626cbe74f..efbac44ff 100644
--- a/moses/src/Makefile.am
+++ b/moses/src/Makefile.am
@@ -59,7 +59,8 @@ libmoses_a_SOURCES = \
 	Word.cpp \
 	WordsBitmap.cpp \
 	WordLattice.cpp \
-	WordsRange.cpp
+	WordsRange.cpp \
+	XmlOption.cpp
 
 
 if SRI_LM
diff --git a/moses/src/Sentence.cpp b/moses/src/Sentence.cpp
index 49e202278..a8a720ebc 100755
--- a/moses/src/Sentence.cpp
+++ b/moses/src/Sentence.cpp
@@ -41,93 +41,9 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
 	
 	//parse XML markup in translation line
 	const StaticData &staticData = StaticData::Instance();
-	if (staticData.GetXmlInputType() != XmlPassThrough && (line.find_first_of('<') != std::string::npos))
-	{
-		std::vector<string> xmlTokens = Tokenize(line,"<>");
-		std::string tagName = "";
-		std::string tagContents = "";
-		std::vector<std::string> altTexts;
-		std::vector<std::string> altProbs;
-		size_t offset=0;
-		size_t tagStart=0;
-		if (xmlTokens.size()>1 && line.at(0) == '<') offset=1;
-		for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
-		{
-			if(((xmlTokenPos+offset) % 2) == 0)
-			{
-				//phrase, not tag
-				Phrase::CreateFromString(factorOrder,xmlTokens[xmlTokenPos],factorDelimiter);
-			}
-			else
-			{
-				//TODO: support UNARY tags
-
-				//tag data
-				std::string tag =  Trim(xmlTokens[xmlTokenPos]);
-				VERBOSE(3,"XML TAG IS: " << tag << endl);
-				std::string::size_type endOfName = xmlTokens[xmlTokenPos].find_first_of(' ');
-				std::string nextTagName = tag;
-				if (endOfName != std::string::npos) {
-					nextTagName = xmlTokens[xmlTokenPos].substr(0,endOfName);
-					tagContents = xmlTokens[xmlTokenPos].substr(endOfName+1);
-				}
-				if ((xmlTokenPos-1+offset) % 4 == 0)
-				{
-					//this is an open tag
-					tagName = nextTagName;
-					altTexts = TokenizeMultiCharSeparator(Sentence::ParseXmlTagAttribute(tagContents,"english"), "||");
-					altProbs = TokenizeMultiCharSeparator(Sentence::ParseXmlTagAttribute(tagContents,"prob"), "||");
-					tagStart =  Phrase::GetSize();
-					VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
-					VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
-					VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
-					VERBOSE(3,"XML TAG STARTS AT WORD: " << Phrase::GetSize() << endl);					
-					if (altTexts.size() != altProbs.size()) {
-					  TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
-						return 0;
-					}
-				}
-				else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName)) 
-				{
-					//mismatched tag, abort!
-					TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
-					return 0;
-				}
-				else 
-				{
-					VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
-					VERBOSE(3,"XML TAG ENDS AT WORD: " << Phrase::GetSize() << endl);
-					//store translation options into members
-					size_t tagEnd = Phrase::GetSize()-1; //size is inclusive
-					
-					//TODO: deal with multiple XML options here
-					
-					if (staticData.GetXmlInputType() != XmlIgnore) {
-						for (size_t i=0; i<altTexts.size(); ++i) {
-							//only store options if we aren't ignoring them
-							//set default probability
-							float probValue = 1;
-							if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
-							//Convert from prob to log-prob
-							float scoreValue = FloorScore(TransformScore(probValue));
-							XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
-							m_xmlOptionsList.push_back(option);
-						}
-					}
-					tagName= "";
-					tagContents = "";
-					altTexts.clear();
-					altProbs.clear();
-				}
-			
-			}
-		
-		}
-	}
-	else 
-	{
-		Phrase::CreateFromString(factorOrder, line, factorDelimiter);
-	}
+	if (staticData.GetXmlInputType() != XmlPassThrough)
+		m_xmlOptionsList = ProcessAndStripXMLTags(line);
+	Phrase::CreateFromString(factorOrder, line, factorDelimiter);
 	
 	//only fill the vector if we are parsing XML
 	if (staticData.GetXmlInputType() != XmlPassThrough ) {
diff --git a/moses/src/XmlOption.cpp b/moses/src/XmlOption.cpp
index e3166d88d..4e7ff8159 100644
--- a/moses/src/XmlOption.cpp
+++ b/moses/src/XmlOption.cpp
@@ -21,106 +21,187 @@
  ***********************************************************************/
 
 #include "XmlOption.h"
+#include <vector>
+#include <string>
+#include <iostream>
+#include "Util.h"
+#include "StaticData.h"
 
+namespace {
 
-std::vector<XmlOption> parseXMLOptions(const std::string& line) {
-  //parse XML markup in translation line
-  std::vector<XmlOption> res;
-  if (line.find_first_of('<') == std::string::npos) { return res; }
-  std::vector<string> xmlTokens = Tokenize(line,"<>");
-  std::string tagName = "";
-  std::string tagContents = "";
-  std::vector<std::string> altTexts;
-  std::vector<std::string> altProbs;
-  size_t offset=0;
-  size_t tagStart=0;
-  if (xmlTokens.size()>1 && line.at(0) == '<') offset=1;
-  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
-  {
-    if(((xmlTokenPos+offset) % 2) == 0)
-    {
-      //phrase, not tag
-      Phrase::CreateFromString(factorOrder,xmlTokens[xmlTokenPos],factorDelimiter);
-    }
-    else
-    {
-      //TODO: support UNARY tags
-
-      //tag data
-      std::string tag =  Trim(xmlTokens[xmlTokenPos]);
-      VERBOSE(3,"XML TAG IS: " << tag << endl);
-      std::string::size_type endOfName = xmlTokens[xmlTokenPos].find_first_of(' ');
-      std::string nextTagName = tag;
-      if (endOfName != std::string::npos) {
-        nextTagName = xmlTokens[xmlTokenPos].substr(0,endOfName);
-        tagContents = xmlTokens[xmlTokenPos].substr(endOfName+1);
-      }
-      if ((xmlTokenPos-1+offset) % 4 == 0)
-      {
-        //this is an open tag
-        tagName = nextTagName;
-        altTexts = Tokenize(ParseXmlTagAttribute(tagContents,"english"), "|");
-        altProbs = Tokenize(ParseXmlTagAttribute(tagContents,"prob"), "|");
-        tagStart =  Phrase::GetSize();
-        VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
-        VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
-        VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
-        VERBOSE(3,"XML TAG STARTS AT WORD: " << Phrase::GetSize() << endl);					
-        if (altTexts.size() != altProbs.size()) {
-          TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
-          return 0;
-        }
-      }
-      else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName)) 
-      {
-        //mismatched tag, abort!
-        TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
-        return 0;
-      }
-      else 
-      {
-        VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
-        VERBOSE(3,"XML TAG ENDS AT WORD: " << Phrase::GetSize() << endl);
-        //store translation options into members
-        size_t tagEnd = Phrase::GetSize()-1; //size is inclusive
-
-        //TODO: deal with multiple XML options here
-
-        if (staticData.GetXmlInputType() != XmlIgnore) {
-          for (size_t i=0; i<altTexts.size(); ++i) {
-            //only store options if we aren't ignoring them
-            //set default probability
-            float probValue = 1;
-            if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
-            //Convert from prob to log-prob
-            float scoreValue = FloorScore(TransformScore(probValue));
-            XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
-            m_xmlOptionsList.push_back(option);
-          }
-        }
-        tagName= "";
-        tagContents = "";
-        altTexts.clear();
-        altProbs.clear();
-      }
-    }
-  }
+// Returns the value of attributeName="..." inside tag, or "" when the attribute is absent or unterminated.
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
+	/*TODO deal with unescaping \" (escaped quotes are skipped over below but returned verbatim)*/
+	const std::string tagOpen = attributeName + "=\"";
+	size_t contentsStart = tag.find(tagOpen);
+	if (contentsStart == std::string::npos) return "";
+	contentsStart += tagOpen.size();
+	size_t contentsEnd = tag.find_first_of('"',contentsStart); // search from the first value character so attr="" yields ""
+	if (contentsEnd == std::string::npos) {
+		TRACE_ERR("Malformed XML attribute: "<< tag << std::endl);
+		return "";
+	}
+	size_t possibleEnd;
+	while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos)
+		contentsEnd = possibleEnd;
+	return tag.substr(contentsStart,contentsEnd-contentsStart);
 }
 
-std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
-  /*TODO deal with unescaping \"*/
-  string tagOpen = attributeName + "=\"";
-  size_t contentsStart = tag.find(tagOpen);
-  if (contentsStart == std::string::npos) return "";
-  contentsStart += tagOpen.size();
-  size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
-  if (contentsEnd == std::string::npos) {
-    TRACE_ERR("Malformed XML attribute: "<< tag);
-    return "";	
-  }
-  size_t possibleEnd;
-  while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
-    contentsEnd = possibleEnd;
-  }
-  return tag.substr(contentsStart,contentsEnd-contentsStart);
+// Removes one enclosing "<...>" pair from str; any other input is returned unchanged.
+std::string TrimXml(const std::string& str) {
+	const std::string::size_type len = str.size();
+	const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
+	return bracketed ? str.substr(1, len - 2) : str;
+}
+
+// A token counts as a tag when its first character is '<'.
+bool isXmlTag(const std::string& tag) {
+	return '<' == tag[0];
+}
+
+// Splits str into text chunks and complete "<...>" tags, kept in input order.
+// When a '<' has no following '>', logs an error and returns the tokens collected so far.
+inline std::vector<std::string> TokenizeXml(const std::string& str)
+{
+	typedef std::string::size_type Pos;
+	std::vector<std::string> pieces;
+	Pos cursor = 0;
+	while (cursor != str.size()) {
+		const Pos open = str.find('<', cursor);
+		if (open == std::string::npos) {
+			// no further markup: the remainder is a single text chunk
+			pieces.push_back(str.substr(cursor));
+			break;
+		}
+		const Pos close = str.find('>', open);
+		if (close == std::string::npos) {
+			TRACE_ERR("ERROR: malformed XML: " << str << endl);
+			return pieces;
+		}
+		if (open != cursor) {
+			// text lying between the previous tag (or the start) and this tag
+			pieces.push_back(str.substr(cursor, open - cursor));
+		}
+		// the tag itself, angle brackets included
+		pieces.push_back(str.substr(open, close - open + 1));
+		cursor = close + 1;
+	}
+	return pieces;
 }
+
+}
+
+// Extracts XML translation options from one input line and strips the markup from it.
+//   line   - in/out: replaced by its text with every <...> tag removed; left untouched when
+//            it contains no '<' or when a malformed tag makes this function give up.
+//   return - one XmlOption per "||"-separated english/prob alternative of each completed tag
+//            (none when the XML input type is XmlIgnore); after an error, those found so far.
+// Paired tags span the words between <n ...> and </n>; unary tags (<n ... span="i,j"/>)
+// take their word range from the span attribute instead.
+std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
+	std::vector<XmlOption> res;
+	std::string rstr; //the line rebuilt without markup
+	if (line.find_first_of('<') == std::string::npos) { return res; }
+	std::vector<std::string> xmlTokens = TokenizeXml(line);
+	std::string tagName = "";
+	std::string tagContents = "";
+	std::vector<std::string> altTexts;
+	std::vector<std::string> altProbs;
+	size_t tagStart=0;
+	size_t tagEnd=0;
+	size_t curWord=0; //words in the stripped text so far
+	bool doClose = false;
+	for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
+		if(!isXmlTag(xmlTokens[xmlTokenPos])) {
+			//phrase, not tag
+			rstr += xmlTokens[xmlTokenPos];
+			curWord = Tokenize(rstr).size();
+		} else {
+			//tag data: the token without its angle brackets
+			std::string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
+			VERBOSE(3,"XML TAG IS: " << tag << std::endl);
+			if (tag.empty()) {
+				//"<>" names nothing; rejecting it also keeps the tag[...] reads below in bounds
+				TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
+				return res;
+			}
+			std::string::size_type endOfName = tag.find_first_of(' ');
+			std::string nextTagName = tag;
+			bool isUnary = tag[tag.size() - 1] == '/';
+			//the brackets are already stripped, so a closing tag's '/' is the first character
+			bool isOpen = tag[0] != '/';
+			if (endOfName != std::string::npos) {
+				nextTagName = tag.substr(0,endOfName);
+				tagContents = tag.substr(endOfName+1);
+			}
+			if (isOpen) {
+				//this is an open tag
+				tagName = nextTagName;
+				altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||");
+				altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||");
+				std::string span = ParseXmlTagAttribute(tagContents,"span");
+				tagStart = curWord;
+				if (isUnary) {
+					if (span.empty()) {
+						TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl);
+						return res;
+					}
+					std::vector<std::string> ij = Tokenize(span, ",");
+					if (ij.size() != 2) {
+						TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl);
+						return res;
+					}
+					tagStart = atoi(ij[0].c_str());
+					tagEnd = atoi(ij[1].c_str());
+					if (tagEnd < tagStart) {
+						TRACE_ERR("ERROR: span tag " << span << " invalid" << endl);
+						return res;
+					}
+					doClose = true; //a unary tag is complete as soon as it has been read
+					VERBOSE(3,"XML TAG IS UNARY" << endl);
+				}
+				VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
+				VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
+				VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
+				VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl);
+				if (altTexts.size() != altProbs.size()) {
+					TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
+					return res;
+				}
+			} else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName)) {
+				//mismatched tag, abort!
+				TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
+				return res;
+			} else {
+				doClose = true;
+				tagEnd = curWord-1; //last word before the closing tag; the range is inclusive
+			}
+			if (doClose) {
+				VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
+				VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl);
+				//TODO: deal with multiple XML options here
+				if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
+					for (size_t i=0; i<altTexts.size(); ++i) {
+						//only store options if we aren't ignoring them
+						//set default probability
+						float probValue = 1;
+						if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
+						//Convert from prob to log-prob
+						float scoreValue = FloorScore(TransformScore(probValue));
+						XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
+						res.push_back(option);
+					}
+				}
+				//reset the per-tag state before the next tag is read
+				tagName= "";
+				tagContents = "";
+				altTexts.clear();
+				altProbs.clear();
+				doClose = false;
+			}
+		}
+	}
+	line = rstr;
+	return res;
+}
+
diff --git a/moses/src/XmlOption.h b/moses/src/XmlOption.h
index 07ef06250..e211a6398 100644
--- a/moses/src/XmlOption.h
+++ b/moses/src/XmlOption.h
@@ -18,3 +18,5 @@ struct XmlOption {
 
 };
 
+std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line);
+
author	redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230>	2007-11-10 07:40:18 +0300
committer	redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230>	2007-11-10 07:40:18 +0300
commit	180d9bac5d9f9884f06ad64a63b466760396bb74 (patch)
tree	f7989a3e7163785000380d252184d91fa0b877a6 /moses
parent	3b008f67a9bc00bb27ce1e9692baa59298a6429e (diff)