Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/moses
diff options
context:
space:
mode:
author: redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230> 2007-11-10 07:40:18 +0300
committer: redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230> 2007-11-10 07:40:18 +0300
commit: 180d9bac5d9f9884f06ad64a63b466760396bb74 (patch)
tree: f7989a3e7163785000380d252184d91fa0b877a6 /moses
parent: 3b008f67a9bc00bb27ce1e9692baa59298a6429e (diff)
add support for unary XML options. refer to regression-testing/tests/xml-markup/to-translate.txt for examples.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1508 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses')
-rw-r--r-- moses/src/Makefile.am | 3
-rwxr-xr-x moses/src/Sentence.cpp | 90
-rw-r--r-- moses/src/XmlOption.cpp | 277
-rw-r--r-- moses/src/XmlOption.h | 2
4 files changed, 186 insertions, 186 deletions
diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am
index 626cbe74f..efbac44ff 100644
--- a/moses/src/Makefile.am
+++ b/moses/src/Makefile.am
@@ -59,7 +59,8 @@ libmoses_a_SOURCES = \
Word.cpp \
WordsBitmap.cpp \
WordLattice.cpp \
- WordsRange.cpp
+ WordsRange.cpp \
+ XmlOption.cpp
if SRI_LM
diff --git a/moses/src/Sentence.cpp b/moses/src/Sentence.cpp
index 49e202278..a8a720ebc 100755
--- a/moses/src/Sentence.cpp
+++ b/moses/src/Sentence.cpp
@@ -41,93 +41,9 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
//parse XML markup in translation line
const StaticData &staticData = StaticData::Instance();
- if (staticData.GetXmlInputType() != XmlPassThrough && (line.find_first_of('<') != std::string::npos))
- {
- std::vector<string> xmlTokens = Tokenize(line,"<>");
- std::string tagName = "";
- std::string tagContents = "";
- std::vector<std::string> altTexts;
- std::vector<std::string> altProbs;
- size_t offset=0;
- size_t tagStart=0;
- if (xmlTokens.size()>1 && line.at(0) == '<') offset=1;
- for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
- {
- if(((xmlTokenPos+offset) % 2) == 0)
- {
- //phrase, not tag
- Phrase::CreateFromString(factorOrder,xmlTokens[xmlTokenPos],factorDelimiter);
- }
- else
- {
- //TODO: support UNARY tags
-
- //tag data
- std::string tag = Trim(xmlTokens[xmlTokenPos]);
- VERBOSE(3,"XML TAG IS: " << tag << endl);
- std::string::size_type endOfName = xmlTokens[xmlTokenPos].find_first_of(' ');
- std::string nextTagName = tag;
- if (endOfName != std::string::npos) {
- nextTagName = xmlTokens[xmlTokenPos].substr(0,endOfName);
- tagContents = xmlTokens[xmlTokenPos].substr(endOfName+1);
- }
- if ((xmlTokenPos-1+offset) % 4 == 0)
- {
- //this is an open tag
- tagName = nextTagName;
- altTexts = TokenizeMultiCharSeparator(Sentence::ParseXmlTagAttribute(tagContents,"english"), "||");
- altProbs = TokenizeMultiCharSeparator(Sentence::ParseXmlTagAttribute(tagContents,"prob"), "||");
- tagStart = Phrase::GetSize();
- VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
- VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
- VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
- VERBOSE(3,"XML TAG STARTS AT WORD: " << Phrase::GetSize() << endl);
- if (altTexts.size() != altProbs.size()) {
- TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
- return 0;
- }
- }
- else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
- {
- //mismatched tag, abort!
- TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
- return 0;
- }
- else
- {
- VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
- VERBOSE(3,"XML TAG ENDS AT WORD: " << Phrase::GetSize() << endl);
- //store translation options into members
- size_t tagEnd = Phrase::GetSize()-1; //size is inclusive
-
- //TODO: deal with multiple XML options here
-
- if (staticData.GetXmlInputType() != XmlIgnore) {
- for (size_t i=0; i<altTexts.size(); ++i) {
- //only store options if we aren't ignoring them
- //set default probability
- float probValue = 1;
- if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
- //Convert from prob to log-prob
- float scoreValue = FloorScore(TransformScore(probValue));
- XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
- m_xmlOptionsList.push_back(option);
- }
- }
- tagName= "";
- tagContents = "";
- altTexts.clear();
- altProbs.clear();
- }
-
- }
-
- }
- }
- else
- {
- Phrase::CreateFromString(factorOrder, line, factorDelimiter);
- }
+ if (staticData.GetXmlInputType() != XmlPassThrough)
+ m_xmlOptionsList = ProcessAndStripXMLTags(line);
+ Phrase::CreateFromString(factorOrder, line, factorDelimiter);
//only fill the vector if we are parsing XML
if (staticData.GetXmlInputType() != XmlPassThrough ) {
diff --git a/moses/src/XmlOption.cpp b/moses/src/XmlOption.cpp
index e3166d88d..4e7ff8159 100644
--- a/moses/src/XmlOption.cpp
+++ b/moses/src/XmlOption.cpp
@@ -21,106 +21,187 @@
***********************************************************************/
#include "XmlOption.h"
+#include <vector>
+#include <string>
+#include <iostream>
+#include "Util.h"
+#include "StaticData.h"
+namespace {
-std::vector<XmlOption> parseXMLOptions(const std::string& line) {
- //parse XML markup in translation line
- std::vector<XmlOption> res;
- if (line.find_first_of('<') == std::string::npos) { return res; }
- std::vector<string> xmlTokens = Tokenize(line,"<>");
- std::string tagName = "";
- std::string tagContents = "";
- std::vector<std::string> altTexts;
- std::vector<std::string> altProbs;
- size_t offset=0;
- size_t tagStart=0;
- if (xmlTokens.size()>1 && line.at(0) == '<') offset=1;
- for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
- {
- if(((xmlTokenPos+offset) % 2) == 0)
- {
- //phrase, not tag
- Phrase::CreateFromString(factorOrder,xmlTokens[xmlTokenPos],factorDelimiter);
- }
- else
- {
- //TODO: support UNARY tags
-
- //tag data
- std::string tag = Trim(xmlTokens[xmlTokenPos]);
- VERBOSE(3,"XML TAG IS: " << tag << endl);
- std::string::size_type endOfName = xmlTokens[xmlTokenPos].find_first_of(' ');
- std::string nextTagName = tag;
- if (endOfName != std::string::npos) {
- nextTagName = xmlTokens[xmlTokenPos].substr(0,endOfName);
- tagContents = xmlTokens[xmlTokenPos].substr(endOfName+1);
- }
- if ((xmlTokenPos-1+offset) % 4 == 0)
- {
- //this is an open tag
- tagName = nextTagName;
- altTexts = Tokenize(ParseXmlTagAttribute(tagContents,"english"), "|");
- altProbs = Tokenize(ParseXmlTagAttribute(tagContents,"prob"), "|");
- tagStart = Phrase::GetSize();
- VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
- VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
- VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
- VERBOSE(3,"XML TAG STARTS AT WORD: " << Phrase::GetSize() << endl);
- if (altTexts.size() != altProbs.size()) {
- TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
- return 0;
- }
- }
- else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
- {
- //mismatched tag, abort!
- TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
- return 0;
- }
- else
- {
- VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
- VERBOSE(3,"XML TAG ENDS AT WORD: " << Phrase::GetSize() << endl);
- //store translation options into members
- size_t tagEnd = Phrase::GetSize()-1; //size is inclusive
-
- //TODO: deal with multiple XML options here
-
- if (staticData.GetXmlInputType() != XmlIgnore) {
- for (size_t i=0; i<altTexts.size(); ++i) {
- //only store options if we aren't ignoring them
- //set default probability
- float probValue = 1;
- if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
- //Convert from prob to log-prob
- float scoreValue = FloorScore(TransformScore(probValue));
- XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
- m_xmlOptionsList.push_back(option);
- }
- }
- tagName= "";
- tagContents = "";
- altTexts.clear();
- altProbs.clear();
- }
- }
- }
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
+ /*TODO deal with unescaping \"*/
+ string tagOpen = attributeName + "=\"";
+ size_t contentsStart = tag.find(tagOpen);
+ if (contentsStart == std::string::npos) return "";
+ contentsStart += tagOpen.size();
+ size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
+ if (contentsEnd == std::string::npos) {
+ TRACE_ERR("Malformed XML attribute: "<< tag);
+ return "";
+ }
+ size_t possibleEnd;
+ while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
+ contentsEnd = possibleEnd;
+ }
+ return tag.substr(contentsStart,contentsEnd-contentsStart);
}
-std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
- /*TODO deal with unescaping \"*/
- string tagOpen = attributeName + "=\"";
- size_t contentsStart = tag.find(tagOpen);
- if (contentsStart == std::string::npos) return "";
- contentsStart += tagOpen.size();
- size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
- if (contentsEnd == std::string::npos) {
- TRACE_ERR("Malformed XML attribute: "<< tag);
- return "";
- }
- size_t possibleEnd;
- while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
- contentsEnd = possibleEnd;
- }
- return tag.substr(contentsStart,contentsEnd-contentsStart);
+std::string TrimXml(const std::string& str) {
+ if (str.size() < 2) return str;
+ if (str[0] == '<' && str[str.size() - 1] == '>') {
+ return str.substr(1, str.size() - 2);
+ } else { return str; }
+}
+
+bool isXmlTag(const std::string& tag)
+{
+ return tag[0] == '<';
+}
+
+inline std::vector<std::string> TokenizeXml(const std::string& str)
+{
+ std::string lbrack = "<";
+ std::string rbrack = ">";
+ std::vector<std::string> tokens;
+ // Find first "non-delimiter".
+ std::string::size_type cpos = 0;
+ std::string::size_type lpos = 0;
+ std::string::size_type rpos = 0;
+
+ while (cpos != str.size()) {
+ lpos = str.find_first_of(lbrack, cpos);
+ if (lpos != std::string::npos) {
+ rpos = str.find_first_of(rbrack, lpos);
+ if (rpos == std::string::npos) {
+ TRACE_ERR("ERROR: malformed XML: " << str << endl);
+ return tokens;
+ }
+ } else {
+ tokens.push_back(str.substr(cpos));
+ break;
+ }
+ if (lpos - cpos > 0)
+ tokens.push_back(str.substr(cpos, lpos - cpos));
+ tokens.push_back(str.substr(lpos, rpos-lpos+1));
+ cpos = rpos + 1;
+ }
+ return tokens;
}
+
+}
+
+std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
+ //parse XML markup in translation line
+ std::vector<XmlOption> res;
+ std::string rstr;
+ if (line.find_first_of('<') == std::string::npos) { return res; }
+ std::vector<std::string> xmlTokens = TokenizeXml(line);
+ std::string tagName = "";
+ std::string tagContents = "";
+ std::vector<std::string> altTexts;
+ std::vector<std::string> altProbs;
+ size_t tagStart=0;
+ size_t tagEnd=0;
+ size_t curWord=0;
+ int numUnary = 0;
+ bool doClose = false;
+ for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
+ {
+ if(!isXmlTag(xmlTokens[xmlTokenPos]))
+ {
+ //phrase, not tag
+ rstr += xmlTokens[xmlTokenPos];
+ curWord = Tokenize(rstr).size();
+ }
+ else
+ {
+ //tag data
+ std::string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
+ VERBOSE(3,"XML TAG IS: " << tag << std::endl);
+ std::string::size_type endOfName = tag.find_first_of(' ');
+ std::string nextTagName = tag;
+ bool isUnary = tag[tag.size() - 1] == '/';
+ bool isOpen = tag[1] != '/';
+ if (endOfName != std::string::npos) {
+ nextTagName = tag.substr(0,endOfName);
+ tagContents = tag.substr(endOfName+1);
+ }
+ if (isOpen)
+ {
+ //this is an open tag
+ tagName = nextTagName;
+ altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||");
+ altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||");
+ std::string span = ParseXmlTagAttribute(tagContents,"span");
+ tagStart = curWord;
+ if (isUnary) {
+ numUnary++;
+ if (span.empty()) {
+ TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl);
+ return res;
+ }
+ std::vector<std::string> ij = Tokenize(span, ",");
+ if (ij.size() != 2) {
+ TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl);
+ return res;
+ }
+ tagStart = atoi(ij[0].c_str());
+ tagEnd = atoi(ij[1].c_str());
+ if (tagEnd < tagStart) {
+ TRACE_ERR("ERROR: span tag " << span << " invalid" << endl);
+ return res;
+ }
+ doClose = true;
+ VERBOSE(3,"XML TAG IS UNARY" << endl);
+ }
+ VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
+ VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
+ VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
+ VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl);
+ if (altTexts.size() != altProbs.size()) {
+ TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
+ return res;
+ }
+ }
+ else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
+ {
+ //mismatched tag, abort!
+ TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
+ return res;
+ }
+ else {
+ doClose = true;
+ tagEnd = curWord-1; //size is inclusive
+ }
+ if (doClose) {
+ VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
+ VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl);
+ //store translation options into members
+
+ //TODO: deal with multiple XML options here
+
+ if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
+ for (size_t i=0; i<altTexts.size(); ++i) {
+ //only store options if we aren't ignoring them
+ //set default probability
+ float probValue = 1;
+ if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
+ //Convert from prob to log-prob
+ float scoreValue = FloorScore(TransformScore(probValue));
+ XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
+ res.push_back(option);
+ }
+ }
+ tagName= "";
+ tagContents = "";
+ altTexts.clear();
+ altProbs.clear();
+ doClose = false;
+ }
+ }
+ }
+ line = rstr;
+ return res;
+}
+
diff --git a/moses/src/XmlOption.h b/moses/src/XmlOption.h
index 07ef06250..e211a6398 100644
--- a/moses/src/XmlOption.h
+++ b/moses/src/XmlOption.h
@@ -18,3 +18,5 @@ struct XmlOption {
};
+std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line);
+