From afbbfaacd483981e314de6e53bb8c512761faf42 Mon Sep 17 00:00:00 2001 From: Arianna Bisazza Date: Wed, 16 Nov 2011 13:38:22 +0100 Subject: xml-like markup symbols are now configurable --- moses/src/Parameter.cpp | 1 + moses/src/Sentence.cpp | 2 +- moses/src/StaticData.cpp | 15 ++++++++++++++ moses/src/StaticData.h | 5 +++++ moses/src/XmlOption.cpp | 54 ++++++++++++++++++++++++++++-------------------- 5 files changed, 54 insertions(+), 23 deletions(-) (limited to 'moses') diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp index 345ad4651..c850b17cb 100644 --- a/moses/src/Parameter.cpp +++ b/moses/src/Parameter.cpp @@ -102,6 +102,7 @@ Parameter::Parameter() AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'"); + AddParam("xml-brackets", "xb", "specify XML tags opening and closing bracket strings. Default is \"< >\". Square brackets (\"[ ]\") should be avoided because of configuration file format. Valid only with text input mode" ); AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation"); AddParam("lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation"); AddParam("consensus-decoding", "con", "use consensus decoding (De Nero et. al. 2009)"); diff --git a/moses/src/Sentence.cpp b/moses/src/Sentence.cpp index 7e1d611a3..1b53aedf7 100644 --- a/moses/src/Sentence.cpp +++ b/moses/src/Sentence.cpp @@ -99,7 +99,7 @@ int Sentence::Read(std::istream& in,const std::vector& factorOrder) std::vector xmlOptionsList(0); std::vector< size_t > xmlWalls; if (staticData.GetXmlInputType() != XmlPassThrough) { - if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls )) { + if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls, staticData.GetXmlBrackets().first, staticData.GetXmlBrackets().second)) { const string msg("Unable to parse XML in line: " + line); TRACE_ERR(msg << endl); throw runtime_error(msg); diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index 5c9d279fa..a0de0061d 100644 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -88,6 +88,9 @@ StaticData::StaticData() m_maxFactorIdx[0] = 0; // source side m_maxFactorIdx[1] = 0; // target side + m_xmlBrackets.first="<"; + m_xmlBrackets.second=">"; + // memory pools Phrase::InitializeMemPool(); } @@ -478,6 +481,18 @@ bool StaticData::LoadData(Parameter *parameter) return false; } + // specify XML tags opening and closing brackets for XML option + if (m_parameter->GetParam("xml-brackets").size() > 0) { + std::vector brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]); + if(brackets.size()!=2) { + cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl; + exit(1); + } + m_xmlBrackets.first= brackets[0]; + m_xmlBrackets.second=brackets[1]; + cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl; + } + #ifdef HAVE_SYNLM if (m_parameter->GetParam("slmodel-file").size() > 0) { if (!LoadSyntacticLanguageModel()) return false; diff --git a/moses/src/StaticData.h b/moses/src/StaticData.h index b1d4a90a6..949426b98 100644 --- a/moses/src/StaticData.h +++ b/moses/src/StaticData.h @@ -158,6 +158,7 @@ protected: size_t m_maxNumFactors; //! max number of factors on both source and target sides XmlInputType m_xmlInputType; //! method for handling sentence XML input + std::pair m_xmlBrackets; //! strings to use as XML tags' opening and closing brackets. Default are "<" and ">" bool m_mbr; //! use MBR decoder bool m_useLatticeMBR; //! use MBR decoder @@ -560,6 +561,10 @@ public: return m_xmlInputType; } + std::pair GetXmlBrackets() const { + return m_xmlBrackets; + } + bool GetUseTransOptCache() const { return m_useTransOptCache; } diff --git a/moses/src/XmlOption.cpp b/moses/src/XmlOption.cpp index b6afb6b4b..ad56dd353 100644 --- a/moses/src/XmlOption.cpp +++ b/moses/src/XmlOption.cpp @@ -56,15 +56,17 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName) * Remove "<" and ">" from XML tag * * \param str xml token to be stripped + * \param lbrackStr xml tag's left bracket string, typically "<" + * \param rbrackStr xml tag's right bracket string, typically ">" */ -string TrimXml(const string& str) +string TrimXml(const string& str, const std::string& lbrackStr, const std::string& rbrackStr) { // too short to be xml token -> do nothing - if (str.size() < 2) return str; + if (str.size() < lbrackStr.length()+rbrackStr.length() ) return str; // strip first and last character - if (str[0] == '<' && str[str.size() - 1] == '>') { - return str.substr(1, str.size() - 2); + if (str.substr(0,lbrackStr.length()) == lbrackStr && str.substr(str.size()-rbrackStr.length()) == rbrackStr) { + return str.substr(lbrackStr.length(), str.size()-lbrackStr.length()-rbrackStr.length()); } // not an xml token -> do nothing else { @@ -76,13 +78,15 @@ string TrimXml(const string& str) * Check if the token is an XML tag, i.e. starts with "<" * * \param tag token to be checked + * \param lbrackStr xml tag's left bracket string, typically "<" + * \param rbrackStr xml tag's right bracket string, typically ">" */ -bool isXmlTag(const string& tag) +bool isXmlTag(const string& tag, const std::string& lbrackStr, const std::string& rbrackStr) { - return (tag[0] == '<' && - (tag[1] == '/' || - (tag[1] >= 'a' && tag[1] <= 'z') || - (tag[1] >= 'A' && tag[1] <= 'Z'))); + return (tag.substr(0,lbrackStr.length()) == lbrackStr && + (tag[lbrackStr.length()] == '/' || + (tag[lbrackStr.length()] >= 'a' && tag[lbrackStr.length()] <= 'z') || + (tag[lbrackStr.length()] >= 'A' && tag[lbrackStr.length()] <= 'Z'))); } /** @@ -92,11 +96,13 @@ bool isXmlTag(const string& tag) * => (this ), (), ( is a ), (), ( test .) * * \param str input string + * \param lbrackStr xml tag's left bracket string, typically "<" + * \param rbrackStr xml tag's right bracket string, typically ">" */ -vector TokenizeXml(const string& str) +vector TokenizeXml(const string& str, const std::string& lbrackStr, const std::string& rbrackStr) { - string lbrack = "<"; - string rbrack = ">"; + string lbrack = lbrackStr; // = "<"; + string rbrack = rbrackStr; // = ">"; vector tokens; // vector of tokens to be returned string::size_type cpos = 0; // current position in string string::size_type lpos = 0; // left start of xml tag @@ -105,10 +111,10 @@ vector TokenizeXml(const string& str) // walk thorugh the string (loop vver cpos) while (cpos != str.size()) { // find the next opening "<" of an xml tag - lpos = str.find_first_of(lbrack, cpos); + lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos); if (lpos != string::npos) { // find the end of the xml tag - rpos = str.find_first_of(rbrack, lpos); + rpos = str.find(rbrack, lpos+lbrackStr.length()-1); // rpos = str.find_first_of(rbrack, lpos); // sanity check: there has to be closing ">" if (rpos == string::npos) { TRACE_ERR("ERROR: malformed XML: " << str << endl); @@ -125,8 +131,8 @@ vector TokenizeXml(const string& str) tokens.push_back(str.substr(cpos, lpos - cpos)); // add xml tag as token - tokens.push_back(str.substr(lpos, rpos-lpos+1)); - cpos = rpos + 1; + tokens.push_back(str.substr(lpos, rpos-lpos+rbrackStr.length())); + cpos = rpos + rbrackStr.length(); } return tokens; } @@ -140,19 +146,23 @@ vector TokenizeXml(const string& str) * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml + * \param lbrackStr xml tag's left bracket string, typically "<" + * \param rbrackStr xml tag's right bracket string, typically ">" */ -bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls ) +bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls, + const std::string& lbrackStr, const std::string& rbrackStr) { //parse XML markup in translation line // no xml tag? we're done. - if (line.find_first_of('<') == string::npos) { +//if (line.find_first_of('<') == string::npos) { + if (line.find(lbrackStr) == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (), (is a), (), (test .) - vector xmlTokens = TokenizeXml(line); + vector xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) @@ -168,7 +178,7 @@ bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingCon // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) - if(!isXmlTag(xmlTokens[xmlTokenPos])) { + if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && @@ -184,7 +194,7 @@ bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingCon // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" - string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); + string tag = Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { @@ -200,7 +210,7 @@ bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingCon bool isOpen = !isClosed; if (isClosed && isUnary) { - TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl); + TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl); return false; } -- cgit v1.2.3