// $Id$ // vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include "XmlOption.h" #include #include #include #include "Util.h" #include "StaticData.h" #include "TranslationOption.h" namespace { std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){ /*TODO deal with unescaping \"*/ string tagOpen = attributeName + "=\""; size_t contentsStart = tag.find(tagOpen); if (contentsStart == std::string::npos) return ""; contentsStart += tagOpen.size(); size_t contentsEnd = tag.find_first_of('"',contentsStart+1); if (contentsEnd == std::string::npos) { TRACE_ERR("Malformed XML attribute: "<< tag); return ""; } size_t possibleEnd; while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) { contentsEnd = possibleEnd; } return tag.substr(contentsStart,contentsEnd-contentsStart); } std::string TrimXml(const std::string& str) { if (str.size() < 2) return str; if (str[0] == '<' && str[str.size() - 1] == '>') { return str.substr(1, str.size() - 2); } else { return str; } } bool isXmlTag(const std::string& tag) { return tag[0] == '<'; } inline std::vector TokenizeXml(const std::string& str) { std::string lbrack = "<"; std::string rbrack = ">"; std::vector tokens; // Find first "non-delimiter". std::string::size_type cpos = 0; std::string::size_type lpos = 0; std::string::size_type rpos = 0; while (cpos != str.size()) { lpos = str.find_first_of(lbrack, cpos); if (lpos != std::string::npos) { rpos = str.find_first_of(rbrack, lpos); if (rpos == std::string::npos) { TRACE_ERR("ERROR: malformed XML: " << str << endl); return tokens; } } else { tokens.push_back(str.substr(cpos)); break; } if (lpos - cpos > 0) tokens.push_back(str.substr(cpos, lpos - cpos)); tokens.push_back(str.substr(lpos, rpos-lpos+1)); cpos = rpos + 1; } return tokens; } } std::vector ProcessAndStripXMLTags(std::string& line, const InputType &source) { //parse XML markup in translation line std::vector res; std::string rstr; std::string linkedStr; if (line.find_first_of('<') == std::string::npos) { return res; } std::vector xmlTokens = TokenizeXml(line); std::string tagName = ""; std::string tagContents = ""; std::vector altTexts; std::vector altProbs; size_t tagStart=0; size_t tagEnd=0; size_t curWord=0; int numUnary = 0; bool doClose = false; bool isLinked = false; for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { if(!isXmlTag(xmlTokens[xmlTokenPos])) { //phrase, not tag rstr += xmlTokens[xmlTokenPos]; curWord = Tokenize(rstr).size(); } else { //tag data std::string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); VERBOSE(3,"XML TAG IS: " << tag << std::endl); std::string::size_type endOfName = tag.find_first_of(' '); std::string nextTagName = tag; bool isUnary = tag[tag.size() - 1] == '/'; bool isOpen = tag[0] != '/'; if (endOfName != std::string::npos) { nextTagName = tag.substr(0,endOfName); tagContents = tag.substr(endOfName+1); } if (nextTagName == "linked") { isLinked = true; linkedStr = ""; } else if (nextTagName == "/linked") { isLinked = false; // recurse to process linked tags std::vector tOptions = ProcessAndStripXMLTags(linkedStr, source); // link them together std::vector::const_iterator iterTransOpts1; std::vector::const_iterator iterTransOpts2; for (iterTransOpts1 = tOptions.begin(); iterTransOpts1 != tOptions.end(); iterTransOpts1++) { for (iterTransOpts2 = tOptions.begin(); iterTransOpts2 != tOptions.end(); iterTransOpts2++) { if (iterTransOpts1 != iterTransOpts2) { (**iterTransOpts1).AddLinkedTransOpt(*iterTransOpts2); } } res.push_back(*iterTransOpts1); } } else if (isLinked) { linkedStr += xmlTokens[xmlTokenPos]; } else if (isOpen) { //this is an open tag tagName = nextTagName; altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||"); altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||"); std::string span = ParseXmlTagAttribute(tagContents,"span"); tagStart = curWord; if (isUnary) { numUnary++; if (span.empty()) { TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl); return res; } std::vector ij = Tokenize(span, ","); if (ij.size() != 2) { TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl); return res; } tagStart = atoi(ij[0].c_str()); tagEnd = atoi(ij[1].c_str()); if (tagEnd < tagStart) { TRACE_ERR("ERROR: span tag " << span << " invalid" << endl); return res; } doClose = true; VERBOSE(3,"XML TAG IS UNARY" << endl); } VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl); VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl); VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl); VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl); if (altTexts.size() != altProbs.size()) { TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl); return res; } } else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName)) { //mismatched tag, abort! TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl); return res; } else { doClose = true; tagEnd = curWord-1; //size is inclusive } if (doClose) { VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl); VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl); //store translation options into members //TODO: deal with multiple XML options here if (StaticData::Instance().GetXmlInputType() != XmlIgnore) { const std::vector &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); for (size_t i=0; i(altProbs[i]); //Convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); TargetPhrase targetPhrase(Output); targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],StaticData::Instance().GetFactorDelimiter()); targetPhrase.SetScore(scoreValue); WordsRange range(tagStart,tagEnd); TranslationOption *option = new TranslationOption(range,targetPhrase,source); assert(option); res.push_back(option); } } tagName= ""; tagContents = ""; altTexts.clear(); altProbs.clear(); doClose = false; } } } line = rstr; return res; }