diff options
Diffstat (limited to 'scripts/training/filter-pt.2/XmlTree.cpp')
-rw-r--r-- | scripts/training/filter-pt.2/XmlTree.cpp | 344 |
1 files changed, 344 insertions, 0 deletions
diff --git a/scripts/training/filter-pt.2/XmlTree.cpp b/scripts/training/filter-pt.2/XmlTree.cpp new file mode 100644 index 000000000..4f300d57c --- /dev/null +++ b/scripts/training/filter-pt.2/XmlTree.cpp @@ -0,0 +1,344 @@ +// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include <vector> +#include <string> +#include <set> +#include <iostream> +#include <stdlib.h> +#include "XmlTree.h" + +using namespace std; + + +inline std::vector<std::string> Tokenize(const std::string& str, + const std::string& delimiters = " \t") +{ + std::vector<std::string> tokens; + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) + { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } + + return tokens; +} + +const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r") +{ + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +string ParseXmlTagAttribute(const string& tag,const string& attributeName){ + /*TODO deal with unescaping \"*/ + string tagOpen = attributeName + "=\""; + size_t contentsStart = tag.find(tagOpen); + if (contentsStart == string::npos) return ""; + contentsStart += tagOpen.size(); + size_t contentsEnd = tag.find_first_of('"',contentsStart+1); + if (contentsEnd == string::npos) { + cerr << "Malformed XML attribute: "<< tag; + return ""; + } + size_t possibleEnd; + while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) { + contentsEnd = possibleEnd; + } + return tag.substr(contentsStart,contentsEnd-contentsStart); +} + +/** + * Remove "<" and ">" from XML tag + * + * \param str xml token to be stripped + */ +string TrimXml(const string& str) +{ + // too short to be xml token -> do nothing + if (str.size() < 2) return str; + + // strip first and last character + if (str[0] == '<' && str[str.size() - 1] == '>') + { + return str.substr(1, str.size() - 2); + } + // not an xml token -> do nothing + else { return str; } +} + +/** + * Check if the token is an XML tag, i.e. starts with "<" + * + * \param tag token to be checked + */ +bool isXmlTag(const string& tag) +{ + return tag[0] == '<'; +} + +/** + * Split up the input character string into tokens made up of + * either XML tags or text. + * example: this <b> is a </b> test . + * => (this ), (<b>), ( is a ), (</b>), ( test .) + * + * \param str input string + */ +inline vector<string> TokenizeXml(const string& str) +{ + string lbrack = "<"; + string rbrack = ">"; + vector<string> tokens; // vector of tokens to be returned + string::size_type cpos = 0; // current position in string + string::size_type lpos = 0; // left start of xml tag + string::size_type rpos = 0; // right end of xml tag + + // walk thorugh the string (loop vver cpos) + while (cpos != str.size()) + { + // find the next opening "<" of an xml tag + lpos = str.find_first_of(lbrack, cpos); + if (lpos != string::npos) + { + // find the end of the xml tag + rpos = str.find_first_of(rbrack, lpos); + // sanity check: there has to be closing ">" + if (rpos == string::npos) + { + cerr << "ERROR: malformed XML: " << str << endl; + return tokens; + } + } + else // no more tags found + { + // add the rest as token + tokens.push_back(str.substr(cpos)); + break; + } + + // add stuff before xml tag as token, if there is any + if (lpos - cpos > 0) + tokens.push_back(str.substr(cpos, lpos - cpos)); + + // add xml tag as token + tokens.push_back(str.substr(lpos, rpos-lpos+1)); + cpos = rpos + 1; + } + return tokens; +} + +/** + * Process a sentence with xml annotation + * Xml tags may specifiy additional/replacing translation options + * and reordering constraints + * + * \param line in: sentence, out: sentence without the xml + * \param res vector with translation options specified by xml + * \param reorderingConstraint reordering constraint zones specified by xml + * \param walls reordering constraint walls specified by xml + */ +/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector + is so we can link things up afterwards. We can't create TranslationOptions as we + parse because we don't have the completed source parsed until after this function + removes all the markup from it (CreateFromString in Sentence::Read). +*/ +bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) { + //parse XML markup in translation line + + // no xml tag? we're done. + if (line.find_first_of('<') == string::npos) { return true; } + + // break up input into a vector of xml tags and text + // example: (this), (<b>), (is a), (</b>), (test .) + vector<string> xmlTokens = TokenizeXml(line); + + // we need to store opened tags, until they are closed + // tags are stored as tripled (tagname, startpos, contents) + typedef pair< string, pair< size_t, string > > OpenedTag; + vector< OpenedTag > tagStack; // stack that contains active opened tags + + string cleanLine; // return string (text without xml) + size_t wordPos = 0; // position in sentence (in terms of number of words) + bool isLinked = false; + + // loop through the tokens + for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) + { + // not a xml tag, but regular text (may contain many words) + if(!isXmlTag(xmlTokens[xmlTokenPos])) + { + // add a space at boundary, if necessary + if (cleanLine.size()>0 && + cleanLine[cleanLine.size() - 1] != ' ' && + xmlTokens[xmlTokenPos][0] != ' ') + { + cleanLine += " "; + } + cleanLine += xmlTokens[xmlTokenPos]; // add to output + wordPos = Tokenize(cleanLine).size(); // count all the words + } + + // process xml tag + else + { + // *** get essential information about tag *** + + // strip extra boundary spaces and "<" and ">" + string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); + // cerr << "XML TAG IS: " << tag << std::endl; + + if (tag.size() == 0) + { + cerr << "ERROR: empty tag name: " << line << endl; + return false; + } + + // check if unary (e.g., "<wall/>") + bool isUnary = ( tag[tag.size() - 1] == '/' ); + + // check if opening tag (e.g. "<a>", not "</a>")g + bool isClosed = ( tag[0] == '/' ); + bool isOpen = !isClosed; + + if (isClosed && isUnary) + { + cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl; + return false; + } + + if (isClosed) + tag = tag.substr(1); // remove "/" at the beginning + if (isUnary) + tag = tag.substr(0,tag.size()-1); // remove "/" at the end + + // find the tag name and contents + string::size_type endOfName = tag.find_first_of(' '); + string tagName = tag; + string tagContent = ""; + if (endOfName != string::npos) { + tagName = tag.substr(0,endOfName); + tagContent = tag.substr(endOfName+1); + } + + // *** process new tag *** + + if (isOpen || isUnary) + { + // put the tag on the tag stack + OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); + tagStack.push_back( openedTag ); + // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl; + } + + // *** process completed tag *** + + if (isClosed || isUnary) + { + // pop last opened tag from stack; + if (tagStack.size() == 0) + { + cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl; + return false; + } + OpenedTag openedTag = tagStack.back(); + tagStack.pop_back(); + + // tag names have to match + if (openedTag.first != tagName) + { + cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl; + return false; + } + + // assemble remaining information about tag + size_t startPos = openedTag.second.first; + string tagContent = openedTag.second.second; + size_t endPos = wordPos; + + // span attribute overwrites position + string span = ParseXmlTagAttribute(tagContent,"span"); + if (! span.empty()) + { + vector<string> ij = Tokenize(span, "-"); + if (ij.size() != 1 && ij.size() != 2) { + cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl; + return false; + } + startPos = atoi(ij[0].c_str()); + if (ij.size() == 1) endPos = startPos + 1; + else endPos = atoi(ij[1].c_str()) + 1; + } + + // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl; + + if (startPos >= endPos) + { + cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl; + return false; + } + + string label = ParseXmlTagAttribute(tagContent,"label"); + labelCollection.insert( label ); + + // report what we have processed so far + if (0) { + cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; + cerr << "XML TAG LABEL IS: '" << label << "'" << endl; + cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; + } + tree.AddNode( startPos, endPos-1, label ); + } + } + } + // we are done. check if there are tags that are still open + if (tagStack.size() > 0) + { + cerr << "ERROR: some opened tags were never closed: " << line << endl; + return false; + } + + // collect top labels + const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); + for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) + { + SyntaxNode *n = *node; + const string &label = n->GetLabel(); + if (topLabelCollection.find( label ) == topLabelCollection.end()) + topLabelCollection[ label ] = 0; + topLabelCollection[ label ]++; + } + + // return de-xml'ed sentence in line + line = cleanLine; + return true; +} |