// $Id$ #include "TreeInput.h" #include "StaticData.h" #include "Util.h" #include "XmlOption.h" #include "FactorCollection.h" #include "moses/TranslationModel/PhraseDictionary.h" using namespace std; namespace Moses { /** * Process a sentence with xml annotation * Xml tags may specifiy additional/replacing translation options * and reordering constraints * * \param line in: sentence, out: sentence without the xml * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml */ bool TreeInput:: ProcessAndStripXMLTags(AllOptions const& opts, string &line, std::vector &sourceLabels, std::vector &xmlOptions) { //parse XML markup in translation line vector const& oFactors = opts.output.factor_order; // no xml tag? we're done. if (line.find_first_of('<') == string::npos) { return true; } // hack. What pt should XML trans opt be assigned to? PhraseDictionary *firstPt = NULL; if (PhraseDictionary::GetColl().size() == 0) { firstPt = PhraseDictionary::GetColl()[0]; } // break up input into a vector of xml tags and text // example: (this), (), (is a), (), (test .) vector xmlTokens = TokenizeXml(line); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos])) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += xmlTokens[xmlTokenPos]; // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { TRACE_ERR("ERROR: empty tag name: " << line << endl); return false; } // check if unary (e.g., "") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. "", not "")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl); return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl); } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl); return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl ); return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! span.empty()) { vector ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl); return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl); if (startPos == endPos) { TRACE_ERR("WARNING: tag " << tagName << " span is empty. Ignoring: " << line << endl); continue; } else if (startPos > endPos) { TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl); return false; } // may be either a input span label ("label"), or a specified output translation "translation" string label = ParseXmlTagAttribute(tagContent,"label"); string translation = ParseXmlTagAttribute(tagContent,"translation"); // specified label if (translation.length() == 0 && label.length() > 0) { Range range(startPos,endPos-1); // really? XMLParseOutput item(label, range); sourceLabels.push_back(item); } // specified translations -> vector of phrases, separated by "||" if (translation.length() > 0 && opts.input.xml_policy != XmlIgnore) { vector altTexts = TokenizeMultiCharSeparator(translation, "||"); vector altLabel = TokenizeMultiCharSeparator(label, "||"); vector altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); //TRACE_ERR("number of translations: " << altTexts.size() << endl); for (size_t i=0; i i && altLabel[i].size() > 0) { targetLHSstr = altLabel[i]; } else { const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS = lhsList.begin(); targetLHSstr = iterLHS->first; } Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, oFactors, targetLHSstr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor left-hand-side"); targetPhrase.SetTargetLHS(targetLHS); // not tested Phrase sourcePhrase = this->GetSubString(Range(startPos,endPos-1)); // get probability float probValue = 1; if (altProbs.size() > i && altProbs[i].size() > 0) { probValue = Scan(altProbs[i]); } // convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); targetPhrase.SetXMLScore(scoreValue); targetPhrase.EvaluateInIsolation(sourcePhrase); // set span and create XmlOption Range range(startPos+1,endPos); XmlOption *option = new XmlOption(range,targetPhrase); assert(option); xmlOptions.push_back(option); VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl); } altTexts.clear(); altProbs.clear(); } } } } // we are done. check if there are tags that are still open if (tagStack.size() > 0) { TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl); return false; } // return de-xml'ed sentence in line line = cleanLine; return true; } //! populate this InputType with data from in stream int TreeInput:: Read(std::istream& in) { string line; if (getline(in, line, '\n').eof()) return 0; m_labelledSpans.clear(); ProcessAndStripXMLTags(*m_options, line, m_labelledSpans, m_xmlOptions); // do words 1st - hack stringstream strme; strme << line << endl; Sentence::Read(strme); // size input chart size_t sourceSize = GetSize(); m_sourceChart.resize(sourceSize); for (size_t pos = 0; pos < sourceSize; ++pos) { m_sourceChart[pos].resize(sourceSize - pos); } // do source labels vector::const_iterator iterLabel; for (iterLabel = m_labelledSpans.begin(); iterLabel != m_labelledSpans.end(); ++iterLabel) { const XMLParseOutput &labelItem = *iterLabel; const Range &range = labelItem.m_range; const string &label = labelItem.m_label; AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label); } // default label bool only4empty = m_options->syntax.default_non_term_only_for_empty_range; for (size_t startPos = 0; startPos < sourceSize; ++startPos) { for (size_t endPos = startPos; endPos < sourceSize; ++endPos) { NonTerminalSet &list = GetLabelSet(startPos, endPos); if (list.size() == 0 || ! only4empty ) { AddChartLabel(startPos, endPos, m_options->syntax.input_default_non_terminal); } } } return 1; } //! Output debugging info to stream out void TreeInput::Print(std::ostream &out) const { out << *this << "\n"; } //! create trans options specific to this InputType TranslationOptionCollection* TreeInput::CreateTranslationOptionCollection() const { return NULL; } void TreeInput:: AddChartLabel(size_t startPos, size_t endPos, const Word &label) { UTIL_THROW_IF2(!label.IsNonTerminal(), "Label must be a non-terminal"); SourceLabelOverlap overlapType = m_options->syntax.source_label_overlap; NonTerminalSet &list = GetLabelSet(startPos, endPos); switch (overlapType) { case SourceLabelOverlapAdd: list.insert(label); break; case SourceLabelOverlapReplace: if (list.size() > 0) // replace existing label list.clear(); list.insert(label); break; case SourceLabelOverlapDiscard: if (list.size() == 0) list.insert(label); break; } } void TreeInput:: AddChartLabel(size_t startPos, size_t endPos, const string &label) { const std::vector& fOrder = m_options->input.factor_order; Word word(true); const Factor *factor = FactorCollection::Instance().AddFactor(Input, fOrder[0], label, true); // TODO - no factors word.SetFactor(0, factor); AddChartLabel(startPos, endPos, word); } std::ostream& operator<<(std::ostream &out, const TreeInput &input) { out<< static_cast(input) << " ||| "; size_t size = input.GetSize(); for (size_t startPos = 0; startPos < size; ++startPos) { for (size_t endPos = startPos; endPos < size; ++endPos) { const NonTerminalSet &labelSet = input.GetLabelSet(startPos, endPos); NonTerminalSet::const_iterator iter; for (iter = labelSet.begin(); iter != labelSet.end(); ++iter) { const Word &word = *iter; UTIL_THROW_IF2(!word.IsNonTerminal(), "Word must be a non-terminal"); out << "[" << startPos <<"," << endPos << "]=" << word << "(" << word.IsNonTerminal() << ") "; } } } return out; } } // namespace