Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieuhoang@gmail.com>2012-11-12 23:56:18 +0400
committerHieu Hoang <hieuhoang@gmail.com>2012-11-12 23:56:18 +0400
commit5e3ef23cef6101d2c098eb3445f562e8f595655b (patch)
treeb8c332b6fa82bae84ea4910967a10ba1b08a7107 /moses/TreeInput.cpp
parent8c785cff2b1be3cccd76ea9026f71b649762dfc3 (diff)
move moses/src/* to moses/
Diffstat (limited to 'moses/TreeInput.cpp')
-rw-r--r--moses/TreeInput.cpp376
1 files changed, 376 insertions, 0 deletions
diff --git a/moses/TreeInput.cpp b/moses/TreeInput.cpp
new file mode 100644
index 000000000..7c3557d17
--- /dev/null
+++ b/moses/TreeInput.cpp
@@ -0,0 +1,376 @@
+// $Id$
+
+#include "TreeInput.h"
+#include "StaticData.h"
+#include "Util.h"
+#include "XmlOption.h"
+
+using namespace std;
+
+namespace Moses
+{
+
+/**
+ * Process a sentence with xml annotation
+ * Xml tags may specifiy additional/replacing translation options
+ * and reordering constraints
+ *
+ * \param line in: sentence, out: sentence without the xml
+ * \param res vector with translation options specified by xml
+ * \param reorderingConstraint reordering constraint zones specified by xml
+ * \param walls reordering constraint walls specified by xml
+ */
+bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
+{
+ //parse XML markup in translation line
+
+ // no xml tag? we're done.
+ if (line.find_first_of('<') == string::npos) {
+ return true;
+ }
+
+ // break up input into a vector of xml tags and text
+ // example: (this), (<b>), (is a), (</b>), (test .)
+ vector<string> xmlTokens = TokenizeXml(line);
+
+ // we need to store opened tags, until they are closed
+ // tags are stored as tripled (tagname, startpos, contents)
+ typedef pair< string, pair< size_t, string > > OpenedTag;
+ vector< OpenedTag > tagStack; // stack that contains active opened tags
+
+ string cleanLine; // return string (text without xml)
+ size_t wordPos = 0; // position in sentence (in terms of number of words)
+
+ // keep this handy for later
+ const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+ const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+
+ // loop through the tokens
+ for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
+ // not a xml tag, but regular text (may contain many words)
+ if(!isXmlTag(xmlTokens[xmlTokenPos])) {
+ // add a space at boundary, if necessary
+ if (cleanLine.size()>0 &&
+ cleanLine[cleanLine.size() - 1] != ' ' &&
+ xmlTokens[xmlTokenPos][0] != ' ') {
+ cleanLine += " ";
+ }
+ cleanLine += xmlTokens[xmlTokenPos]; // add to output
+ wordPos = Tokenize(cleanLine).size(); // count all the words
+ }
+
+ // process xml tag
+ else {
+ // *** get essential information about tag ***
+
+ // strip extra boundary spaces and "<" and ">"
+ string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
+ VERBOSE(3,"XML TAG IS: " << tag << std::endl);
+
+ if (tag.size() == 0) {
+ TRACE_ERR("ERROR: empty tag name: " << line << endl);
+ return false;
+ }
+
+ // check if unary (e.g., "<wall/>")
+ bool isUnary = ( tag[tag.size() - 1] == '/' );
+
+ // check if opening tag (e.g. "<a>", not "</a>")g
+ bool isClosed = ( tag[0] == '/' );
+ bool isOpen = !isClosed;
+
+ if (isClosed && isUnary) {
+ TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
+ return false;
+ }
+
+ if (isClosed)
+ tag = tag.substr(1); // remove "/" at the beginning
+ if (isUnary)
+ tag = tag.substr(0,tag.size()-1); // remove "/" at the end
+
+ // find the tag name and contents
+ string::size_type endOfName = tag.find_first_of(' ');
+ string tagName = tag;
+ string tagContent = "";
+ if (endOfName != string::npos) {
+ tagName = tag.substr(0,endOfName);
+ tagContent = tag.substr(endOfName+1);
+ }
+
+ // *** process new tag ***
+
+ if (isOpen || isUnary) {
+ // put the tag on the tag stack
+ OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
+ tagStack.push_back( openedTag );
+ VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
+ }
+
+ // *** process completed tag ***
+
+ if (isClosed || isUnary) {
+ // pop last opened tag from stack;
+ if (tagStack.size() == 0) {
+ TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
+ return false;
+ }
+ OpenedTag openedTag = tagStack.back();
+ tagStack.pop_back();
+
+ // tag names have to match
+ if (openedTag.first != tagName) {
+ TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
+ return false;
+ }
+
+ // assemble remaining information about tag
+ size_t startPos = openedTag.second.first;
+ string tagContent = openedTag.second.second;
+ size_t endPos = wordPos;
+
+ // span attribute overwrites position
+ string span = ParseXmlTagAttribute(tagContent,"span");
+ if (! span.empty()) {
+ vector<string> ij = Tokenize(span, "-");
+ if (ij.size() != 1 && ij.size() != 2) {
+ TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
+ return false;
+ }
+ startPos = atoi(ij[0].c_str());
+ if (ij.size() == 1) endPos = startPos + 1;
+ else endPos = atoi(ij[1].c_str()) + 1;
+ }
+
+ VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
+
+ if (startPos >= endPos) {
+ TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
+ return false;
+ }
+
+ // may be either a input span label ("label"), or a specified output translation "translation"
+ string label = ParseXmlTagAttribute(tagContent,"label");
+ string translation = ParseXmlTagAttribute(tagContent,"translation");
+
+ // specified label
+ if (translation.length() == 0 && label.length() > 0) {
+ WordsRange range(startPos,endPos-1); // really?
+ XMLParseOutput item(label, range);
+ sourceLabels.push_back(item);
+ }
+
+ // specified translations -> vector of phrases, separated by "||"
+ if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
+ vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
+ vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
+ vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
+ //TRACE_ERR("number of translations: " << altTexts.size() << endl);
+ for (size_t i=0; i<altTexts.size(); ++i) {
+ // set target phrase
+ TargetPhrase targetPhrase;
+ targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);
+
+ // set constituent label
+ string targetLHSstr;
+ if (altLabel.size() > i && altLabel[i].size() > 0) {
+ targetLHSstr = altLabel[i];
+ }
+ else {
+ const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
+ UnknownLHSList::const_iterator iterLHS = lhsList.begin();
+ targetLHSstr = iterLHS->first;
+ }
+ Word targetLHS(true);
+ targetLHS.CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
+ CHECK(targetLHS.GetFactor(0) != NULL);
+ targetPhrase.SetTargetLHS(targetLHS);
+
+ // get probability
+ float probValue = 1;
+ if (altProbs.size() > i && altProbs[i].size() > 0) {
+ probValue = Scan<float>(altProbs[i]);
+ }
+ // convert from prob to log-prob
+ float scoreValue = FloorScore(TransformScore(probValue));
+ targetPhrase.SetScore(scoreValue);
+
+ // set span and create XmlOption
+ WordsRange range(startPos+1,endPos);
+ XmlOption *option = new XmlOption(range,targetPhrase);
+ CHECK(option);
+ xmlOptions.push_back(option);
+
+ VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
+ }
+ altTexts.clear();
+ altProbs.clear();
+ }
+ }
+ }
+ }
+ // we are done. check if there are tags that are still open
+ if (tagStack.size() > 0) {
+ TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
+ return false;
+ }
+
+ // return de-xml'ed sentence in line
+ line = cleanLine;
+ return true;
+}
+
+//! populate this InputType with data from in stream
+int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ string line;
+ if (getline(in, line, '\n').eof())
+ return 0;
+ // remove extra spaces
+ //line = Trim(line);
+
+ std::vector<XMLParseOutput> sourceLabels;
+ std::vector<XmlOption*> xmlOptionsList;
+ ProcessAndStripXMLTags(line, sourceLabels, xmlOptionsList);
+
+ // do words 1st - hack
+ stringstream strme;
+ strme << line << endl;
+
+ Sentence::Read(strme, factorOrder);
+
+ // size input chart
+ size_t sourceSize = GetSize();
+ m_sourceChart.resize(sourceSize);
+
+ for (size_t pos = 0; pos < sourceSize; ++pos) {
+ m_sourceChart[pos].resize(sourceSize - pos);
+ }
+
+ // do source labels
+ vector<XMLParseOutput>::const_iterator iterLabel;
+ for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) {
+ const XMLParseOutput &labelItem = *iterLabel;
+ const WordsRange &range = labelItem.m_range;
+ const string &label = labelItem.m_label;
+ AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder);
+ }
+
+ // default label
+ for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
+ for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
+ AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
+ }
+ }
+
+ // XML Options
+
+ //only fill the vector if we are parsing XML
+ if (staticData.GetXmlInputType() != XmlPassThrough ) {
+ //TODO: needed to handle exclusive
+ //for (size_t i=0; i<GetSize(); i++) {
+ // m_xmlCoverageMap.push_back(false);
+ //}
+
+ //iterXMLOpts will be empty for XmlIgnore
+ //look at each column
+ for(std::vector<XmlOption*>::const_iterator iterXmlOpts = xmlOptionsList.begin();
+ iterXmlOpts != xmlOptionsList.end(); iterXmlOpts++) {
+
+ const XmlOption *xmlOption = *iterXmlOpts;
+ TargetPhrase *targetPhrase = new TargetPhrase(xmlOption->targetPhrase);
+ *targetPhrase = xmlOption->targetPhrase; // copy everything
+ WordsRange *range = new WordsRange(xmlOption->range);
+ const StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted
+
+ TargetPhraseCollection *tpc = new TargetPhraseCollection;
+ tpc->Add(targetPhrase);
+
+ ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
+ m_xmlChartOptionsList.push_back(transOpt);
+
+ //TODO: needed to handle exclusive
+ //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
+ // m_xmlCoverageMap[j]=true;
+ //}
+
+ delete xmlOption;
+ }
+
+ }
+
+ return 1;
+}
+
+//! Output debugging info to stream out
+void TreeInput::Print(std::ostream &out) const
+{
+ out << *this << "\n";
+}
+
+//! create trans options specific to this InputType
+TranslationOptionCollection* TreeInput::CreateTranslationOptionCollection() const
+{
+
+ return NULL;
+}
+
+void TreeInput::AddChartLabel(size_t startPos, size_t endPos, const Word &label
+ , const std::vector<FactorType>& /* factorOrder */)
+{
+ CHECK(label.IsNonTerminal());
+
+ SourceLabelOverlap overlapType = StaticData::Instance().GetSourceLabelOverlap();
+ NonTerminalSet &list = GetLabelSet(startPos, endPos);
+ switch (overlapType) {
+ case SourceLabelOverlapAdd:
+ list.insert(label);
+ break;
+ case SourceLabelOverlapReplace:
+ if (list.size() > 0) // replace existing label
+ list.clear();
+ list.insert(label);
+ break;
+ case SourceLabelOverlapDiscard:
+ if (list.size() == 0)
+ list.insert(label);
+ break;
+ }
+}
+
+void TreeInput::AddChartLabel(size_t startPos, size_t endPos, const string &label
+ , const std::vector<FactorType>& factorOrder)
+{
+ Word word(true);
+ const Factor *factor = FactorCollection::Instance().AddFactor(Input, factorOrder[0], label); // TODO - no factors
+ word.SetFactor(0, factor);
+
+ AddChartLabel(startPos, endPos, word, factorOrder);
+}
+
+std::ostream& operator<<(std::ostream &out, const TreeInput &input)
+{
+ out<< static_cast<Phrase const&>(input) << " ||| ";
+
+ size_t size = input.GetSize();
+ for (size_t startPos = 0; startPos < size; ++startPos) {
+ for (size_t endPos = startPos; endPos < size; ++endPos) {
+ const NonTerminalSet &labelSet = input.GetLabelSet(startPos, endPos);
+ NonTerminalSet::const_iterator iter;
+ for (iter = labelSet.begin(); iter != labelSet.end(); ++iter) {
+ const Word &word = *iter;
+ out << "[" << startPos <<"," << endPos << "]="
+ << word << "(" << word.IsNonTerminal() << ") ";
+ CHECK(word.IsNonTerminal());
+ }
+ }
+ }
+
+ return out;
+}
+
+
+} // namespace
+