Roll forward James Smith's changes.

The seg faults seen might have been caused by an unknown compiler problem, or by an FC5/FC6 library incompatibility on the DICE machines. What a joke! git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1554 1f5c12ca-751b-0410-a591-d2e778427230
author: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2008-02-09 14:37:41 +0300
committer: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2008-02-09 14:37:41 +0300
commit: 2f091ce8f7eed08095b59b5746b8a42c21200458 (patch)
tree: 6c0da57b2f163c0e2088c3b7ba8dc01e5883b64c /moses
parent: fd60fe93b9e1bb2f210fbea861d78d7a84b37624 (diff)
6 files changed, 85 insertions, 29 deletions
diff --git a/moses/src/Manager.cpp b/moses/src/Manager.cpp
index 8b9ef579e..f077f5173 100755
--- a/moses/src/Manager.cpp
+++ b/moses/src/Manager.cpp
@@ -147,10 +147,10 @@ void Manager::ProcessOneHypothesis(const Hypothesis &hypothesis)
 
 		for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos)
 		{
-      size_t maxSize = sourceSize - startPos;
-      size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
-      maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;
-
+			size_t maxSize = sourceSize - startPos;
+			size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+			maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;
+			
 			for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
 			{
 				if (!hypoBitmap.Overlap(WordsRange(startPos, endPos)))
@@ -296,6 +296,20 @@ void Manager::ExpandHypothesis(const Hypothesis &hypothesis, const TranslationOp
 	if (debug2) { std::cerr << "::EXT: " << transOpt << "\n"; }
 #endif
 	Hypothesis *newHypo = hypothesis.CreateNext(transOpt);
+	// expand hypothesis further if transOpt was linked
+	for (std::vector<TranslationOption*>::const_iterator iterLinked = transOpt.GetLinkedTransOpts().begin();
+	       iterLinked != transOpt.GetLinkedTransOpts().end(); iterLinked++) {
+		const WordsBitmap hypoBitmap = newHypo->GetWordsBitmap();
+		if (hypoBitmap.Overlap((**iterLinked).GetSourceWordsRange())) {
+			// don't want to add a hypothesis that has some but not all of a linked TO set, so return
+			return;
+		}
+		else
+		{
+			newHypo->CalcScore(m_transOptColl->GetFutureScore());
+			newHypo = newHypo->CreateNext(**iterLinked);
+		}
+	}
 	newHypo->CalcScore(m_transOptColl->GetFutureScore());
 	
 	// logging for the curious
diff --git a/moses/src/Sentence.cpp b/moses/src/Sentence.cpp
index a8a720ebc..5a2f7b2be 100755
--- a/moses/src/Sentence.cpp
+++ b/moses/src/Sentence.cpp
@@ -42,7 +42,7 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
 	//parse XML markup in translation line
 	const StaticData &staticData = StaticData::Instance();
 	if (staticData.GetXmlInputType() != XmlPassThrough)
-		m_xmlOptionsList = ProcessAndStripXMLTags(line);
+		m_xmlOptionsList = ProcessAndStripXMLTags(line, *this);
 	Phrase::CreateFromString(factorOrder, line, factorDelimiter);
 	
 	//only fill the vector if we are parsing XML
@@ -50,9 +50,10 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
 		for (size_t i=0; i<GetSize();i++) {
 			m_xmlCoverageMap.push_back(false);
 		}
-		for (size_t i=0; i< m_xmlOptionsList.size();i++) {
+		for (std::vector<TranslationOption*>::const_iterator iterXMLOpts = m_xmlOptionsList.begin();
+		        iterXMLOpts != m_xmlOptionsList.end(); iterXMLOpts++) {
 			//m_xmlOptionsList will be empty for XmlIgnore
-			for(size_t j=m_xmlOptionsList[i].startPos;j<=m_xmlOptionsList[i].endPos;j++) {
+			for(size_t j=(**iterXMLOpts).GetSourceWordsRange().GetStartPos();j<=(**iterXMLOpts).GetSourceWordsRange().GetEndPos();j++) {
 				m_xmlCoverageMap[j]=true;
 				
 			}
@@ -88,24 +89,12 @@ bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const {
 
 void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const {
 	//iterate over XmlOptions list, find exact source/target matches
-	//we don't worry about creating the objects ahead of time because this should only be called once for each unique start/end when a given sentence is processed
 	const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
 	
-	for(size_t i=0;i<m_xmlOptionsList.size();i++) {
-		if (startPos == m_xmlOptionsList[i].startPos && endPos == m_xmlOptionsList[i].endPos) {
-			//create TranslationOptions
-			
-			for (size_t j=0;j<m_xmlOptionsList[i].targetPhrases.size();j++) {
-				TargetPhrase targetPhrase(Output);
-				targetPhrase.CreateFromString(outputFactorOrder,m_xmlOptionsList[i].targetPhrases[j],StaticData::Instance().GetFactorDelimiter());
-				targetPhrase.SetScore(m_xmlOptionsList[i].targetScores[j]);
-				WordsRange range(m_xmlOptionsList[i].startPos,m_xmlOptionsList[i].endPos);
-				
-				TranslationOption *option = new TranslationOption(range,targetPhrase,*this);
-				assert(option);
-				list.push_back(option);
-
-			}
+	for (std::vector<TranslationOption*>::const_iterator iterXMLOpts = m_xmlOptionsList.begin();
+	        iterXMLOpts != m_xmlOptionsList.end(); iterXMLOpts++) {
+		if (startPos == (**iterXMLOpts).GetSourceWordsRange().GetStartPos() && endPos == (**iterXMLOpts).GetSourceWordsRange().GetEndPos()) {
+ 			list.push_back(*iterXMLOpts);
 		}
 	}
 }
diff --git a/moses/src/Sentence.h b/moses/src/Sentence.h
index 8a05787ee..0f0b4aa7a 100755
--- a/moses/src/Sentence.h
+++ b/moses/src/Sentence.h
@@ -48,7 +48,7 @@ class Sentence : public Phrase, public InputType
 	 * and returns the value of that tag if present, empty string otherwise
 	 */
 	static std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
-	std::vector <XmlOption> m_xmlOptionsList;
+	std::vector <TranslationOption*> m_xmlOptionsList;
 	std::vector <bool> m_xmlCoverageMap;
 
  public:
@@ -90,3 +90,4 @@ class Sentence : public Phrase, public InputType
 	TranslationOptionCollection* CreateTranslationOptionCollection() const;
 };
 
+
diff --git a/moses/src/TranslationOption.h b/moses/src/TranslationOption.h
index 441cd403c..440d4461d 100755
--- a/moses/src/TranslationOption.h
+++ b/moses/src/TranslationOption.h
@@ -60,6 +60,7 @@ protected:
 	Phrase				      *m_sourcePhrase; /*< input phrase translated by this */
 	const WordsRange		m_sourceWordsRange; /*< word position in the input that are covered by this translation option */
 	float               m_futureScore; /*< estimate of total cost when using this translation option, includes language model probabilities */
+	std::vector<TranslationOption*> m_linkedTransOpts; /* list of linked TOs which must be included with this in any hypothesis */
 	
 	//! in TranslationOption, m_scoreBreakdown is not complete.  It cannot,
 	//! for example, know the full n-gram score since the length of the
@@ -112,6 +113,18 @@ public:
 	{
 	  return m_sourcePhrase;
 	}
+	
+	/** returns the translation options linked to this one (those added via AddLinkedTransOpt); handed back by const reference, not copied */
+	inline const std::vector<TranslationOption*> &GetLinkedTransOpts() const
+	{
+		return m_linkedTransOpts;
+	}
+	
+	/** add link to another translation option: appends the pointer to m_linkedTransOpts; no null or duplicate check is made here */
+	inline void AddLinkedTransOpt(TranslationOption* to)
+	{
+		m_linkedTransOpts.push_back(to);
+	}
 
 	/** whether source span overlaps with those of a hypothesis */
 	bool Overlap(const Hypothesis &hypothesis) const;
@@ -167,3 +180,4 @@ public:
 };
 
 
+
diff --git a/moses/src/XmlOption.cpp b/moses/src/XmlOption.cpp
index e71dfc97f..6c18aac12 100644
--- a/moses/src/XmlOption.cpp
+++ b/moses/src/XmlOption.cpp
@@ -26,6 +26,7 @@
 #include <iostream>
 #include "Util.h"
 #include "StaticData.h"
+#include "TranslationOption.h"
 
 namespace {
 
@@ -91,10 +92,11 @@ inline std::vector<std::string> TokenizeXml(const std::string& str)
 
 }
 
-std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
+std::vector<TranslationOption*> ProcessAndStripXMLTags(std::string& line, const InputType &source) {
 	//parse XML markup in translation line
-	std::vector<XmlOption> res;
+	std::vector<TranslationOption*> res;
 	std::string rstr;
+	std::string linkedStr;
 	if (line.find_first_of('<') == std::string::npos) { return res; }
 	std::vector<std::string> xmlTokens = TokenizeXml(line);
 	std::string tagName = "";
@@ -106,6 +108,7 @@ std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
 	size_t curWord=0;
 	int numUnary = 0;
 	bool doClose = false;
+	bool isLinked = false;
 	for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
 	{
 		if(!isXmlTag(xmlTokens[xmlTokenPos]))
@@ -127,7 +130,30 @@ std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
 				nextTagName = tag.substr(0,endOfName);
 				tagContents = tag.substr(endOfName+1);
 			}
-			if (isOpen)
+			if (nextTagName == "linked") {
+				isLinked = true;
+				linkedStr = "";
+			}
+			else if (nextTagName == "/linked") {
+				isLinked = false;
+				// recurse to process linked tags
+				std::vector<TranslationOption*> tOptions = ProcessAndStripXMLTags(linkedStr, source);
+				// link them together
+				std::vector<TranslationOption*>::const_iterator iterTransOpts1;
+				std::vector<TranslationOption*>::const_iterator iterTransOpts2;
+				for (iterTransOpts1 = tOptions.begin(); iterTransOpts1 != tOptions.end(); iterTransOpts1++) {
+					for (iterTransOpts2 = tOptions.begin(); iterTransOpts2 != tOptions.end(); iterTransOpts2++) {
+						if (iterTransOpts1 != iterTransOpts2) {
+							(**iterTransOpts1).AddLinkedTransOpt(*iterTransOpts2);
+						}
+					}
+					res.push_back(*iterTransOpts1);
+				}
+			}
+			else if (isLinked) {
+				linkedStr += xmlTokens[xmlTokenPos];
+			}
+			else if (isOpen)
 			{
 				//this is an open tag
 				tagName = nextTagName;
@@ -182,6 +208,7 @@ std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
 				//TODO: deal with multiple XML options here
 
 				if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
+					const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
 					for (size_t i=0; i<altTexts.size(); ++i) {
 						//only store options if we aren't ignoring them
 						//set default probability
@@ -189,7 +216,15 @@ std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
 						if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
 						//Convert from prob to log-prob
 						float scoreValue = FloorScore(TransformScore(probValue));
-						XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
+						
+						TargetPhrase targetPhrase(Output);
+						targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],StaticData::Instance().GetFactorDelimiter());
+						targetPhrase.SetScore(scoreValue);
+						WordsRange range(tagStart,tagEnd);
+						
+						TranslationOption *option = new TranslationOption(range,targetPhrase,source);
+						assert(option);
+						
 						res.push_back(option);
 					}
 				}
@@ -205,3 +240,4 @@ std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line) {
 	return res;
 }
 
+
diff --git a/moses/src/XmlOption.h b/moses/src/XmlOption.h
index e211a6398..dc9efe650 100644
--- a/moses/src/XmlOption.h
+++ b/moses/src/XmlOption.h
@@ -2,6 +2,7 @@
 
 #include <vector>
 #include <string>
+#include "InputType.h"
 
 /** This struct is used for storing XML force translation data for a given range in the sentence
  */
@@ -18,5 +19,6 @@ struct XmlOption {
 
 };
 
-std::vector<XmlOption> ProcessAndStripXMLTags(std::string& line);
+std::vector<TranslationOption*> ProcessAndStripXMLTags(std::string& line, const InputType &source);
+
author	hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>	2008-02-09 14:37:41 +0300
committer	hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>	2008-02-09 14:37:41 +0300
commit	2f091ce8f7eed08095b59b5746b8a42c21200458 (patch)
tree	6c0da57b2f163c0e2088c3b7ba8dc01e5883b64c /moses
parent	fd60fe93b9e1bb2f210fbea861d78d7a84b37624 (diff)