
github.com/moses-smt/mosesdecoder.git
author     zens <zens@1f5c12ca-751b-0410-a591-d2e778427230>   2006-07-28 22:14:20 +0400
committer  zens <zens@1f5c12ca-751b-0410-a591-d2e778427230>   2006-07-28 22:14:20 +0400
commit     0e6517b428a5537861e8dcfa0795eed96481cfff (patch)
tree       7abbfb59afbfffa86e51d4d8aab9c929fcaa0419 /misc
parent     652b6236d8e6212669237fb0f3972483d6233627 (diff)
- fixing already fixed bugs that have been re-introduced by someone
- moving some stuff from PhraseDictionaryTree to GenerateTuples
- removing some obsolete code

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@359 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'misc')
-rw-r--r--  misc/GenerateTuples.cpp     | 294
-rw-r--r--  misc/GenerateTuples.h       |  12
-rw-r--r--  misc/Makefile               |  13
-rw-r--r--  misc/processPhraseTable.cpp |  16
4 files changed, 325 insertions(+), 10 deletions(-)
diff --git a/misc/GenerateTuples.cpp b/misc/GenerateTuples.cpp
new file mode 100644
index 000000000..97b0bdd96
--- /dev/null
+++ b/misc/GenerateTuples.cpp
@@ -0,0 +1,294 @@
+
+////////////////////////////////////////////////////////////
+//
+// generate set of target candidates for confusion net
+//
+////////////////////////////////////////////////////////////
+
+
+
+#include <numeric>
+#include "Word.h"
+#include "Phrase.h"
+#include "ConfusionNet.h"
+#include "WordsRange.h"
+#include "PhraseDictionaryTree.h"
+#if 0
+// Generates all tuples from n indexes with ranges 0 to ranges[j]-1, respectively.
+// Input: number of indexes and ranges: ranges[0] ... ranges[num_idx-1]
+// Output: number of tuples and monodimensional array of tuples.
+// Reference: mixed-radix generation algorithm (D. E. Knuth, TAOCP v. 4.2)
+
+size_t GenerateTuples(unsigned num_idx,unsigned* ranges,unsigned *&tuples)
+{
+ unsigned* single_tuple= new unsigned[num_idx+1];
+ unsigned num_tuples=1;
+
+ for (unsigned k=0;k<num_idx;++k)
+ {
+ num_tuples *= ranges[k];
+ single_tuple[k]=0;
+ }
+
+ tuples=new unsigned[num_idx * num_tuples];
+
+ // we need this additional element for the last iteration
+ single_tuple[num_idx]=0;
+ unsigned j=0;
+ for (unsigned n=0;n<num_tuples;++n){
+ memcpy((void *)((tuples + n * num_idx)),(void *)single_tuple,num_idx * sizeof(unsigned));
+ j=0;
+ while (single_tuple[j]==ranges[j]-1){single_tuple[j]=0; ++j;}
+ ++single_tuple[j];
+ }
+ delete [] single_tuple;
+ return num_tuples;
+}
+
+
+typedef PhraseDictionaryTree::PrefixPtr PPtr;
+typedef std::vector<PPtr> vPPtr;
+typedef std::vector<std::vector<Factor const*> > mPhrase;
+
+std::ostream& operator<<(std::ostream& out,const mPhrase& p) {
+ for(size_t i=0;i<p.size();++i) {
+ out<<i<<" - ";
+ for(size_t j=0;j<p[i].size();++j)
+ out<<p[i][j]->ToString()<<" ";
+ out<<"|";
+ }
+
+ return out;
+}
+
+struct State {
+ vPPtr ptrs;
+ WordsRange range;
+ float score;
+
+ State() : range(0,0),score(0.0) {}
+ State(size_t b,size_t e,const vPPtr& v,float sc=0.0) : ptrs(v),range(b,e),score(sc) {}
+
+ size_t begin() const {return range.GetStartPos();}
+ size_t end() const {return range.GetEndPos();}
+ float GetScore() const {return score;}
+
+};
+
+std::ostream& operator<<(std::ostream& out,const State& s) {
+ out<<"["<<s.ptrs.size()<<" ("<<s.begin()<<","<<s.end()<<") "<<s.GetScore()<<"]";
+
+ return out;
+}
+
+typedef std::map<mPhrase,float> E2Costs;
+
+
+struct GCData {
+ const std::vector<PhraseDictionaryTree const*>& pdicts;
+ const std::vector<std::vector<float> >& weights;
+ std::vector<FactorType> inF,outF;
+ size_t distinctOutputFactors;
+ vPPtr root;
+ size_t totalTuples,distinctTuples;
+
+
+ GCData(const std::vector<PhraseDictionaryTree const*>& a,
+ const std::vector<std::vector<float> >& b)
+ : pdicts(a),weights(b),totalTuples(0),distinctTuples(0) {
+
+ assert(pdicts.size()==weights.size());
+ std::set<FactorType> distinctOutFset;
+ inF.resize(pdicts.size());
+ outF.resize(pdicts.size());
+ root.resize(pdicts.size());
+ for(size_t i=0;i<pdicts.size();++i)
+ {
+ root[i]=pdicts[i]->GetRoot();
+ inF[i]=pdicts[i]->GetInputFactorType();
+ outF[i]=pdicts[i]->GetOutputFactorType();
+ distinctOutFset.insert(pdicts[i]->GetOutputFactorType());
+ }
+ distinctOutputFactors=distinctOutFset.size();
+ }
+
+ FactorType OutFT(size_t i) const {return outF[i];}
+ FactorType InFT(size_t i) const {return inF[i];}
+ size_t DistinctOutFactors() const {return distinctOutputFactors;}
+
+ const vPPtr& GetRoot() const {return root;}
+
+};
+
+typedef std::vector<Factor const*> vFactor;
+typedef std::vector<std::pair<float,vFactor> > TgtCandList;
+
+typedef std::vector<TgtCandList> OutputFactor2TgtCandList;
+typedef std::vector<OutputFactor2TgtCandList*> Len2Cands;
+
+void GeneratePerFactorTgtList(size_t factorType,PPtr pptr,GCData& data,Len2Cands& len2cands)
+{
+ std::vector<FactorTgtCand> cands;
+ data.pdicts[factorType]->GetTargetCandidates(pptr,cands);
+
+ for(std::vector<FactorTgtCand>::const_iterator cand=cands.begin();cand!=cands.end();++cand) {
+ assert(data.weights[factorType].size()==cand->second.size());
+ float costs=std::inner_product(data.weights[factorType].begin(),
+ data.weights[factorType].end(),
+ cand->second.begin(),
+ 0.0);
+
+ size_t len=cand->first.size();
+ if(len>=len2cands.size()) len2cands.resize(len+1,0);
+ if(!len2cands[len]) len2cands[len]=new OutputFactor2TgtCandList(data.DistinctOutFactors());
+ OutputFactor2TgtCandList &outf2tcandlist=*len2cands[len];
+
+ outf2tcandlist[data.OutFT(factorType)].push_back(std::make_pair(costs,cand->first));
+ }
+}
+
+void GenerateTupleTgtCands(OutputFactor2TgtCandList& tCand,E2Costs& e2costs,GCData& data)
+{
+ // check if candidates are non-empty
+ bool gotCands=1;
+ for(size_t j=0;gotCands && j<tCand.size();++j)
+ gotCands &= !tCand[j].empty();
+
+ if(gotCands) {
+ // enumerate tuples
+ assert(data.DistinctOutFactors()==tCand.size());
+ std::vector<unsigned> radix(data.DistinctOutFactors());
+ for(size_t i=0;i<tCand.size();++i) radix[i]=tCand[i].size();
+
+ unsigned *tuples=0;
+ size_t numTuples=GenerateTuples(radix.size(),&radix[0],tuples);
+
+ data.totalTuples+=numTuples;
+
+ for(size_t i=0;i<numTuples;++i)
+ {
+ mPhrase e(radix.size());float costs=0.0;
+ for(size_t j=0;j<radix.size();++j)
+ {
+ assert(tuples[radix.size()*i+j]<tCand[j].size());
+ std::pair<float,vFactor> const& mycand=tCand[j][tuples[radix.size()*i+j]];
+ e[j]=mycand.second;
+ costs+=mycand.first;
+ }
+#ifdef DEBUG
+ bool mismatch=0;
+ for(size_t j=1;!mismatch && j<e.size();++j)
+ if(e[j].size()!=e[j-1].size()) mismatch=1;
+ assert(mismatch==0);
+#endif
+ std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(e,costs));
+ if(p.second) ++data.distinctTuples;
+ else {
+ // entry known, take min of costs, alternative: sum probs
+ if(costs<p.first->second) p.first->second=costs;
+ }
+ }
+ delete [] tuples;
+ }
+}
+
+void GenerateCandidates_(E2Costs& e2costs,const vPPtr& nextP,GCData& data)
+{
+ Len2Cands len2cands;
+ // generate candidates for each element of nextP:
+ for(size_t factorType=0;factorType<nextP.size();++factorType)
+ if(nextP[factorType])
+ GeneratePerFactorTgtList(factorType,nextP[factorType],data,len2cands);
+
+ // for each length: enumerate tuples, compute score, and insert in e2costs
+ for(size_t len=0;len<len2cands.size();++len) if(len2cands[len])
+ GenerateTupleTgtCands(*len2cands[len],e2costs,data);
+}
+
+void GenerateCandidates(const ConfusionNet& src,
+ const std::vector<PhraseDictionaryTree const*>& pdicts,
+ const std::vector<std::vector<float> >& weights,
+ int verbose) {
+ GCData data(pdicts,weights);
+
+ std::vector<State> stack;
+ for(size_t i=0;i<src.GetSize();++i) stack.push_back(State(i,i,data.GetRoot()));
+
+ std::map<WordsRange,E2Costs> cov2E;
+
+ // std::cerr<<"start while loop. initial stack size: "<<stack.size()<<"\n";
+
+ while(!stack.empty())
+ {
+ State curr(stack.back());
+ stack.pop_back();
+
+ //std::cerr<<"processing state "<<curr<<" stack size: "<<stack.size()<<"\n";
+
+ assert(curr.end()<src.GetSize());
+ const ConfusionNet::Column &currCol=src[curr.end()];
+ for(size_t colidx=0;colidx<currCol.size();++colidx)
+ {
+ const Word& w=currCol[colidx].first;
+ vPPtr nextP(curr.ptrs);
+ for(size_t j=0;j<nextP.size();++j)
+ nextP[j]=pdicts[j]->Extend(nextP[j],
+ w.GetFactor(data.InFT(j))->GetString());
+
+ bool valid=1;
+ for(size_t j=0;j<nextP.size();++j) if(!nextP[j]) {valid=0;break;}
+
+ if(valid)
+ {
+ if(curr.end()+1<src.GetSize())
+ stack.push_back(State(curr.begin(),curr.end()+1,nextP,
+ curr.GetScore()+currCol[colidx].second));
+
+ E2Costs &e2costs=cov2E[WordsRange(curr.begin(),curr.end()+1)];
+ GenerateCandidates_(e2costs,nextP,data);
+ }
+ }
+
+ // check if there are translations of one-word phrases ...
+ //if(curr.begin()==curr.end() && tCand.empty()) {}
+
+ } // end while(!stack.empty())
+
+ if(verbose) {
+ // print statistics for debugging purposes
+ std::cerr<<"tuple stats: total: "<<data.totalTuples
+ <<" distinct: "<<data.distinctTuples<<" ("
+ <<(data.distinctTuples/(0.01*data.totalTuples))
+ <<"%)\n";
+ std::cerr<<"per coverage set:\n";
+ for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
+ i!=cov2E.end();++i) {
+ std::cerr<<i->first<<" -- distinct cands: "
+ <<i->second.size()<<"\n";
+ }
+ std::cerr<<"\n\n";
+ }
+
+ if(verbose>10) {
+ std::cerr<<"full list:\n";
+ for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
+ i!=cov2E.end();++i) {
+ std::cerr<<i->first<<" -- distinct cands: "
+ <<i->second.size()<<"\n";
+ for(E2Costs::const_iterator j=i->second.begin();j!=i->second.end();++j)
+ std::cerr<<j->first<<" -- "<<j->second<<"\n";
+ }
+ }
+}
+
+#else
+
+void GenerateCandidates(const ConfusionNet&,
+ const std::vector<PhraseDictionaryTree const*>&,
+ const std::vector<std::vector<float> >&,
+ int)
+{
+ std::cerr<<"ERROR: GenerateCandidates is currently broken\n";
+}
+
+#endif
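Note on the GenerateTuples() routine added above: it is the mixed-radix counter its header comment credits to Knuth (TAOCP, generating all n-tuples). The following minimal, self-contained sketch shows the same enumeration using only the standard library and without the extra guard element; it is illustrative only and is not part of this commit or of the Moses API.

// Illustrative sketch only (not part of this commit): enumerate all index
// tuples (t[0],...,t[n-1]) with 0 <= t[j] < ranges[j], mixed-radix style.
#include <iostream>
#include <vector>

int main() {
    std::vector<unsigned> ranges;          // example radices: 2 x 3 x 2 = 12 tuples
    ranges.push_back(2); ranges.push_back(3); ranges.push_back(2);

    std::vector<unsigned> t(ranges.size(), 0);
    for (;;) {
        for (size_t j = 0; j < t.size(); ++j) std::cout << t[j] << ' ';
        std::cout << '\n';
        // carry: reset maxed-out digits, then bump the next one
        size_t j = 0;
        while (j < t.size() && t[j] == ranges[j] - 1) { t[j] = 0; ++j; }
        if (j == t.size()) break;          // all digits were maxed: done
        ++t[j];
    }
    return 0;
}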
diff --git a/misc/GenerateTuples.h b/misc/GenerateTuples.h
new file mode 100644
index 000000000..362c1534f
--- /dev/null
+++ b/misc/GenerateTuples.h
@@ -0,0 +1,12 @@
+// $Id$
+#ifndef GENERATETUPLES_H_
+#define GENERATETUPLES_H_
+#include "PhraseDictionaryTree.h"
+
+class ConfusionNet;
+
+void GenerateCandidates(const ConfusionNet& src,
+ const std::vector<PhraseDictionaryTree const*>& pdicts,
+ const std::vector<std::vector<float> >& weights,
+ int verbose=0) ;
+#endif
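For orientation, a hypothetical call site for the declaration above could look like the sketch below. The ConfusionNet and the phrase dictionaries are assumed to be constructed elsewhere (their setup is not part of this header), and the uniform weight vectors are purely illustrative: each inner vector must match the number of score components of the corresponding dictionary.

// Hypothetical usage sketch (not part of this commit).
#include <vector>
#include "ConfusionNet.h"
#include "GenerateTuples.h"

void runGeneration(const ConfusionNet& src,
                   const std::vector<PhraseDictionaryTree const*>& pdicts) {
    // assumption: one score component per dictionary, weighted 1.0
    std::vector<std::vector<float> > weights(pdicts.size(),
                                             std::vector<float>(1, 1.0f));
    GenerateCandidates(src, pdicts, weights, /*verbose=*/1);
}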
diff --git a/misc/Makefile b/misc/Makefile
index a21823f27..ab038611e 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -1,22 +1,25 @@
BOOSTDIR=/home/ws06/cdyer/boost-stage
+SRIDIR=/home/ws06/cdyer/srilm/lib/i686
CXX=g++
CXXFLAGS=-W -Wall -O0 -g -ggdb -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES
LDFLAGS=-static
INCLUDES=-I../moses/src -I$(BOOSTDIR) -I$(BOOSTDIR)/include
-LIBS=-L$(BOOSTDIR)/lib -L$(BOOSTDIR)/stage/lib -lboost_iostreams-gcc-mt -lboost_filesystem-gcc-mt -lboost_thread-gcc-mt -lz
-
+BOOSTLIBS=-L$(BOOSTDIR)/lib -L$(BOOSTDIR)/stage/lib -lboost_iostreams-gcc-mt -lboost_filesystem-gcc-mt -lboost_thread-gcc-mt -lz
+SRILIBS=-L$(SRIDIR) -loolm -ldstruct -lmisc
default: processPhraseTable
-processPhraseTable.o: processPhraseTable.cpp
+%.o: %.cpp
$(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@
+
+
MOSESLIB =../moses/src/libmoses.a
-processPhraseTable: processPhraseTable.o $(MOSESLIB)
- $(CXX) $(LDFLAGS) $^ -o $@ $(LIBS)
+processPhraseTable: processPhraseTable.o GenerateTuples.o $(MOSESLIB)
+ $(CXX) $(LDFLAGS) $^ -o $@ $(SRILIBS) $(BOOSTLIBS)
diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp
index d7becb8a6..9a92daeac 100644
--- a/misc/processPhraseTable.cpp
+++ b/misc/processPhraseTable.cpp
@@ -12,6 +12,9 @@
#include "FactorCollection.h"
#include "Phrase.h"
#include "InputFileStream.h"
+#include "Timer.h"
+
+Timer timer;
template<typename T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
@@ -86,20 +89,20 @@ int main(int argc,char **argv) {
std::cerr<<"processing ptree for\n";
if(ftts.size()==1 && ftts[0].first=="-") {
- PhraseDictionaryTree pdt(noScoreComponent,&factorCollection);
+ PhraseDictionaryTree pdt(noScoreComponent);
pdt.Create(std::cin,fto);}
else
{
-
+#if 0
std::vector<PhraseDictionaryTree const*> pdicts;
std::vector<FactorType> factorOrder;
for(size_t i=0;i<ftts.size();++i) {
+
PhraseDictionaryTree *pdtptr=new PhraseDictionaryTree(noScoreComponent,
&factorCollection,
getFactorType(atoi(ftts[i].second.first)),
getFactorType(atoi(ftts[i].second.second))
);
-
factorOrder.push_back(pdtptr->GetInputFactorType());
PhraseDictionaryTree &pdt=*pdtptr;
pdicts.push_back(pdtptr);
@@ -180,8 +183,11 @@ int main(int argc,char **argv) {
}
}
-
- }
+#else
+ std::cerr<<"ERROR: these functions are currently broken...\n";
+ exit(1);
+#endif
+ }
}
}