
github.com/moses-smt/mosesdecoder.git
author     zens <zens@1f5c12ca-751b-0410-a591-d2e778427230>   2006-07-28 22:14:20 +0400
committer  zens <zens@1f5c12ca-751b-0410-a591-d2e778427230>   2006-07-28 22:14:20 +0400
commit     0e6517b428a5537861e8dcfa0795eed96481cfff (patch)
tree       7abbfb59afbfffa86e51d4d8aab9c929fcaa0419 /misc
parent     652b6236d8e6212669237fb0f3972483d6233627 (diff)
- fixing already fixed bugs that have been re-introduced by someone
- moving some stuff from PhraseDictionaryTree to GenerateTuples
- removing some obsolete code

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@359 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'misc')
-rw-r--r--  misc/GenerateTuples.cpp     | 294
-rw-r--r--  misc/GenerateTuples.h       |  12
-rw-r--r--  misc/Makefile               |  13
-rw-r--r--  misc/processPhraseTable.cpp |  16
4 files changed, 325 insertions(+), 10 deletions(-)
diff --git a/misc/GenerateTuples.cpp b/misc/GenerateTuples.cpp
new file mode 100644
index 000000000..97b0bdd96
--- /dev/null
+++ b/misc/GenerateTuples.cpp
@@ -0,0 +1,294 @@
+
+////////////////////////////////////////////////////////////
+//
+// generate set of target candidates for confusion net
+//
+////////////////////////////////////////////////////////////
+
+
+
+#include <numeric>
+#include "Word.h"
+#include "Phrase.h"
+#include "ConfusionNet.h"
+#include "WordsRange.h"
+#include "PhraseDictionaryTree.h"
+#if 0
+// Generates all tuples from n indexes with ranges 0 to ranges[j]-1, respectively.
+// Input: number of indexes and ranges: ranges[0] ... ranges[num_idx-1]
+// Output: number of tuples and monodimensional array of tuples.
+// Reference: mixed-radix generation algorithm (D. E. Knuth, TAOCP v. 4.2)
+
+size_t GenerateTuples(unsigned num_idx,unsigned* ranges,unsigned *&tuples)
+{
+ unsigned* single_tuple= new unsigned[num_idx+1];
+ unsigned num_tuples=1;
+
+ for (unsigned k=0;k<num_idx;++k)
+ {
+ num_tuples *= ranges[k];
+ single_tuple[k]=0;
+ }
+
+ tuples=new unsigned[num_idx * num_tuples];
+
+ // we need this additional element for the last iteration
+ single_tuple[num_idx]=0;
+ unsigned j=0;
+ for (unsigned n=0;n<num_tuples;++n){
+ memcpy((void *)((tuples + n * num_idx)),(void *)single_tuple,num_idx * sizeof(unsigned));
+ j=0;
+ while (single_tuple[j]==ranges[j]-1){single_tuple[j]=0; ++j;}
+ ++single_tuple[j];
+ }
+ delete [] single_tuple;
+ return num_tuples;
+}
+
+
+typedef PhraseDictionaryTree::PrefixPtr PPtr;
+typedef std::vector<PPtr> vPPtr;
+typedef std::vector<std::vector<Factor const*> > mPhrase;
+
+std::ostream& operator<<(std::ostream& out,const mPhrase& p) {
+ for(size_t i=0;i<p.size();++i) {
+ out<<i<<" - ";
+ for(size_t j=0;j<p[i].size();++j)
+ out<<p[i][j]->ToString()<<" ";
+ out<<"|";
+ }
+
+ return out;
+}
+
+struct State {
+ vPPtr ptrs;
+ WordsRange range;
+ float score;
+
+ State() : range(0,0),score(0.0) {}
+ State(size_t b,size_t e,const vPPtr& v,float sc=0.0) : ptrs(v),range(b,e),score(sc) {}
+
+ size_t begin() const {return range.GetStartPos();}
+ size_t end() const {return range.GetEndPos();}
+ float GetScore() const {return score;}
+
+};
+
+std::ostream& operator<<(std::ostream& out,const State& s) {
+ out<<"["<<s.ptrs.size()<<" ("<<s.begin()<<","<<s.end()<<") "<<s.GetScore()<<"]";
+
+ return out;
+}
+
+typedef std::map<mPhrase,float> E2Costs;
+
+
+struct GCData {
+ const std::vector<PhraseDictionaryTree const*>& pdicts;
+ const std::vector<std::vector<float> >& weights;
+ std::vector<FactorType> inF,outF;
+ size_t distinctOutputFactors;
+ vPPtr root;
+ size_t totalTuples,distinctTuples;
+
+
+ GCData(const std::vector<PhraseDictionaryTree const*>& a,
+ const std::vector<std::vector<float> >& b)
+ : pdicts(a),weights(b),totalTuples(0),distinctTuples(0) {
+
+ assert(pdicts.size()==weights.size());
+ std::set<FactorType> distinctOutFset;
+ inF.resize(pdicts.size());
+ outF.resize(pdicts.size());
+ root.resize(pdicts.size());
+ for(size_t i=0;i<pdicts.size();++i)
+ {
+ root[i]=pdicts[i]->GetRoot();
+ inF[i]=pdicts[i]->GetInputFactorType();
+ outF[i]=pdicts[i]->GetOutputFactorType();
+ distinctOutFset.insert(pdicts[i]->GetOutputFactorType());
+ }
+ distinctOutputFactors=distinctOutFset.size();
+ }
+
+ FactorType OutFT(size_t i) const {return outF[i];}
+ FactorType InFT(size_t i) const {return inF[i];}
+ size_t DistinctOutFactors() const {return distinctOutputFactors;}
+
+ const vPPtr& GetRoot() const {return root;}
+
+};
+
+typedef std::vector<Factor const*> vFactor;
+typedef std::vector<std::pair<float,vFactor> > TgtCandList;
+
+typedef std::vector<TgtCandList> OutputFactor2TgtCandList;
+typedef std::vector<OutputFactor2TgtCandList*> Len2Cands;
+
+void GeneratePerFactorTgtList(size_t factorType,PPtr pptr,GCData& data,Len2Cands& len2cands)
+{
+ std::vector<FactorTgtCand> cands;
+ data.pdicts[factorType]->GetTargetCandidates(pptr,cands);
+
+ for(std::vector<FactorTgtCand>::const_iterator cand=cands.begin();cand!=cands.end();++cand) {
+ assert(data.weights[factorType].size()==cand->second.size());
+ float costs=std::inner_product(data.weights[factorType].begin(),
+ data.weights[factorType].end(),
+ cand->second.begin(),
+ 0.0);
+
+ size_t len=cand->first.size();
+ if(len>=len2cands.size()) len2cands.resize(len+1,0);
+ if(!len2cands[len]) len2cands[len]=new OutputFactor2TgtCandList(data.DistinctOutFactors());
+ OutputFactor2TgtCandList &outf2tcandlist=*len2cands[len];
+
+ outf2tcandlist[data.OutFT(factorType)].push_back(std::make_pair(costs,cand->first));
+ }
+}
+
+void GenerateTupleTgtCands(OutputFactor2TgtCandList& tCand,E2Costs& e2costs,GCData& data)
+{
+ // check if candidates are non-empty
+ bool gotCands=1;
+ for(size_t j=0;gotCands && j<tCand.size();++j)
+ gotCands &= !tCand[j].empty();
+
+ if(gotCands) {
+ // enumerate tuples
+ assert(data.DistinctOutFactors()==tCand.size());
+ std::vector<unsigned> radix(data.DistinctOutFactors());
+ for(size_t i=0;i<tCand.size();++i) radix[i]=tCand[i].size();
+
+ unsigned *tuples=0;
+ size_t numTuples=GenerateTuples(radix.size(),&radix[0],tuples);
+
+ data.totalTuples+=numTuples;
+
+ for(size_t i=0;i<numTuples;++i)
+ {
+ mPhrase e(radix.size());float costs=0.0;
+ for(size_t j=0;j<radix.size();++j)
+ {
+ assert(tuples[radix.size()*i+j]<tCand[j].size());
+ std::pair<float,vFactor> const& mycand=tCand[j][tuples[radix.size()*i+j]];
+ e[j]=mycand.second;
+ costs+=mycand.first;
+ }
+#ifdef DEBUG
+ bool mismatch=0;
+ for(size_t j=1;!mismatch && j<e.size();++j)
+ if(e[j].size()!=e[j-1].size()) mismatch=1;
+ assert(mismatch==0);
+#endif
+ std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(e,costs));
+ if(p.second) ++data.distinctTuples;
+ else {
+ // entry known, take min of costs, alternative: sum probs
+ if(costs<p.first->second) p.first->second=costs;
+ }
+ }
+ delete [] tuples;
+ }
+}
+
+void GenerateCandidates_(E2Costs& e2costs,const vPPtr& nextP,GCData& data)
+{
+ Len2Cands len2cands;
+ // generate candidates for each element of nextP:
+ for(size_t factorType=0;factorType<nextP.size();++factorType)
+ if(nextP[factorType])
+ GeneratePerFactorTgtList(factorType,nextP[factorType],data,len2cands);
+
+ // for each length: enumerate tuples, compute score, and insert in e2costs
+ for(size_t len=0;len<len2cands.size();++len) if(len2cands[len])
+ GenerateTupleTgtCands(*len2cands[len],e2costs,data);
+}
+
+void GenerateCandidates(const ConfusionNet& src,
+ const std::vector<PhraseDictionaryTree const*>& pdicts,
+ const std::vector<std::vector<float> >& weights,
+ int verbose) {
+ GCData data(pdicts,weights);
+
+ std::vector<State> stack;
+ for(size_t i=0;i<src.GetSize();++i) stack.push_back(State(i,i,data.GetRoot()));
+
+ std::map<WordsRange,E2Costs> cov2E;
+
+ // std::cerr<<"start while loop. initial stack size: "<<stack.size()<<"\n";
+
+ while(!stack.empty())
+ {
+ State curr(stack.back());
+ stack.pop_back();
+
+ //std::cerr<<"processing state "<<curr<<" stack size: "<<stack.size()<<"\n";
+
+ assert(curr.end()<src.GetSize());
+ const ConfusionNet::Column &currCol=src[curr.end()];
+ for(size_t colidx=0;colidx<currCol.size();++colidx)
+ {
+ const Word& w=currCol[colidx].first;
+ vPPtr nextP(curr.ptrs);
+ for(size_t j=0;j<nextP.size();++j)
+ nextP[j]=pdicts[j]->Extend(nextP[j],
+ w.GetFactor(data.InFT(j))->GetString());
+
+ bool valid=1;
+ for(size_t j=0;j<nextP.size();++j) if(!nextP[j]) {valid=0;break;}
+
+ if(valid)
+ {
+ if(curr.end()+1<src.GetSize())
+ stack.push_back(State(curr.begin(),curr.end()+1,nextP,
+ curr.GetScore()+currCol[colidx].second));
+
+ E2Costs &e2costs=cov2E[WordsRange(curr.begin(),curr.end()+1)];
+ GenerateCandidates_(e2costs,nextP,data);
+ }
+ }
+
+ // check if there are translations of one-word phrases ...
+ //if(curr.begin()==curr.end() && tCand.empty()) {}
+
+ } // end while(!stack.empty())
+
+ if(verbose) {
+ // print statistics for debugging purposes
+ std::cerr<<"tuple stats: total: "<<data.totalTuples
+ <<" distinct: "<<data.distinctTuples<<" ("
+ <<(data.distinctTuples/(0.01*data.totalTuples))
+ <<"%)\n";
+ std::cerr<<"per coverage set:\n";
+ for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
+ i!=cov2E.end();++i) {
+ std::cerr<<i->first<<" -- distinct cands: "
+ <<i->second.size()<<"\n";
+ }
+ std::cerr<<"\n\n";
+ }
+
+ if(verbose>10) {
+ std::cerr<<"full list:\n";
+ for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
+ i!=cov2E.end();++i) {
+ std::cerr<<i->first<<" -- distinct cands: "
+ <<i->second.size()<<"\n";
+ for(E2Costs::const_iterator j=i->second.begin();j!=i->second.end();++j)
+ std::cerr<<j->first<<" -- "<<j->second<<"\n";
+ }
+ }
+}
+
+#else
+
+void GenerateCandidates(const ConfusionNet&,
+ const std::vector<PhraseDictionaryTree const*>&,
+ const std::vector<std::vector<float> >&,
+ int)
+{
+ std::cerr<<"ERROR: GenerateCandidates is currently broken\n";
+}
+
+#endif
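Note on the GenerateTuples() routine added above: it is the mixed-radix counter its header comment credits to Knuth (TAOCP, generating all n-tuples). The following minimal, self-contained sketch shows the same enumeration using only the standard library and without the extra guard element; it is illustrative only and is not part of this commit or of the Moses API.

// Illustrative sketch only (not part of this commit): enumerate all index
// tuples (t[0],...,t[n-1]) with 0 <= t[j] < ranges[j], mixed-radix style.
#include <iostream>
#include <vector>

int main() {
    std::vector<unsigned> ranges;          // example radices: 2 x 3 x 2 = 12 tuples
    ranges.push_back(2); ranges.push_back(3); ranges.push_back(2);

    std::vector<unsigned> t(ranges.size(), 0);
    for (;;) {
        for (size_t j = 0; j < t.size(); ++j) std::cout << t[j] << ' ';
        std::cout << '\n';
        // carry: reset maxed-out digits, then bump the next one
        size_t j = 0;
        while (j < t.size() && t[j] == ranges[j] - 1) { t[j] = 0; ++j; }
        if (j == t.size()) break;          // all digits were maxed: done
        ++t[j];
    }
    return 0;
}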
diff --git a/misc/GenerateTuples.h b/misc/GenerateTuples.h
new file mode 100644
index 000000000..362c1534f
--- /dev/null
+++ b/misc/GenerateTuples.h
@@ -0,0 +1,12 @@
+// $Id$
+#ifndef GENERATETUPLES_H_
+#define GENERATETUPLES_H_
+#include "PhraseDictionaryTree.h"
+
+class ConfusionNet;
+
+void GenerateCandidates(const ConfusionNet& src,
+ const std::vector<PhraseDictionaryTree const*>& pdicts,
+ const std::vector<std::vector<float> >& weights,
+ int verbose=0) ;
+#endif
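For orientation, a hypothetical call site for the declaration above could look like the sketch below. The ConfusionNet and the phrase dictionaries are assumed to be constructed elsewhere (their setup is not part of this header), and the uniform weight vectors are purely illustrative: each inner vector must match the number of score components of the corresponding dictionary.

// Hypothetical usage sketch (not part of this commit).
#include <vector>
#include "ConfusionNet.h"
#include "GenerateTuples.h"

void runGeneration(const ConfusionNet& src,
                   const std::vector<PhraseDictionaryTree const*>& pdicts) {
    // assumption: one score component per dictionary, weighted 1.0
    std::vector<std::vector<float> > weights(pdicts.size(),
                                             std::vector<float>(1, 1.0f));
    GenerateCandidates(src, pdicts, weights, /*verbose=*/1);
}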
diff --git a/misc/Makefile b/misc/Makefile
index a21823f27..ab038611e 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -1,22 +1,25 @@
BOOSTDIR=/home/ws06/cdyer/boost-stage
+SRIDIR=/home/ws06/cdyer/srilm/lib/i686
CXX=g++
CXXFLAGS=-W -Wall -O0 -g -ggdb -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES
LDFLAGS=-static
INCLUDES=-I../moses/src -I$(BOOSTDIR) -I$(BOOSTDIR)/include
-LIBS=-L$(BOOSTDIR)/lib -L$(BOOSTDIR)/stage/lib -lboost_iostreams-gcc-mt -lboost_filesystem-gcc-mt -lboost_thread-gcc-mt -lz
-
+BOOSTLIBS=-L$(BOOSTDIR)/lib -L$(BOOSTDIR)/stage/lib -lboost_iostreams-gcc-mt -lboost_filesystem-gcc-mt -lboost_thread-gcc-mt -lz
+SRILIBS=-L$(SRIDIR) -loolm -ldstruct -lmisc
default: processPhraseTable
-processPhraseTable.o: processPhraseTable.cpp
+%.o: %.cpp
$(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@
+
+
MOSESLIB =../moses/src/libmoses.a
-processPhraseTable: processPhraseTable.o $(MOSESLIB)
- $(CXX) $(LDFLAGS) $^ -o $@ $(LIBS)
+processPhraseTable: processPhraseTable.o GenerateTuples.o $(MOSESLIB)
+ $(CXX) $(LDFLAGS) $^ -o $@ $(SRILIBS) $(BOOSTLIBS)
diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp
index d7becb8a6..9a92daeac 100644
--- a/misc/processPhraseTable.cpp
+++ b/misc/processPhraseTable.cpp
@@ -12,6 +12,9 @@
#include "FactorCollection.h"
#include "Phrase.h"
#include "InputFileStream.h"
+#include "Timer.h"
+
+Timer timer;
template<typename T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
@@ -86,20 +89,20 @@ int main(int argc,char **argv) {
std::cerr<<"processing ptree for\n";
if(ftts.size()==1 && ftts[0].first=="-") {
- PhraseDictionaryTree pdt(noScoreComponent,&factorCollection);
+ PhraseDictionaryTree pdt(noScoreComponent);
pdt.Create(std::cin,fto);}
else
{
-
+#if 0
std::vector<PhraseDictionaryTree const*> pdicts;
std::vector<FactorType> factorOrder;
for(size_t i=0;i<ftts.size();++i) {
+
PhraseDictionaryTree *pdtptr=new PhraseDictionaryTree(noScoreComponent,
&factorCollection,
getFactorType(atoi(ftts[i].second.first)),
getFactorType(atoi(ftts[i].second.second))
);
-
factorOrder.push_back(pdtptr->GetInputFactorType());
PhraseDictionaryTree &pdt=*pdtptr;
pdicts.push_back(pdtptr);
@@ -180,8 +183,11 @@ int main(int argc,char **argv) {
}
}
-
- }
+#else
+ std::cerr<<"ERROR: these functions are currently broken...\n";
+ exit(1);
+#endif
+ }
}
}