Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'misc/processPhraseTable.cpp')
-rw-r--r--misc/processPhraseTable.cpp200
1 files changed, 200 insertions, 0 deletions
diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp
new file mode 100644
index 000000000..89d462b0e
--- /dev/null
+++ b/misc/processPhraseTable.cpp
@@ -0,0 +1,200 @@
+#include <iostream>
+//#include <fstream>
+#include <sstream>
+#include <vector>
+#include <string>
+#include <iterator>
+#include <functional>
+#include <sys/stat.h>
+#include "TypeDef.h"
+#include "PhraseDictionaryTree.h"
+#include "ConfusionNet.h"
+#include "FactorCollection.h"
+#include "Phrase.h"
+#include "InputFileStream.h"
+#include "Timer.h"
+
+using namespace std;
+using namespace Moses;
+
+Timer timer;
+
+template<typename T>
+std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
+{
+ out<<x.size()<<" ";
+ typename std::vector<T>::const_iterator iend=x.end();
+ for(typename std::vector<T>::const_iterator i=x.begin();i!=iend;++i)
+ out<<*i<<' ';
+ return out;
+}
+
+inline bool existsFile(const char* filename) {
+ struct stat mystat;
+ return (stat(filename,&mystat)==0);
+}
+inline bool existsFile(const std::string& filename) {
+ return existsFile(filename.c_str());
+}
+
+int main(int argc,char **argv) {
+ std::string fto;size_t noScoreComponent=5;int cn=0;
+ bool aligninfo=false;
+ std::vector<std::pair<std::string,std::pair<char*,char*> > > ftts;
+ int verb=0;
+ for(int i=1;i<argc;++i) {
+ std::string s(argv[i]);
+ if(s=="-ttable") {
+ std::pair<char*,char*> p;
+ p.first=argv[++i];
+ p.second=argv[++i];
+ ftts.push_back(std::make_pair(std::string(argv[++i]),p));
+ }
+ else if(s=="-nscores") noScoreComponent=atoi(argv[++i]);
+ else if(s=="-out") fto=std::string(argv[++i]);
+ else if(s=="-cn") cn=1;
+ else if(s=="-irst") cn=2;
+ else if(s=="-alignment-info") aligninfo=true;
+ else if(s=="-v") verb=atoi(argv[++i]);
+ else if(s=="-h")
+ {
+ std::cerr<<"usage "<<argv[0]<<" :\n\n"
+ "options:\n"
+ "\t-ttable int int string -- translation table file, use '-' for stdin\n"
+ "\t-out string -- output file name prefix for binary ttable\n"
+ "\t-nscores int -- number of scores in ttable\n"
+ "\t-alignment-info -- include alignment info in the binary ttable (suffix \".wa\")\n"
+ "\nfunctions:\n"
+ "\t - convert ascii ttable in binary format\n"
+ "\t - if ttable is not read from stdin:\n"
+ "\t treat each line as source phrase an print tgt candidates\n"
+ "\n";
+ return 1;
+ }
+ else
+ {
+ std::cerr<<"ERROR: unknown option '"<<s<<"'\n";
+ return 1;
+ }
+ }
+
+
+ if(ftts.size()) {
+
+ if(ftts.size()==1){
+ std::cerr<<"processing ptree for ";
+ PhraseDictionaryTree pdt(noScoreComponent);
+
+ pdt.PrintWordAlignment(aligninfo);
+
+ if (ftts[0].first=="-") {
+ std::cerr<< "stdin\n";
+ pdt.Create(std::cin,fto);
+ }
+ else{
+ std::cerr<< ftts[0].first << "\n";
+ InputFileStream in(ftts[0].first);
+ pdt.Create(in,fto);
+ }
+ }
+ else
+ {
+#if 0
+ std::vector<PhraseDictionaryTree const*> pdicts;
+ std::vector<FactorType> factorOrder;
+ for(size_t i=0;i<ftts.size();++i) {
+
+ PhraseDictionaryTree *pdtptr=new PhraseDictionaryTree(noScoreComponent,
+ &factorCollection,
+ getFactorType(atoi(ftts[i].second.first)),
+ getFactorType(atoi(ftts[i].second.second))
+ );
+ factorOrder.push_back(pdtptr->GetInputFactorType());
+ PhraseDictionaryTree &pdt=*pdtptr;
+ pdicts.push_back(pdtptr);
+
+ std::string facStr="."+std::string(ftts[i].second.first)+"-"+std::string(ftts[i].second.second);
+ std::string prefix=ftts[i].first+facStr;
+ if(!existsFile(prefix+".binphr.idx")) {
+ std::cerr<<"bin ttable does not exist -> create it\n";
+ InputFileStream in(prefix);
+ pdt.Create(in,prefix);
+ }
+ std::cerr<<"reading bin ttable\n";
+ pdt.Read(prefix);
+
+ }
+
+ std::cerr<<"processing stdin\n";
+ if(!cn) {
+ std::string line;
+ while(getline(std::cin,line)) {
+ std::istringstream is(line);
+#if 0
+ std::vector<std::string> f;
+ std::copy(std::istream_iterator<std::string>(is),
+ std::istream_iterator<std::string>(),
+ std::back_inserter(f));
+#endif
+ std::cerr<<"got source phrase '"<<line<<"'\n";
+
+ Phrase F(Input);
+ F.CreateFromString(factorOrder,line,factorCollection);
+
+ for(size_t k=0;k<pdicts.size();++k) {
+ PhraseDictionaryTree const& pdt=*pdicts[k];
+
+ std::vector<std::string> f(F.GetSize());
+ for(size_t i=0;i<F.GetSize();++i)
+ f[i]=F.GetFactor(i,pdt.GetInputFactorType())->ToString();
+
+ std::stringstream iostA,iostB;
+ std::cerr<<"full phrase processing "<<f<<"\n";
+ pdt.PrintTargetCandidates(f,iostA);
+
+ std::cerr<<"processing with prefix ptr\n";
+ PhraseDictionaryTree::PrefixPtr p(pdt.GetRoot());
+
+ for(size_t i=0;i<f.size() && p;++i) {
+ std::cerr<<"pre "<<i<<" "<<(p?"1":"0")<<"\n";
+ p=pdt.Extend(p,f[i]);
+ std::cerr<<"post "<<i<<" "<<(p?"1":"0")<<"\n";
+ }
+ if(p) {
+ std::cerr<<"retrieving candidates from prefix ptr\n";
+ pdt.PrintTargetCandidates(p,iostB);}
+ else {
+ std::cerr<<"final ptr is invalid\n";
+ iostB<<"there are 0 target candidates\n";
+ }
+ if(iostA.str() != iostB.str())
+ std::cerr<<"ERROR: translation candidates mismatch '"<<iostA.str()<<"' and for prefix pointer: '"<<iostB.str()<<"'\n";
+
+ std::cerr<<"translation candidates:\n"<<iostA.str()<<"\n";
+ pdt.FreeMemory();
+
+ }
+
+ }
+ }
+ else {
+ // process confusion net input
+ ConfusionNet net(&factorCollection);
+ std::vector<std::vector<float> > weights;
+ for(size_t i=0;i<pdicts.size();++i)
+ weights.push_back(std::vector<float>(noScoreComponent,1/(1.0*noScoreComponent)));
+
+ while(net.ReadF(std::cin,factorOrder,cn-1)) {
+ net.Print(std::cerr);
+ GenerateCandidates(net,pdicts,weights,verb);
+ }
+
+ }
+#else
+ std::cerr<<"ERROR: these functions are currently broken...\n";
+ exit(1);
+#endif
+ }
+ }
+
+}