Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorUlrich Germann <ulrich.germann@gmail.com>2017-01-14 20:57:46 +0300
committerUlrich Germann <ulrich.germann@gmail.com>2017-01-14 20:57:46 +0300
commitb237741acdc2097b4878042cf035075b23d4881e (patch)
tree656792f8360c083a2129d8b824878be4afe20138 /moses/TranslationModel
parent27760221c753d28fd08d018258db48719f8e6a56 (diff)
Initial check-in: new utility to check overlap of text with training data.
Diffstat (limited to 'moses/TranslationModel')
-rw-r--r--moses/TranslationModel/UG/check-coverage5.cc126
1 files changed, 126 insertions, 0 deletions
diff --git a/moses/TranslationModel/UG/check-coverage5.cc b/moses/TranslationModel/UG/check-coverage5.cc
new file mode 100644
index 000000000..549eb7b21
--- /dev/null
+++ b/moses/TranslationModel/UG/check-coverage5.cc
@@ -0,0 +1,126 @@
+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
+
+// read a text from stdin, report percentage of n-grams covered
+
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+#include "mm/ug_bitext.h"
+#include "generic/file_io/ug_stream.h"
+#include <string>
+#include <sstream>
+#include "mm/ug_bitext_sampler.h"
+
+#include <boost/program_options.hpp>
+#include <boost/math/distributions/binomial.hpp>
+
+// #include "LSA.h"
+
+namespace po=boost::program_options;
+using namespace Moses;
+using namespace sapt;
+using namespace std;
+using namespace boost;
+
+typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
+typedef mmTtrack<Token> ttrack_t;
+
+size_t ngram_size;
+string bname;
+vector<string> ifiles;
+
+void interpret_args(int ac, char* av[]);
+
+
+// Debug helper: recursively walks the suffix-array (TSA) trie from the
+// iterator's current position and prints every non-root node twice --
+// first as raw token ids (str(NULL)), then as surface strings resolved
+// through the vocabulary V (str(&V)). Not called from main(); appears to
+// be a leftover diagnostic utility. Traversal is depth-first: down() to
+// the first child, over() across siblings, up() to restore the iterator
+// so the caller's position is unchanged on return.
+void
+dump(mmTSA<Token>::tree_iterator& m, TokenIndex& V)
+{
+ if (m.size()) cout << m.str(NULL) << endl;
+ if (m.size()) cout << m.str(&V) << endl;
+ if (m.down())
+ {
+ do { dump(m, V); } while (m.over());
+ m.up();
+ }
+}
+
+// Entry point: loads a memory-mapped sapt bitext model (<bname>.tdx
+// vocabulary, <bname>.mct token track, <bname>.sfa suffix array), then for
+// each input file reports the percentage of its n-grams (n = ngram_size,
+// default 5) that occur verbatim in the training data.
+int
+main(int argc, char* argv[])
+{
+ interpret_args(argc,argv);
+ TokenIndex V;
+ V.open(bname+".tdx"); V.setDynamic(true); V.iniReverseIndex();
+ boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>);
+ T->open(bname+".mct");
+ mmTSA<Token> I; I.open(bname+".sfa", T);
+
+ string line;
+ BOOST_FOREACH(string const& file, ifiles)
+ {
+ size_t total_ngrams=0;
+ float matched_ngrams=0; // float so the percentage below divides in floating point
+ ifstream in(file.c_str());
+ while(getline(in,line))
+ {
+ // cout << line << endl;
+ vector<id_type> snt; // sentence as vocabulary ids
+ V.fillIdSeq(line,snt);
+ if (snt.size() < ngram_size) continue; // sentence too short to contain any n-gram
+ total_ngrams += snt.size() - ngram_size + 1;
+ for (size_t i = 0; i + ngram_size <= snt.size(); ++i)
+ // for (size_t i = 0; i < snt.size(); ++i)
+ {
+ mmTSA<Token>::tree_iterator m(&I);
+ // stop always equals i+ngram_size given the loop bound above;
+ // the min() only matters for the commented-out alternative loop.
+ size_t stop = min(snt.size(), i+ngram_size);
+ size_t k = i;
+ // extend() descends the suffix-array trie one token at a time;
+ // it stops at the longest training-data match starting at i.
+ while (k < stop && m.extend(snt[k])) ++k;
+ // cout << i << " " << k-i << " " << m.str(&V) << endl;
+ if (k - i == ngram_size)
+ ++matched_ngrams; // full n-gram found in training data
+ }
+ }
+ // NOTE(review): if total_ngrams == 0 (empty file, or every line shorter
+ // than ngram_size) this divides by zero and prints inf/nan.
+ printf ("%5.1f%% matched %zu-grams (%.0f/%zu): %s\n",
+ (100 * matched_ngrams / total_ngrams), ngram_size,
+ matched_ngrams, total_ngrams, file.c_str());
+ }
+}
+
+// Parses the command line into the file-scope globals ngram_size, bname,
+// and ifiles using boost::program_options. The base name and input files
+// are hidden positional arguments (not shown in --help); prints usage and
+// exits on --help/-h.
+void
+interpret_args(int ac, char* av[])
+{
+ po::variables_map vm;
+ po::options_description o("Options");
+ o.add_options()
+
+ ("help,h", "print this message")
+ ("ngram-size,n", po::value<size_t>(&ngram_size)->default_value(5),
+ "sample size") // NOTE(review): description says "sample size" but this is the n-gram length
+ ;
+
+ // Hidden options: bound to positionals below, excluded from --help output.
+ po::options_description h("Hidden Options");
+ h.add_options()
+ ("bname", po::value<string>(&bname), "base name of corpus")
+ ("ifiles", po::value<vector<string> >(&ifiles), "input files")
+ ;
+
+ h.add(o); // parse with the combined set (hidden + visible)
+ po::positional_options_description a;
+ a.add("bname",1); // first positional: model base name
+ a.add("ifiles",-1); // all remaining positionals: input files
+
+ po::store(po::command_line_parser(ac,av)
+ .options(h)
+ .positional(a)
+ .run(),vm);
+ po::notify(vm);
+ if (vm.count("help"))
+ {
+ // NOTE(review): usage omits the <input files> positional arguments.
+ std::cout << "\nusage:\n\t" << av[0]
+ << " [options] <model file stem>" << std::endl;
+ std::cout << o << std::endl;
+ exit(0);
+ }
+}