Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <hieuhoang@gmail.com>, 2015-06-24 13:56:37 +0300
committer: Hieu Hoang <hieuhoang@gmail.com>, 2015-06-24 13:56:37 +0300
commit: 0d34023aad0dbf28c28bcc17876b4016b5b1b3ea (patch)
tree: ceb7961ea1afccfde7cadad61d4df80a40d634ac
parent: 58f0187e8bd3a49f894cf1815df510d151c46410 (diff)
prune generation table
-rw-r--r-- misc/Jamfile 4
-rw-r--r-- misc/pruneGeneration.cpp 55
-rw-r--r-- misc/pruneGeneration.h 45
3 files changed, 103 insertions, 1 deletion
diff --git a/misc/Jamfile b/misc/Jamfile
index bfea14d58..46a18e253 100644
--- a/misc/Jamfile
+++ b/misc/Jamfile
@@ -14,6 +14,8 @@ exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ;
exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
+exe pruneGeneration : pruneGeneration.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
+
local with-cmph = [ option.get "with-cmph" ] ;
if $(with-cmph) {
exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ;
@@ -46,6 +48,6 @@ $(TOP)//boost_iostreams
$(TOP)//boost_program_options
;
-alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ;
+alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ;
#processPhraseTable queryPhraseTable
diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp
new file mode 100644
index 000000000..45873a4ac
--- /dev/null
+++ b/misc/pruneGeneration.cpp
@@ -0,0 +1,55 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include "pruneGeneration.h"
+
+using namespace std;
+
+/**
+ * Reads a whitespace-separated table from stdin and writes a pruned copy to
+ * stdout.
+ *
+ * Consecutive lines that share the same first field are collected into one
+ * group; each finished group is handed to Output() together with the limit.
+ * The third field of every line is parsed as the probability used to rank
+ * the lines of a group.
+ *
+ * Usage: pruneGeneration LIMIT < table > pruned-table
+ *
+ * @param argc number of command-line arguments
+ * @param argv argv[1] is the per-group limit, parsed with atoi()
+ * @return EXIT_SUCCESS, or EXIT_FAILURE when the limit is missing or an
+ *         input line does not have exactly four fields
+ */
+int main(int argc, char **argv)
+{
+  cerr << "Starting" << endl;
+
+  // Without this guard atoi() would receive a null pointer when the program
+  // is started with no arguments.
+  if (argc < 2) {
+    cerr << "Usage: pruneGeneration LIMIT < table > pruned-table" << endl;
+    return EXIT_FAILURE;
+  }
+  const int limit = atoi(argv[1]);
+
+  vector<Rec> records;
+  string prevInWord;
+  string line;
+  size_t lineNum = 0;
+  while (getline(cin, line)) {
+    ++lineNum;
+
+    vector<string> toks;
+    Tokenize(toks, line);
+
+    // Checked explicitly rather than with assert(): an assert disappears
+    // under NDEBUG and toks[2] below would then be read out of range.
+    if (toks.size() != 4) {
+      cerr << "Line " << lineNum << ": expected 4 fields but found "
+           << toks.size() << endl;
+      return EXIT_FAILURE;
+    }
+
+    // A change in the first field closes the current group.
+    if (prevInWord != toks[0]) {
+      Output(limit, records);
+      records.clear();
+    }
+
+    // add new record
+    float prob = atof(toks[2].c_str());
+    records.push_back(Rec(prob, line));
+
+    prevInWord = toks[0];
+  }
+
+  // The last group is still pending when the input ends.
+  Output(limit, records);
+  records.clear();
+
+  cerr << "Finished" << endl;
+  return EXIT_SUCCESS;
+}
+
+/**
+ * Ranks one group of records with Prune() and prints the survivors.
+ *
+ * @param limit   maximum number of records to print; zero or a negative
+ *                value prints nothing
+ * @param records the group to print; reordered in place by Prune()
+ */
+void Output(int limit, vector<Rec> &records)
+{
+  Prune(limit, records);
+
+  // Clamp in the signed domain first: comparing a size_t index directly with
+  // a negative int would convert the limit to a huge unsigned value and let
+  // every record through.
+  const size_t maxOut = limit > 0 ? static_cast<size_t>(limit) : 0;
+  const size_t count = records.size() < maxOut ? records.size() : maxOut;
+
+  for (size_t i = 0; i < count; ++i) {
+    // '\n' instead of endl: no need to flush stdout after every record.
+    cout << records[i].line << '\n';
+  }
+}
+
+/**
+ * Keeps only the `limit` highest-probability records of a group.
+ *
+ * The records are sorted from highest to lowest prob and everything after
+ * the first `limit` entries is dropped. Records with equal prob keep their
+ * input order.
+ *
+ * @param limit   maximum number of records to keep; zero or a negative value
+ *                empties the vector
+ * @param records the group to prune, modified in place
+ */
+void Prune(int limit, std::vector<Rec> &records)
+{
+  if (limit <= 0) {
+    records.clear();
+    return;
+  }
+
+  // A stable ascending sort of the reversed view yields descending order in
+  // the forward view while keeping ties in their original input order.
+  std::stable_sort(records.rbegin(), records.rend());
+
+  if (records.size() > static_cast<size_t>(limit)) {
+    records.erase(records.begin() + limit, records.end());
+  }
+}
diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h
new file mode 100644
index 000000000..693c5f149
--- /dev/null
+++ b/misc/pruneGeneration.h
@@ -0,0 +1,45 @@
+#pragma once
+#include <vector>
+#include <string>
+
+/**
+ * One input line together with the probability used to rank it.
+ */
+class Rec
+{
+public:
+  float prob;        // ranking key
+  std::string line;  // text stored alongside the key
+
+  Rec(float aprob, const std::string &aline)
+    : prob(aprob), line(aline) {
+  }
+
+  // Orders records by prob only; `line` takes no part in the comparison.
+  bool operator< (const Rec &compare) const {
+    return compare.prob > prob;
+  }
+};
+
+////////////////////////////////////////////////////////////
+
+// Calls Prune() on `records`, then writes the `line` of up to `limit`
+// leading records to stdout, one per line.
+void Output(int limit, std::vector<Rec> &records);
+// Sorts `records` in place so that the highest `prob` comes first.
+void Prune(int limit, std::vector<Rec> &records);
+
+////////////////////////////////////////////////////////////
+/**
+ * Splits `str` into tokens and appends them to `output`.
+ *
+ * Any run of characters from `delimiters` acts as a single separator, so
+ * leading, trailing and repeated delimiters never produce empty tokens.
+ * Existing contents of `output` are left in place.
+ *
+ * @param output     vector that receives the tokens
+ * @param str        text to split
+ * @param delimiters set of separator characters (space and tab by default)
+ */
+inline void Tokenize(std::vector<std::string> &output,
+                     const std::string& str,
+                     const std::string& delimiters = " \t")
+{
+  typedef std::string::size_type Pos;
+
+  // Start of the first token, skipping any leading delimiters.
+  Pos tokenStart = str.find_first_not_of(delimiters);
+
+  while (tokenStart != std::string::npos) {
+    // First delimiter after the token, or npos when the token runs to the end.
+    const Pos tokenEnd = str.find_first_of(delimiters, tokenStart);
+
+    if (tokenEnd == std::string::npos) {
+      output.push_back(str.substr(tokenStart));
+      break;
+    }
+
+    output.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
+
+    // Jump over the delimiter run to the start of the next token.
+    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
+  }
+}
+