// Reconstructed from the patch "prune generation table"
//   commit 0d34023aad0dbf28c28bcc17876b4016b5b1b3ea
//   From: Hieu Hoang
//   Date: Wed, 24 Jun 2015 14:56:37 +0400
//
// The patch touches three files:
//   misc/Jamfile             (4 lines changed)
//   misc/pruneGeneration.cpp (new file)
//   misc/pruneGeneration.h   (new file)
//
// Build changes carried by the patch (misc/Jamfile):
//   + exe pruneGeneration : pruneGeneration.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
//   - alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ;
//   + alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ;
//
// The "declarations" section below is the content of misc/pruneGeneration.h
// (guarded by `#pragma once` in the patch); the "implementation" section is
// misc/pruneGeneration.cpp, which in the patch reaches the declarations via
// `#include "pruneGeneration.h"`.  They are shown here as one translation
// unit.

#include <algorithm>
#include <cerrno>
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

////////////////////////////////////////////////////////////
// declarations (misc/pruneGeneration.h)
////////////////////////////////////////////////////////////

/** One input line together with the value it is ranked by. */
class Rec
{
public:
  float prob;        // ranking key; main() fills it from the third field
  std::string line;  // the complete input line, echoed verbatim on output

  Rec(float aprob, const std::string &aline)
    :prob(aprob)
    ,line(aline)
  {}

  /** Orders records by ascending prob. */
  inline bool operator< (const Rec &compare) const {
    return prob < compare.prob;
  }
};

////////////////////////////////////////////////////////////

/**
 * Prune `records` (see Prune) and write the surviving lines to stdout,
 * one per line, highest prob first.
 */
void Output(int limit, std::vector<Rec> &records);

/**
 * Sort `records` by descending prob and drop everything after the first
 * `limit` entries.  Records with equal prob keep their input order.
 * A negative `limit` keeps every record (sorting only).
 */
void Prune(int limit, std::vector<Rec> &records);

////////////////////////////////////////////////////////////

/**
 * Split `str` on any character in `delimiters` and append the pieces to
 * `output`.  Runs of delimiters are treated as one separator, so no empty
 * tokens are produced; `output` is not cleared first.
 */
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // Start of the first token: skip any leading delimiters.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // End of the first token: the next delimiter (npos if the token runs
  // to the end of the string).
  std::string::size_type pos = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token, add it to the vector.
    output.push_back(str.substr(lastPos, pos - lastPos));
    // Start of the next token: skip the delimiter run.
    lastPos = str.find_first_not_of(delimiters, pos);
    // End of the next token.
    pos = str.find_first_of(delimiters, lastPos);
  }
}

////////////////////////////////////////////////////////////
// implementation (misc/pruneGeneration.cpp)
////////////////////////////////////////////////////////////

namespace
{

/** Number of records that survive `limit`; a negative limit keeps all. */
std::size_t KeepCount(int limit, std::size_t available)
{
  if (limit < 0) {
    return available;
  }
  return std::min(static_cast<std::size_t>(limit), available);
}

/** Descending-prob ordering built on Rec::operator<. */
bool HigherProb(const Rec &lhs, const Rec &rhs)
{
  return rhs < lhs;
}

/** Parse a whole string as a base-10 int; false on garbage or overflow. */
bool ParseLimit(const char *text, int &limit)
{
  char *end = NULL;
  errno = 0;
  const long value = std::strtol(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE
      || value < INT_MIN || value > INT_MAX) {
    return false;
  }
  limit = static_cast<int>(value);
  return true;
}

/**
 * Parse a whole token as a probability.  NaN is rejected because it would
 * break the strict weak ordering the sort in Prune relies on.
 */
bool ParseProb(const std::string &text, float &prob)
{
  const char *begin = text.c_str();
  char *end = NULL;
  const double value = std::strtod(begin, &end);
  if (end == begin || *end != '\0' || value != value) {
    return false;
  }
  prob = static_cast<float>(value);
  return true;
}

} // namespace

/**
 * Usage: pruneGeneration LIMIT < table > pruned-table
 *
 * Reads whitespace-separated 4-field lines from stdin.  Consecutive lines
 * sharing the same first field form a group; for each group at most LIMIT
 * lines with the highest third-field value are written to stdout.  Lines
 * with the same first field must therefore be adjacent in the input.
 * Exits with a failure status on a bad LIMIT or a malformed line.
 */
int main(int argc, char **argv)
{
  std::cerr << "Starting" << std::endl;

  int limit = 0;
  if (argc < 2 || !ParseLimit(argv[1], limit)) {
    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "pruneGeneration")
              << " LIMIT < table > pruned-table" << std::endl;
    return EXIT_FAILURE;
  }

  std::vector<Rec> records;
  std::vector<std::string> toks;
  std::string prevInWord;
  std::string line;
  std::size_t lineNum = 0;
  while (std::getline(std::cin, line)) {
    ++lineNum;
    toks.clear();
    Tokenize(toks, line);

    // Checked unconditionally: an assert would vanish in NDEBUG builds and
    // leave the toks[0] / toks[2] accesses below out of range.
    float prob = 0;
    if (toks.size() != 4 || !ParseProb(toks[2], prob)) {
      std::cerr << "Malformed input at line " << lineNum << ": " << line
                << std::endl;
      return EXIT_FAILURE;
    }

    // First field changed: the previous group is complete, flush it.
    if (prevInWord != toks[0]) {
      Output(limit, records);
      records.clear();
    }

    // add new record
    records.push_back(Rec(prob, line));

    prevInWord = toks[0];
  }

  // last
  Output(limit, records);
  records.clear();

  std::cerr << "Finished" << std::endl;
  return EXIT_SUCCESS;
}

void Output(int limit, std::vector<Rec> &records)
{
  Prune(limit, records);

  // '\n' instead of std::endl: no need to flush stdout after every record.
  for (std::size_t i = 0; i < records.size(); ++i) {
    std::cout << records[i].line << '\n';
  }
}

void Prune(int limit, std::vector<Rec> &records)
{
  // Stable so that ties at the cut-off are resolved by input order rather
  // than by whatever an unstable sort happens to produce.
  std::stable_sort(records.begin(), records.end(), HigherProb);

  // erase() rather than resize(): Rec has no default constructor.
  const std::size_t keep = KeepCount(limit, records.size());
  records.erase(records.begin()
                + static_cast<std::vector<Rec>::difference_type>(keep),
                records.end());
}