Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/misc
diff options
context:
space:
mode:
author: Nicola Bertoldi <bertoldi@fbk.eu> 2014-12-13 14:52:47 +0300
committer: Nicola Bertoldi <bertoldi@fbk.eu> 2014-12-13 14:52:47 +0300
commit: e4eb201c52be74fee74399a6f35fcbe8eb85d834 (patch)
tree: 7792ef96d63262f6e28f1857741e1162c7dccbc4 /misc
parent: cea2d9d8bb34a81660974cae20d66aefec4e0468 (diff)
parent: a0b6b6a341e74b47bbef4652ad7fd928cf91e17c (diff)
merged master into dynamic-models and solved conflicts
Diffstat (limited to 'misc')
-rw-r--r-- misc/CreateProbingPT.cpp 20
-rw-r--r-- misc/Jamfile 43
-rw-r--r-- misc/QueryProbingPT.cpp 61
-rw-r--r-- misc/merge-sorted.cc 99
-rw-r--r-- misc/prunePhraseTable.cpp 227
-rw-r--r-- misc/queryPhraseTableMin.cpp 6
6 files changed, 442 insertions, 14 deletions
diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp
new file mode 100644
index 000000000..3ea369a96
--- /dev/null
+++ b/misc/CreateProbingPT.cpp
@@ -0,0 +1,20 @@
+#include "util/usage.hh"
+#include "moses/TranslationModel/ProbingPT/storing.hh"
+
+
+
+// Command-line driver around createProbingPT(). Expects exactly two
+// arguments which, per the usage text below, are the phrase table path and
+// the output directory. Returns 0 on success and 1 on a wrong argument count.
+int main(int argc, char* argv[])
+{
+  const bool have_both_paths = (argc == 3);
+  if (have_both_paths) {
+    createProbingPT(argv[1], argv[2]);
+
+    // Presumably a resource-usage summary; util::PrintUsage is defined
+    // outside this file -- confirm there.
+    util::PrintUsage(std::cout);
+    return 0;
+  }
+
+  // Wrong invocation: tell the user how to run the program.
+  std::cerr << "Provided " << argc << " arguments, needed 3." << std::endl;
+  std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir" << std::endl;
+  return 1;
+}
+
diff --git a/misc/Jamfile b/misc/Jamfile
index 76f91babb..8cc7aa9a8 100644
--- a/misc/Jamfile
+++ b/misc/Jamfile
@@ -1,22 +1,24 @@
-exe processPhraseTable : GenerateTuples.cpp processPhraseTable.cpp ../moses//moses ;
+exe processPhraseTable : GenerateTuples.cpp processPhraseTable.cpp ..//boost_filesystem ../moses//moses ;
-exe processLexicalTable : processLexicalTable.cpp ../moses//moses ;
+exe processLexicalTable : processLexicalTable.cpp ..//boost_filesystem ../moses//moses ;
-exe queryPhraseTable : queryPhraseTable.cpp ../moses//moses ;
+exe queryPhraseTable : queryPhraseTable.cpp ..//boost_filesystem ../moses//moses ;
-exe queryLexicalTable : queryLexicalTable.cpp ../moses//moses ;
+exe queryLexicalTable : queryLexicalTable.cpp ..//boost_filesystem ../moses//moses ;
-exe generateSequences : GenerateSequences.cpp ../moses//moses ;
+exe generateSequences : GenerateSequences.cpp ..//boost_filesystem ../moses//moses ;
-exe TMining : TransliterationMining.cpp ../moses//moses ;
+exe TMining : TransliterationMining.cpp ..//boost_filesystem ../moses//moses ;
-exe 1-1-Extraction : 1-1-Extraction.cpp ../moses//moses ;
+exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ;
+
+exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
local with-cmph = [ option.get "with-cmph" ] ;
if $(with-cmph) {
- exe processPhraseTableMin : processPhraseTableMin.cpp ../moses//moses ;
- exe processLexicalTableMin : processLexicalTableMin.cpp ../moses//moses ;
- exe queryPhraseTableMin : queryPhraseTableMin.cpp ../moses//moses ;
+ exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ;
+ exe processLexicalTableMin : processLexicalTableMin.cpp ..//boost_filesystem ../moses//moses ;
+ exe queryPhraseTableMin : queryPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ;
alias programsMin : processPhraseTableMin processLexicalTableMin queryPhraseTableMin ;
# alias programsMin : processPhraseTableMin processLexicalTableMin ;
@@ -25,4 +27,23 @@ else {
alias programsMin ;
}
-alias programs : 1-1-Extraction TMining generateSequences processPhraseTable processLexicalTable queryPhraseTable queryLexicalTable programsMin ;
+if [ option.get "with-probing-pt" : : "yes" ]
+{
+ exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
+ exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
+
+ alias programsProbing : CreateProbingPT QueryProbingPT ;
+}
+else {
+ alias programsProbing ;
+}
+
+exe merge-sorted :
+merge-sorted.cc
+../moses//moses
+../moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+;
+
+alias programs : 1-1-Extraction TMining generateSequences processPhraseTable processLexicalTable queryPhraseTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ;
diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp
new file mode 100644
index 000000000..8a3441a0d
--- /dev/null
+++ b/misc/QueryProbingPT.cpp
@@ -0,0 +1,61 @@
+#include "util/file_piece.hh"
+
+#include "util/file.hh"
+#include "util/scoped.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/probing_hash_table.hh"
+#include "util/usage.hh"
+
+#include "moses/TranslationModel/ProbingPT/quering.hh"
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h> //For finding size of file
+#include <boost/functional/hash.hpp>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+// Interactive lookup tool: constructs a QueryEngine from the directory given
+// as the only argument, then reads one query per line from stdin and prints
+// either the matching target information or "Key not found!".
+// The session ends when the user types "exit" or when stdin is exhausted.
+// Returns 1 when the argument count is wrong, 0 otherwise.
+int main(int argc, char* argv[]) {
+  if (argc != 2) {
+    // Tell the user how to run the program
+    std::cerr << "Usage: " << argv[0] << " path_to_directory" << std::endl;
+    return 1;
+  }
+
+  QueryEngine queries(argv[1]);
+
+  //Interactive search
+  std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;
+  std::string cinstr;
+  // Use the read itself as the loop condition: once stdin hits end-of-file
+  // (piped input, Ctrl-D) or fails, getline() keeps failing, and an
+  // unchecked loop would query the empty string forever.
+  while (std::getline(std::cin, cinstr)) {
+    if (cinstr == "exit") {
+      break;
+    }
+
+    //Actual lookup
+    std::pair<bool, std::vector<target_text> > query_result;
+    query_result = queries.query(StringPiece(cinstr));
+
+    if (query_result.first) {
+      queries.printTargetInfo(query_result.second);
+    } else {
+      std::cout << "Key not found!" << std::endl;
+    }
+  }
+
+  util::PrintUsage(std::cout);
+
+  return 0;
+}
diff --git a/misc/merge-sorted.cc b/misc/merge-sorted.cc
new file mode 100644
index 000000000..ae693215b
--- /dev/null
+++ b/misc/merge-sorted.cc
@@ -0,0 +1,99 @@
+// This program takes gzipped sorted files and merges them in sorted order
+// to stdout. Written by Ulrich Germann
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
+using namespace std;
+using namespace ugdiss;
+using namespace boost::iostreams;
+
+typedef boost::shared_ptr<filtering_istream> fptr;
+
+// One input file taking part in the merge. It buffers the current line and
+// the line read just before it, so that the input's sort order can be
+// asserted and runs of identical lines can be written out together.
+class Part
+{
+  string fname;
+  fptr f;              // reset to null once no further line can be read
+  string my_lines[2];  // two-slot buffer; slot ctr%2 holds the current line
+  size_t ctr;          // count of successful reads after the first line
+public:
+  // Current line, or an empty string once the input is exhausted.
+  string const& line() const
+  {
+    static string empty_line;
+    if (!f) return empty_line;
+    return my_lines[ctr % 2];
+  }
+
+  // Opens _fname through open_input_stream() and loads the first line;
+  // the stream is dropped right away if that first read fails.
+  Part(string _fname) : fname(_fname), ctr(0)
+  {
+    f.reset(open_input_stream(fname));
+    bool const got_first_line = static_cast<bool>(getline(*f, my_lines[0]));
+    if (!got_first_line) f.reset();
+  }
+
+  // Advances to the following line. Returns false (and drops the stream)
+  // when no further line can be read. Asserts that the input is sorted.
+  bool next()
+  {
+    if (!f) return false;
+    size_t const slot = (ctr + 1) % 2;
+    if (getline(*f, my_lines[slot]))
+    {
+      ++ctr;
+      assert(my_lines[(ctr-1)%2] <= my_lines[ctr%2]);
+      return true;
+    }
+    f.reset();
+    return false;
+  }
+
+  // Parts are ordered by their current line.
+  bool operator <(Part const& other) const
+  { return other.line() > line(); }
+
+  bool operator <=(Part const& other) const
+  { return other.line() >= line(); }
+
+  bool operator >(Part const& other) const
+  { return other.line() < line(); }
+
+  bool operator >=(Part const& other) const
+  { return other.line() <= line(); }
+
+  // Writes the current line plus every directly following line that is
+  // identical to it, then flushes. Returns true while input remains.
+  bool go(ostream& out)
+  {
+    if (!f) return false;
+#if 0
+    // Disabled debugging variant: prefixes each line with file name and counter.
+    if (ctr)
+    {
+      out << fname << "-" << ctr - 1 << "-";
+      out << my_lines[(ctr - 1)%2] << endl;
+    }
+    do
+    {
+      out << fname << " " << ctr << " ";
+      out << line() << "\n";
+    }
+    while (next() && my_lines[0] == my_lines[1]);
+#else
+    // The two buffer slots are equal exactly when the new line repeats the old one.
+    for (bool emit = true; emit; emit = next() && my_lines[0] == my_lines[1])
+      out << line() << "\n";
+    out.flush();
+#endif
+    return f != NULL;
+  }
+
+};
+
+
+// Merges the files named on the command line to stdout. The Parts are kept
+// in a heap ordered so that the Part with the smallest current line is
+// served next; a Part is discarded once go() reports it has no input left.
+int main(int argc, char* argv[])
+{
+  greater<Part> heap_order;
+  vector<Part> parts;
+  int i = 1;
+  while (i < argc)
+    parts.push_back(Part(argv[i++]));
+  make_heap(parts.begin(), parts.end(), heap_order);
+  while (!parts.empty())
+  {
+    // pop_heap moves the smallest Part to the back of the vector.
+    pop_heap(parts.begin(), parts.end(), heap_order);
+    bool const has_more = parts.back().go(cout);
+    if (!has_more)
+    {
+      parts.pop_back();
+      continue;
+    }
+    push_heap(parts.begin(), parts.end(), heap_order);
+  }
+}
diff --git a/misc/prunePhraseTable.cpp b/misc/prunePhraseTable.cpp
new file mode 100644
index 000000000..dcf8d73da
--- /dev/null
+++ b/misc/prunePhraseTable.cpp
@@ -0,0 +1,227 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+
+/**
+ Prune the phrase table using the same translation pruning that Moses uses during decoding.
+**/
+
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include "moses/InputPath.h"
+#include "moses/Parameter.h"
+#include "moses/TranslationModel/PhraseDictionary.h"
+#include "moses/StaticData.h"
+
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+
+
+using namespace Moses;
+using namespace std;
+
+namespace po = boost::program_options;
+typedef multimap<float,string> Lines;
+
+// Writes the command synopsis (program name taken from argv[0]) followed by
+// the option descriptions in desc to stderr.
+static void usage(const po::options_description& desc, char** argv) {
+  const string programName(argv[0]);
+  cerr << "Usage: " << programName << " [options] input-file output-file" << endl
+       << desc << endl;
+}
+
+//Find top n translations of source, and send them to output
+//
+// lines      - candidate lines keyed by score; visited from the highest key
+//              downwards.
+// maxPhrases - upper bound on the number of lines written; 0 writes nothing.
+// out        - destination stream, one line per entry.
+//
+// The map is taken by const reference so that the strings are not copied on
+// every call, and the limit is tested before each write so that a limit of
+// zero is honoured.
+static void outputTopN(const Lines& lines, size_t maxPhrases, ostream& out) {
+  size_t count = 0;
+  for (Lines::const_reverse_iterator i = lines.rbegin();
+       i != lines.rend() && count < maxPhrases; ++i, ++count) {
+    // '\n' rather than endl: no need to flush the stream after every line.
+    out << i->second << '\n';
+  }
+}
+/*
+static void outputTopN(const Phrase& sourcePhrase, const multimap<float,const TargetPhrase*>& targetPhrases,
+ size_t maxPhrases, const PhraseDictionary* phraseTable,
+ const vector<FactorType> & input, const vector<FactorType> & output, ostream& out) {
+ size_t count = 0;
+ for (multimap<float,const TargetPhrase*>::const_reverse_iterator i
+ = targetPhrases.rbegin(); i != targetPhrases.rend() && count < maxPhrases; ++i, ++count) {
+ const TargetPhrase* targetPhrase = i->second;
+ out << sourcePhrase.GetStringRep(input);
+ out << " ||| ";
+ out << targetPhrase->GetStringRep(output);
+ out << " ||| ";
+ const ScoreComponentCollection scores = targetPhrase->GetScoreBreakdown();
+ vector<float> phraseScores = scores.GetScoresForProducer(phraseTable);
+ for (size_t j = 0; j < phraseScores.size(); ++j) {
+ out << exp(phraseScores[j]) << " ";
+ }
+ out << "||| ";
+ const AlignmentInfo& align = targetPhrase->GetAlignTerm();
+ for (AlignmentInfo::const_iterator j = align.begin(); j != align.end(); ++j) {
+ out << j->first << "-" << j->second << " ";
+ }
+ out << endl;
+ }
+}*/
+int main(int argc, char** argv) // Reads a phrase table and prints to stdout, for each run of lines sharing a source phrase, its highest-scoring lines (limited by max-phrases).
+{
+ bool help;
+ string input_file;
+ string config_file;
+ size_t maxPhrases = 100;
+// Declare the command-line options; each one is bound to one of the locals above.
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("input-file,i", po::value<string>(&input_file), "Input file")
+ ("config-file,f", po::value<string>(&config_file), "Config file")
+ ("max-phrases,n", po::value<size_t>(&maxPhrases), "Maximum target phrases per source phrase")
+ ;
+// Parse argv, then validate: --help exits with 0, a missing input or config file exits with 1.
+ po::options_description cmdline_options;
+ cmdline_options.add(desc);
+ po::variables_map vm;
+ po::parsed_options parsed = po::command_line_parser(argc,argv).
+ options(cmdline_options).run();
+ po::store(parsed, vm);
+ po::notify(vm);
+ if (help) {
+ usage(desc, argv);
+ exit(0);
+ }
+ if (input_file.empty()) {
+ cerr << "ERROR: Please specify an input file" << endl << endl;
+ usage(desc, argv);
+ exit(1);
+ }
+ if (config_file.empty()) {
+ cerr << "ERROR: Please specify a config file" << endl << endl;
+ usage(desc, argv);
+ exit(1);
+ }
+// Build the argument list "<program> -f <config_file>" that is handed to Parameter::LoadParam below.
+ vector<string> mosesargs;
+ mosesargs.push_back(argv[0]);
+ mosesargs.push_back("-f");
+ mosesargs.push_back(config_file);
+// Copy the arguments into a C-style argv array; these allocations are not released anywhere in this function.
+ boost::scoped_ptr<Parameter> params(new Parameter());
+ char** mosesargv = new char*[mosesargs.size()];
+ for (size_t i = 0; i < mosesargs.size(); ++i) {
+ mosesargv[i] = new char[mosesargs[i].length() + 1];
+ strcpy(mosesargv[i], mosesargs[i].c_str());
+ }
+
+ if (!params->LoadParam(mosesargs.size(), mosesargv)) {
+ params->Explain();
+ exit(1);
+ }
+
+ if (!StaticData::LoadDataStatic(params.get(),argv[0])) {
+ exit(1);
+ }
+
+ const StaticData &staticData = StaticData::Instance();
+
+ //Find the phrase table to manage the target phrases
+ PhraseDictionary* phraseTable = NULL;
+ const vector<FeatureFunction*>& ffs = FeatureFunction::GetFeatureFunctions();
+ for (size_t i = 0; i < ffs.size(); ++i) {
+ PhraseDictionary* maybePhraseTable = dynamic_cast< PhraseDictionary*>(ffs[i]);
+ if (maybePhraseTable) {
+ UTIL_THROW_IF(phraseTable,util::Exception,"Can only score translations with one phrase table");
+ phraseTable = maybePhraseTable;
+ }
+ }
+ UTIL_THROW_IF(!phraseTable,util::Exception,"Unable to find scoring phrase table");
+// From here on phraseTable is the one and only PhraseDictionary among the feature functions.
+
+ //
+ //Load and prune the phrase table. This is taken (with mods) from moses/TranslationModel/RuleTable/LoaderStandard.cpp
+ //
+
+ std::ostream *progress = NULL;
+ IFVERBOSE(1) progress = &std::cerr;
+ util::FilePiece in(input_file.c_str(), progress);
+
+ // reused variables
+ vector<float> scoreVector;
+ StringPiece line;
+// NOTE(review): the two NAN arguments are presumably the values returned for empty and unparsable input -- confirm against double-conversion; the loop below rejects NAN scores.
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+// 'previous' is the source phrase of the group being collected; 'lines' holds that group's lines keyed by score.
+ string previous;
+ Lines lines;
+
+
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+// Split the line on "|||": source phrase, target phrase (not used afterwards), scores.
+ util::TokenIter<util::MultiCharacter> pipes(line, "|||");
+ StringPiece sourcePhraseString(*pipes);
+ StringPiece targetPhraseString(*++pipes);
+ StringPiece scoreString(*++pipes);
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(isnan(score), "Bad score " << *s);
+ scoreVector.push_back(FloorScore(TransformScore(score)));
+ }
+// A source phrase differing from the previous line starts a new group, so only adjacent equal source phrases are grouped; emit the finished group first.
+ if (sourcePhraseString != previous) {
+ outputTopN(lines, maxPhrases, cout);
+ previous = sourcePhraseString.as_string();
+ lines.clear();
+ }
+// Score the line as the inner product of its phrase-table scores with all model weights, and file the raw line under that score.
+ ScoreComponentCollection scores;
+ scores.Assign(phraseTable,scoreVector);
+ float score = scores.InnerProduct(staticData.GetAllWeights());
+ lines.insert(pair<float,string>(score,line.as_string()));
+
+ }
+ if (!lines.empty()) { // emit the group that was still open when the input ended
+ outputTopN(lines, maxPhrases, cout);
+ }
+
+
+
+
+
+ return 0;
+}
diff --git a/misc/queryPhraseTableMin.cpp b/misc/queryPhraseTableMin.cpp
index 723370252..ca4b4b690 100644
--- a/misc/queryPhraseTableMin.cpp
+++ b/misc/queryPhraseTableMin.cpp
@@ -51,9 +51,9 @@ int main(int argc, char **argv)
// const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||");
// UG: I assume "||dummy_string||" means: I'm not using factored data;
// This is now expressed by setting the factor delimiter to the empty string
- const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "");
- const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0");
- const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0");
+ const_cast<std::vector<std::string>&>(*parameter->GetParam("factor-delimiter")).resize(1, "");
+ const_cast<std::vector<std::string>&>(*parameter->GetParam("input-factors")).resize(1, "0");
+ const_cast<std::vector<std::string>&>(*parameter->GetParam("verbose")).resize(1, "0");
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");