Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'misc/CreateProbingPT.cpp')
-rw-r--r--misc/CreateProbingPT.cpp108
1 files changed, 96 insertions, 12 deletions
diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp
index b23427f30..2cf6627ef 100644
--- a/misc/CreateProbingPT.cpp
+++ b/misc/CreateProbingPT.cpp
@@ -1,29 +1,113 @@
+#include <string>
+#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
+#include "moses/InputFileStream.h"
+#include "moses/OutputFileStream.h"
+#include "moses/Util.h"
+using namespace std;
+std::string ReformatSCFGFile(const std::string &path);
int main(int argc, char* argv[])
{
+ string inPath, outPath;
+ int num_scores = 4;
+ int num_lex_scores = 0;
+ bool log_prob = false;
+ bool scfg = false;
+ int max_cache_size = 50000;
- const char * is_reordering = "false";
+ namespace po = boost::program_options;
+ po::options_description desc("Options");
+ desc.add_options()
+ ("help", "Print help messages")
+ ("input-pt", po::value<string>()->required(), "Text pt")
+ ("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
+ ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
+ ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
+ ("log-prob", "log (and floor) probabilities before storing")
+ ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
+ ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
- if (!(argc == 5 || argc == 4)) {
- // Tell the user how to run the program
- std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl;
- std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl;
- std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl;
- //std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl;
- return 1;
+ ;
+
+ po::variables_map vm;
+ try {
+ po::store(po::parse_command_line(argc, argv, desc),
+ vm); // can throw
+
+ /** --help option
+ */
+ if ( vm.count("help")) {
+ std::cout << desc << std::endl;
+ return EXIT_SUCCESS;
+ }
+
+ po::notify(vm); // throws on error, so do after help in case
+ // there are any problems
+ } catch(po::error& e) {
+ std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ return EXIT_FAILURE;
}
- if (argc == 5) {
- is_reordering = argv[4];
+ if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
+ if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
+ if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
+ if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
+ if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
+ if (vm.count("log-prob")) log_prob = true;
+ if (vm.count("scfg")) scfg = true;
+
+
+ if (scfg) {
+ inPath = ReformatSCFGFile(inPath);
}
- createProbingPT(argv[1], argv[2], argv[3], is_reordering);
+ Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
- util::PrintUsage(std::cout);
+ //util::PrintUsage(std::cout);
return 0;
}
+std::string ReformatSCFGFile(const std::string &path)
+{
+ Moses::InputFileStream inFile(path);
+ string reformattedPath = path + ".reformat.gz";
+ Moses::OutputFileStream outFile(reformattedPath);
+
+ string line;
+ while (getline(inFile, line)) {
+ vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
+ assert(toks.size() >= 3);
+
+ // source
+ vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
+ for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
+ outFile << sourceToks[i] << " ";
+ }
+
+ // other columns
+ for (size_t i = 1; i < toks.size(); ++i) {
+ outFile << "|||" << toks[i];
+ }
+ outFile << endl;
+ }
+
+ inFile.Close();
+ outFile.Close();
+
+ string sortedPath = path + ".reformat.sorted.gz";
+ string tmpPath = path + ".tmp ";
+ string cmd = "mkdir " + tmpPath
+ + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
+ system(cmd.c_str());
+
+ cmd = "rm -rf " + tmpPath + " " + reformattedPath;
+ system(cmd.c_str());
+
+ return sortedPath;
+}
+