Welcome to mirror list, hosted at ThFree Co, Russian Federation.

CreateProbingPT.cpp « misc - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: dff9166605543233db2e1089b935a84a438cbf7f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <cassert>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
#include "moses/InputFileStream.h"
#include "moses/OutputFileStream.h"
#include "moses/Util.h"

using namespace std;

// Defined below: rewrites the SCFG rule table at `path` (dropping the last
// token of each source side), sorts it, and returns the path of the sorted,
// gzip-compressed result.
std::string ReformatSCFGFile(const std::string &path);

int main(int argc, char* argv[])
{
	string inPath, outPath; 
	int num_scores = 4;
	int num_lex_scores = 0;
	bool log_prob = false;
	bool scfg = false;
	int max_cache_size = 50000;

  namespace po = boost::program_options;
  po::options_description desc("Options");
  desc.add_options()
  ("help", "Print help messages")
  ("input-pt", po::value<string>()->required(), "Text pt")
  ("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
  ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
  ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
  ("log-prob", "log (and floor) probabilities before storing")
  ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
  ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")

	;

  po::variables_map vm;
  try {
    po::store(po::parse_command_line(argc, argv, desc),
              vm); // can throw

    /** --help option
     */
    if ( vm.count("help")) {
      std::cout << desc << std::endl;
      return EXIT_SUCCESS;
    }

    po::notify(vm); // throws on error, so do after help in case
    // there are any problems
  } catch(po::error& e) {
    std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
    std::cerr << desc << std::endl;
    return EXIT_FAILURE;
  }

  if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
  if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
  if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
  if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
  if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
  if (vm.count("log-prob")) log_prob = true;
  if (vm.count("scfg")) scfg = true;


  if (scfg) {
    inPath = ReformatSCFGFile(inPath);
  }

  Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);

  //util::PrintUsage(std::cout);
  return 0;
}

// Wraps `arg` in single quotes for the shell, escaping embedded single quotes,
// so that a path containing spaces or shell metacharacters stays one word.
static std::string ShellQuote(const std::string &arg)
{
  std::string quoted("'");
  for (std::string::size_type i = 0; i < arg.size(); ++i) {
    if (arg[i] == '\'') {
      quoted += "'\\''"; // close quote, escaped quote, reopen quote
    } else {
      quoted += arg[i];
    }
  }
  quoted += "'";
  return quoted;
}

/**
 * Rewrites the SCFG rule table at `path` and sorts the result.
 *
 * Every input line is split on "|||". The last space-separated token of the
 * first (source) column is dropped -- presumably the LHS non-terminal, per the
 * --scfg help text; confirm against the rule-table format -- and the
 * remaining columns are copied through unchanged. The rewritten table is
 * written to `path + ".reformat.gz"`, sorted with `LC_ALL=C sort` using
 * `path + ".tmp"` as scratch directory, and both intermediates are removed
 * afterwards.
 *
 * @param path  Path of the input rule table.
 * @return      `path + ".reformat.sorted.gz"`, the sorted, gzip-compressed table.
 * @throws std::runtime_error if a line has fewer than three columns or the
 *         sort command reports failure.
 */
std::string ReformatSCFGFile(const std::string &path)
{
  Moses::InputFileStream inFile(path);
  const std::string reformattedPath = path + ".reformat.gz";
  Moses::OutputFileStream outFile(reformattedPath);

  std::string line;
  while (getline(inFile, line)) {
    const std::vector<std::string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
    // Checked at run time (not via assert) so release builds reject bad input too.
    if (toks.size() < 3) {
      throw std::runtime_error("Malformed rule, expected at least 3 '|||'-separated columns: " + line);
    }

    // source: every token except the last one.
    // `i + 1 < size()` avoids the size_t underflow of `size() - 1` when the
    // source column contains no tokens.
    const std::vector<std::string> sourceToks = Moses::Tokenize(toks[0], " ");
    for (size_t i = 0; i + 1 < sourceToks.size(); ++i) {
      outFile << sourceToks[i] << " ";
    }

    // other columns
    for (size_t i = 1; i < toks.size(); ++i) {
      outFile << "|||" << toks[i];
    }
    outFile << '\n'; // no per-line flush
  }

  inFile.Close();
  outFile.Close();

  const std::string sortedPath = path + ".reformat.sorted.gz";
  const std::string tmpPath = path + ".tmp";

  // Note: the shell reports the status of the last pipeline stage (and of
  // mkdir, through &&); a failure of an earlier stage may go unnoticed.
  const std::string sortCmd = "mkdir " + ShellQuote(tmpPath)
      + " && gzip -dc " + ShellQuote(reformattedPath)
      + " | LC_ALL=C sort -T " + ShellQuote(tmpPath)
      + " | gzip -c > " + ShellQuote(sortedPath);
  const int sortStatus = system(sortCmd.c_str());

  // Clean up the intermediates whether or not sorting worked; a failed
  // cleanup is only worth a warning.
  const std::string cleanupCmd = "rm -rf " + ShellQuote(tmpPath) + " " + ShellQuote(reformattedPath);
  if (system(cleanupCmd.c_str()) != 0) {
    std::cerr << "WARNING: command failed: " << cleanupCmd << std::endl;
  }

  if (sortStatus != 0) {
    throw std::runtime_error("Command failed: " + sortCmd);
  }

  return sortedPath;
}