Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarcin Junczys-Dowmunt <junczys@amu.edu.pl>2014-12-22 02:32:28 +0300
committerMarcin Junczys-Dowmunt <junczys@amu.edu.pl>2014-12-22 02:32:28 +0300
commit3d3367bcc722414a3f73e1d2a14b2ca928589844 (patch)
tree8923d458230d6bb60c6cfbd3f1899056db5d338d
parent0ae4b5cbba0e16de77ab8bc870e972c8c62755ea (diff)
Added printer and statistics
-rw-r--r--contrib/bleu-champ/Corpus.hpp6
-rw-r--r--contrib/bleu-champ/Dynamic.hpp28
-rw-r--r--contrib/bleu-champ/Printer.hpp85
-rw-r--r--contrib/bleu-champ/bleu-champ.cpp189
4 files changed, 190 insertions, 118 deletions
diff --git a/contrib/bleu-champ/Corpus.hpp b/contrib/bleu-champ/Corpus.hpp
index 7cfe972ad..e76a43fc4 100644
--- a/contrib/bleu-champ/Corpus.hpp
+++ b/contrib/bleu-champ/Corpus.hpp
@@ -18,7 +18,7 @@ StringPiece operator+(const StringPiece& s1, const StringPiece& s2) {
typedef std::vector<StringPiece> NGramsByOrder;
typedef std::vector<NGramsByOrder> NGrams;
-const size_t MAX_NGRAM_ORDER = 4;
+const size_t MAX_NGRAM_ORDER = 2;
class Sentence {
public:
@@ -71,7 +71,6 @@ class Sentence {
}
private:
- //size_t m_id;
StringPiece m_sentence;
std::vector<StringPiece>* m_tokens;
size_t m_start;
@@ -135,7 +134,6 @@ class Corpus {
Sentence sentence(StringPiece(m_corpus.c_str() + start, length),
j, tokens, m_tokens);
m_sentences.push_back(sentence);
- //m_sentences.back().setId(m_sentences.size());
j += tokens;
}
@@ -161,7 +159,6 @@ class Corpus {
Sentence sentence(StringPiece(m_corpus.c_str() + start, length),
j, tokens, m_tokens);
m_sentences.push_back(sentence);
- //m_sentences.back().setId(m_sentences.size());
}
const Sentence& operator()(int i, int j) const {
@@ -184,7 +181,6 @@ class Corpus {
else {
Sentence* sentenceRange = new Sentence(m_sentences[i] + m_sentences[j]);
m_ranges[range] = sentenceRange;
- //sentenceRange->setId(m_sentences.size() + m_ranges.size());
return *sentenceRange;
}
}
diff --git a/contrib/bleu-champ/Dynamic.hpp b/contrib/bleu-champ/Dynamic.hpp
index 6f92013af..511a4f21b 100644
--- a/contrib/bleu-champ/Dynamic.hpp
+++ b/contrib/bleu-champ/Dynamic.hpp
@@ -3,9 +3,12 @@
#include <vector>
#include <iostream>
#include <algorithm>
+#include <limits>
/******************************************************************************/
+const float MIN = std::numeric_limits<float>::min(); // "not computed yet" marker stored in m_seen. NOTE(review): for float, min() is the smallest positive normalized value, not the most negative one (that is lowest()) -- confirm this is the intended sentinel
+
struct Bead {
Bead() : m_bead{0 ,0} {}
@@ -20,6 +23,10 @@ struct Bead {
return const_cast<Bead&>(*this)[i];
}
+ // Strict ordering of beads: the first components decide, and the second
+ // components break ties.  Lets Bead serve as a key in ordered containers.
+ bool operator<(const Bead& b) const {
+   if(m_bead[0] != b[0])
+     return m_bead[0] < b[0];
+   return m_bead[1] < b[1];
+ }
+
size_t m_bead[2];
};
@@ -91,7 +98,7 @@ class Dynamic {
public:
Dynamic(CorpusType& corpus1, CorpusType& corpus2)
: m_corpus1(corpus1), m_corpus2(corpus2),
- m_seen(m_corpus1.size() + 1, std::vector<float>(m_corpus2.size() + 1, -100)),
+ m_seen(m_corpus1.size() + 1, std::vector<float>(m_corpus2.size() + 1, MIN)),
m_prev(m_corpus1.size() + 1, std::vector<Bead>(m_corpus2.size() + 1))
{}
@@ -100,19 +107,19 @@ class Dynamic {
}
float Align(int i, int j) {
- if(i <= 0 && j <= 0)
+ if(i < 0 || j < 0 || (i == 0 && j == 0))
return 0;
- if(m_seen[i][j] != -100)
+ if(m_seen[i][j] != MIN)
return m_seen[i][j];
Beads allowedBeads = m_config.Search()();
- float bestScore = -1;
+ float bestScore = MIN;
Bead bestBead = allowedBeads[0];
for(Bead& bead : allowedBeads) {
- float score = -10;
+ float score = MIN;
if(i >= bead[0] && j >= bead[1] && InCorridor(i - bead[0], j - bead[1])) {
score = Align(i - bead[0], j - bead[1])
+ m_config.Scorer()(m_corpus1(i - bead[0], i - 1),
@@ -132,7 +139,6 @@ class Dynamic {
bool InCorridor(size_t i, size_t j) {
if(!m_corridor.empty()) {
- //std::cout << "Corr: " << i << " " << j << std::endl;
return m_corridor[i][j];
}
return true;
@@ -151,8 +157,8 @@ class Dynamic {
return ladder;
}
- void BackTrack(size_t i, size_t j, Ladder& ladder) {
- if(i == 0 && j == 0)
+ void BackTrack(int i, int j, Ladder& ladder) {
+ if(i < 0 || j < 0 || (i == 0 && j == 0))
return;
Bead bead = m_prev[i][j];
@@ -163,10 +169,10 @@ class Dynamic {
rung.j = j - bead[1];
rung.bead = bead;
- if(m_seen[i - bead[0]][j - bead[1]] != -100)
+ if(m_seen[i - bead[0]][j - bead[1]] != MIN)
rung.score = m_seen[i][j] - m_seen[i - bead[0]][j - bead[1]];
else
- rung.score = 0;
+ rung.score = m_seen[i][j];
ladder.push_back(rung);
}
@@ -181,7 +187,7 @@ class Dynamic {
int distance = width/2;
m_corridor.resize(m + 1, std::vector<bool>(n + 1, false));
for(const Rung& r : ladder) {
- for(int j = std::max(0, (int)r.j - distance); j < std::min((int)r.j + distance, (int)n); j++)
+ for(int j = std::max(0, (int)r.j - distance); j <= std::min((int)r.j + distance, (int)n); j++)
m_corridor[r.i][j] = true;
}
}
diff --git a/contrib/bleu-champ/Printer.hpp b/contrib/bleu-champ/Printer.hpp
new file mode 100644
index 000000000..5fd007c5c
--- /dev/null
+++ b/contrib/bleu-champ/Printer.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <vector>
+#include <map>
+#include <cstdio>
+
+#include "Dynamic.hpp"
+
+// Switches read by the rung printers.  Every flag starts out disabled and the
+// score threshold starts at zero.
+struct PrintParams {
+  bool printIds{false};        // TextFormat: prefix the row with "r.i r.j"
+  bool printBeads{false};      // TextFormat: add a column holding r.bead
+  bool printScores{false};     // TextFormat: add a column holding r.score
+  bool printUnaligned{false};  // TextFormat: keep rungs with a 0 bead side
+  bool print11{false};         // TextFormat: keep only rungs with a 1-1 bead
+  float printThreshold{0.0f};  // TextFormat: drop rungs scoring below this
+};
+
+// Plain-text rung printer: writes one tab-separated line per selected rung to
+// std::cout.
+struct TextFormat {
+  // Prints rung r as "source<TAB>target", optionally preceded by id, bead and
+  // score columns as selected in params.  Nothing is printed when
+  //  - r sits at (source.size(), target.size()),
+  //  - r.score is below params.printThreshold,
+  //  - params.print11 is set and the bead is not 1-1, or
+  //  - params.printUnaligned is unset and either bead side is 0.
+  template <class Corpus>
+  static void Print(const Rung& r, const Corpus& source, const Corpus& target,
+                    const PrintParams& params) {
+    const bool atCorpusEnd = (r.i == source.size() && r.j == target.size());
+    const bool belowThreshold = (r.score < params.printThreshold);
+    const bool oneToOne = (r.bead[0] == 1 && r.bead[1] == 1);
+    const bool oneSideEmpty = (r.bead[0] == 0 || r.bead[1] == 0);
+
+    if(atCorpusEnd || belowThreshold
+       || (params.print11 && !oneToOne)
+       || (oneSideEmpty && !params.printUnaligned))
+      return;
+
+    // Sentence ranges covered by the bead on each side, starting at r.i / r.j.
+    const Sentence& srcText = source(r.i, r.i + r.bead[0] - 1);
+    const Sentence& trgText = target(r.j, r.j + r.bead[1] - 1);
+
+    if(params.printIds)
+      std::cout << r.i << " " << r.j << "\t";
+    if(params.printBeads)
+      std::cout << r.bead << "\t";
+    if(params.printScores)
+      std::cout << r.score << "\t";
+
+    std::cout << srcText << "\t" << trgText << std::endl;
+  }
+};
+
+// Ladder-style rung printer: one line "i<TAB>j<TAB>score" per rung on
+// std::cout.  source, target and params are not used here; the signature
+// mirrors TextFormat::Print so both can be plugged into Print<Format>().
+struct LadderFormat {
+  template <class Corpus>
+  static void Print(const Rung& r, const Corpus& source, const Corpus& target,
+                    const PrintParams& params) {
+    std::cout << r.i << '\t' << r.j << '\t' << r.score;
+    std::cout << std::endl;
+  }
+};
+
+// Hands every rung of ladder, in order, to Format::Print together with the
+// two corpora and the print parameters.
+template <class Format, class Corpus>
+void Print(const Ladder& ladder, const Corpus& source, const Corpus& target,
+           const PrintParams& params) {
+  const size_t rungCount = ladder.size();
+  for(size_t k = 0; k < rungCount; ++k)
+    Format::Print(ladder[k], source, target, params);
+}
+
+// Writes a summary of the alignment ladder to stderr: how often each bead
+// type occurs (count and percentage) and the average rung score.
+//
+// The last rung of the ladder is left out of all figures (NOTE(review):
+// presumably the closing rung at the end of both corpora, which
+// TextFormat::Print also skips -- confirm against Dynamic::BackTrack).
+// "Quality" averages the score over rungs with sentences on both sides;
+// "Quality including unaligned rungs" averages over every counted rung.
+// An average with nothing to average over is reported as 0.
+//
+// Declared inline because a function defined in a header would otherwise be
+// defined once per including translation unit.
+inline void PrintStatistics(const Ladder& ladder) {
+  // Guard the size()-1: on an empty ladder the unsigned value would wrap.
+  const size_t counted = ladder.empty() ? 0 : ladder.size() - 1;
+
+  std::map<Bead, size_t> stats;
+  size_t nonZero = 0;
+  float scoreSum = 0;
+  for(size_t i = 0; i < counted; i++) {
+    const Rung& r = ladder[i];
+    stats[r.bead]++;
+    if(r.bead[0] > 0 && r.bead[1] > 0) {
+      scoreSum += r.score;
+      nonZero++;
+    }
+  }
+
+  std::cerr << "Bead statistics: " << std::endl;
+  for(const auto& item : stats) {
+    // stats is only non-empty when counted > 0, so this division is safe.
+    const float percent = (static_cast<float>(item.second) / counted) * 100;
+    // "%%" prints a literal percent sign; %zu is the conversion for size_t.
+    fprintf(stderr, " %zu-%zu : %4zu (%5.2f%%)\n",
+            item.first[0], item.first[1], item.second, percent);
+  }
+  std::cerr << std::endl;
+
+  const float alignedQuality = nonZero > 0 ? scoreSum / nonZero : 0;
+  const float overallQuality = counted > 0 ? scoreSum / counted : 0;
+  std::cerr << "Quality: " << alignedQuality << std::endl;
+  std::cerr << "Quality including unaligned rungs: " << overallQuality
+            << std::endl;
+  std::cerr << std::endl;
+}
+
diff --git a/contrib/bleu-champ/bleu-champ.cpp b/contrib/bleu-champ/bleu-champ.cpp
index bb54f4bca..531602d36 100644
--- a/contrib/bleu-champ/bleu-champ.cpp
+++ b/contrib/bleu-champ/bleu-champ.cpp
@@ -1,7 +1,6 @@
#include <iostream>
#include <vector>
-#include <map>
#include <boost/program_options.hpp>
#include <boost/timer/timer.hpp>
@@ -11,84 +10,48 @@
#include "Dynamic.hpp"
#include "Scorer.hpp"
#include "Corpus.hpp"
+#include "Printer.hpp"
namespace po = boost::program_options;
template <class Config, class Corpus>
-Ladder FirstPass(Corpus &source, Corpus &target) {
- std::cerr << " Pass 1: Tracing path with 1-1 beads:" << std::endl;
- std::cerr << " Computing best path" << std::endl;
+Ladder FirstPass(Corpus &source, Corpus &target, bool quiet) {
+ boost::timer::auto_cpu_timer t(std::cerr, 2, " Time: %t sec CPU, %w sec real\n");
+ if(!quiet) std::cerr << "Pass 1: Tracing path with 1-1 beads:" << std::endl;
+ if(!quiet) std::cerr << " Computing best path" << std::endl;
Dynamic<Config, Corpus> aligner(source, target);
aligner.Align();
- std::cerr << " Back-tracking" << std::endl;
+ if(!quiet) std::cerr << " Back-tracking" << std::endl;
Ladder path = aligner.BackTrack();
- std::cerr << " Done" << std::endl;
- std::cerr << std::endl;
+ t.stop();
+ if(!quiet) t.report();
+ if(!quiet) std::cerr << std::endl;
return path;
}
template <class Config, class Corpus>
-Ladder SecondPass(Corpus &source, Corpus &target, const Ladder& path, size_t corridorWidth) {
- std::cerr << " Pass 2: Tracing path with all beads:" << std::endl;
- std::cerr << " Setting corridor width to " << corridorWidth << std::endl;
+Ladder SecondPass(Corpus &source, Corpus &target, const Ladder& path, size_t corridorWidth, bool quiet) {
+ boost::timer::auto_cpu_timer t(std::cerr, 2, " Time: %t sec CPU, %w sec real\n");
+ if(!quiet) std::cerr << "Pass 2: Tracing path with all beads:" << std::endl;
+ if(!quiet) std::cerr << " Setting corridor width to " << corridorWidth << std::endl;
Dynamic<Config, Corpus> aligner(source, target);
aligner.SetCorridor(path, corridorWidth);
- std::cerr << " Computing best path within corridor" << std::endl;
+ if(!quiet) std::cerr << " Computing best path within corridor" << std::endl;
aligner.Align();
- std::cerr << " Back-tracking" << std::endl;
+ if(!quiet) std::cerr << " Back-tracking" << std::endl;
Ladder rungs = aligner.BackTrack();
- std::cerr << " Done" << std::endl;
- std::cerr << std::endl;
+ t.stop();
+ if(!quiet) t.report();
+ if(!quiet) std::cerr << std::endl;
return rungs;
}
-struct PrintParams {
- bool printIds = false;
- bool printBeads = false;
- bool printScores = false;
- bool printUnaligned = false;
- bool print11 = false;
- float printThreshold = 0;
-};
-
-struct TextFormat {
- static void Print(const Rung& r, const Corpus& source, const Corpus& target, const PrintParams& params) {
- if(r.score < params.printThreshold)
- return;
- if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1))
- return;
- if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0))
- return;
-
- const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1);
- const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1);
-
- if(params.printIds) std::cout << r.i << " " << r.j << "\t";
- if(params.printBeads) std::cout << r.bead << "\t";
- if(params.printScores) std::cout << r.score << "\t";
-
- std::cout << s1 << "\t" << s2 << std::endl;
- }
-};
-
-struct LadderFormat {
- static void Print(const Rung& r, const Corpus& source, const Corpus& target, const PrintParams& params) {
- std::cout << r.i << "\t" << r.j << "\t" << r.score << std::endl;
- }
-};
-
-template <class Format, class Corpus>
-void Print(const Ladder& ladder, const Corpus& source, const Corpus& target, const PrintParams& params) {
- for(const Rung& rung : ladder) {
- Format::Print(rung, source, target, params);
- }
-}
-
int main(int argc, char** argv)
{
- boost::timer::auto_cpu_timer t(std::cerr);
-
bool help;
+ bool skip1st;
+ bool skip2nd;
+ bool quiet;
std::string sourceFileName;
std::string targetFileName;
@@ -101,44 +64,59 @@ int main(int argc, char** argv)
PrintParams params;
- po::options_description desc("Allowed options");
- desc.add_options()
+ po::options_description general("General options");
+ general.add_options()
("source,s", po::value<std::string>(&sourceFileName)->required(),
"Source language file, used for alignment computation")
("target,t", po::value<std::string>(&targetFileName)->required(),
"Target language file, used for alignment computation")
-
- ("Source,S", po::value<std::string>(&sourceFileNameOrig),
- "Substitute source language file, if given will replace output of --source")
- ("Target,T", po::value<std::string>(&targetFileNameOrig),
- "Substitute target language file, if given will replace output of --target")
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false),
+ "Print this help message and exit")
+ ("quiet,q", po::value(&quiet)->zero_tokens()->default_value(false),
+ "Do not print anything to stderr")
+ ;
+ po::options_description algo("Alignment algorithm options");
+ algo.add_options()
("width,w", po::value(&corridorWidth)->default_value(30),
"Width of search corridor around 1-1 path")
-
+ ("skip-1st", po::value(&skip1st)->zero_tokens()->default_value(false),
+ "Skip 1st pass. Can be very slow for larger files")
+ ("skip-2nd", po::value(&skip2nd)->zero_tokens()->default_value(false),
+ "Skip 2nd pass and output only 1-1 path")
+ ;
+
+ po::options_description output("Output options");
+ output.add_options()
+
("ladder,l", po::value(&ladderFormat)->zero_tokens()->default_value(false),
"Output in hunalign ladder format (not affected by other printing options)")
+ ("Source,S", po::value<std::string>(&sourceFileNameOrig),
+ "Substitute source language file, used only for output in text mode. "
+ "Has to be sentence aligned with --source arg")
+ ("Target,T", po::value<std::string>(&targetFileNameOrig),
+ "Substitute target language file, used only for output in text mode. "
+ "Has to be sentence aligned with --target arg")
+
+
("min-score,m", po::value(&params.printThreshold)->default_value(0),
"Print rungs with scores of at least arg")
("print-beads,b", po::value(&params.printBeads)->zero_tokens()->default_value(false),
- "Print column of beads")
+ "Print column with beads")
("print-ids,i", po::value(&params.printIds)->zero_tokens()->default_value(false),
- "Print column of sentence ids")
+ "Print column with sentence ids")
("print-scores,p", po::value(&params.printScores)->zero_tokens()->default_value(false),
- "Print column of scores")
+ "Print column with scores")
("print-1-1,1", po::value(&params.print11)->zero_tokens()->default_value(false),
"Print only 1-1 rungs")
("print-unaligned,u", po::value(&params.printUnaligned)->zero_tokens()->default_value(false),
"Print unaligned sentences")
-
- ("help,h", po::value(&help)->zero_tokens()->default_value(false),
- "Print this help message and exit")
;
- po::options_description cmdline_options;
- cmdline_options.add(desc);
+ po::options_description cmdline_options("Allowed options");
+ cmdline_options.add(general).add(algo).add(output);
po::variables_map vm;
try {
@@ -150,47 +128,54 @@ int main(int argc, char** argv)
std::cout << "Error: " << e.what() << std::endl << std::endl;
std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl;
- std::cout << desc << std::endl;
+ std::cout << cmdline_options << std::endl;
exit(0);
}
if (help) {
std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl;
- std::cout << desc << std::endl;
+ std::cout << cmdline_options << std::endl;
exit(0);
}
+
+ boost::timer::auto_cpu_timer t(std::cerr, 2, "Total time: %t sec CPU, %w sec real\n");
- std::cerr << std::endl;
+ if(!quiet) std::cerr << std::endl;
+ std::shared_ptr<Corpus> source(new Corpus(sourceFileName));
+ if(!quiet) std::cerr << "Loaded " << source->size() << " source sentences" << std::endl;
+ std::shared_ptr<Corpus> target(new Corpus(targetFileName));
+ if(!quiet) std::cerr << "Loaded " << target->size() << " target sentences" << std::endl;
+ if(!quiet) std::cerr << std::endl;
- Corpus source(sourceFileName);
- std::cerr << " Loaded " << source.size() << " source sentences" << std::endl;
- Corpus target(targetFileName);
- std::cerr << " Loaded " << target.size() << " target sentences" << std::endl;
- std::cerr << std::endl;
-
- Ladder rungs11 = FirstPass<Config<BLEU<2>, Fast>, Corpus>(source, target);
- Ladder rungsMN = SecondPass<Config<BLEU<2>, Full>, Corpus>(source, target, rungs11, corridorWidth);
+ Ladder rungsMN;
+
+ if(skip2nd) {
+ rungsMN = FirstPass<Config<BLEU<2>, Fast>, Corpus>(*source, *target, quiet);
+ }
+ else if(skip1st) {
+ rungsMN = FirstPass<Config<BLEU<2>, Full>, Corpus>(*source, *target, quiet);
+ }
+ else {
+ Ladder rungs11 = FirstPass<Config<BLEU<2>, Fast>, Corpus>(*source, *target, quiet);
+ rungsMN = SecondPass<Config<BLEU<2>, Full>, Corpus>(*source, *target, rungs11, corridorWidth, quiet);
+ }
+
+ t.stop();
+ if(!quiet) t.report();
+ if(!quiet) std::cerr << std::endl;
+ if(sourceFileNameOrig.size())
+ source.reset(new Corpus(sourceFileNameOrig));
+ if(targetFileNameOrig.size())
+ target.reset(new Corpus(targetFileNameOrig));
+
if(ladderFormat) {
- Print<LadderFormat>(rungsMN, source, target, params);
+ Print<LadderFormat>(rungsMN, *source, *target, params);
}
else {
- Print<TextFormat>(rungsMN, source, target, params);
+ Print<TextFormat>(rungsMN, *source, *target, params);
}
- //float scoreSum = 0;
- //size_t keptRungs = 0;
- //
- //std::map<std::pair<size_t,size_t>, size_t> stats;
- //
- // stats[std::make_pair(r.bead[0], r.bead[1])]++;
- //
- //std::cerr << " Bead statistics: " << std::endl;
- //for(auto& item : stats)
- // std::cerr << " " << item.first.first << "-" << item.first.second << " : " << item.second << std::endl;
- //
- //std::cerr << std::endl;
- //std::cerr << " Quality of aligned rungs: " << scoreSum/keptRungs << std::endl;
- //std::cerr << " Quality: " << scoreSum/rungs.size() << std::endl;
- //std::cerr << std::endl;
+ if(!quiet)
+ PrintStatistics(rungsMN);
}