diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2014-12-22 02:32:28 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2014-12-22 02:32:28 +0300 |
commit | 3d3367bcc722414a3f73e1d2a14b2ca928589844 (patch) | |
tree | 8923d458230d6bb60c6cfbd3f1899056db5d338d | |
parent | 0ae4b5cbba0e16de77ab8bc870e972c8c62755ea (diff) |
Added printer and statistics
-rw-r--r-- | contrib/bleu-champ/Corpus.hpp | 6 | ||||
-rw-r--r-- | contrib/bleu-champ/Dynamic.hpp | 28 | ||||
-rw-r--r-- | contrib/bleu-champ/Printer.hpp | 85 | ||||
-rw-r--r-- | contrib/bleu-champ/bleu-champ.cpp | 189 |
4 files changed, 190 insertions, 118 deletions
diff --git a/contrib/bleu-champ/Corpus.hpp b/contrib/bleu-champ/Corpus.hpp index 7cfe972ad..e76a43fc4 100644 --- a/contrib/bleu-champ/Corpus.hpp +++ b/contrib/bleu-champ/Corpus.hpp @@ -18,7 +18,7 @@ StringPiece operator+(const StringPiece& s1, const StringPiece& s2) { typedef std::vector<StringPiece> NGramsByOrder; typedef std::vector<NGramsByOrder> NGrams; -const size_t MAX_NGRAM_ORDER = 4; +const size_t MAX_NGRAM_ORDER = 2; class Sentence { public: @@ -71,7 +71,6 @@ class Sentence { } private: - //size_t m_id; StringPiece m_sentence; std::vector<StringPiece>* m_tokens; size_t m_start; @@ -135,7 +134,6 @@ class Corpus { Sentence sentence(StringPiece(m_corpus.c_str() + start, length), j, tokens, m_tokens); m_sentences.push_back(sentence); - //m_sentences.back().setId(m_sentences.size()); j += tokens; } @@ -161,7 +159,6 @@ class Corpus { Sentence sentence(StringPiece(m_corpus.c_str() + start, length), j, tokens, m_tokens); m_sentences.push_back(sentence); - //m_sentences.back().setId(m_sentences.size()); } const Sentence& operator()(int i, int j) const { @@ -184,7 +181,6 @@ class Corpus { else { Sentence* sentenceRange = new Sentence(m_sentences[i] + m_sentences[j]); m_ranges[range] = sentenceRange; - //sentenceRange->setId(m_sentences.size() + m_ranges.size()); return *sentenceRange; } } diff --git a/contrib/bleu-champ/Dynamic.hpp b/contrib/bleu-champ/Dynamic.hpp index 6f92013af..511a4f21b 100644 --- a/contrib/bleu-champ/Dynamic.hpp +++ b/contrib/bleu-champ/Dynamic.hpp @@ -3,9 +3,12 @@ #include <vector> #include <iostream> #include <algorithm> +#include <limits> /******************************************************************************/ +const float MIN = std::numeric_limits<float>::min(); + struct Bead { Bead() : m_bead{0 ,0} {} @@ -20,6 +23,10 @@ struct Bead { return const_cast<Bead&>(*this)[i]; } + bool operator<(const Bead& b) const { + return m_bead[0] < b[0] || (m_bead[0] == b[0] && m_bead[1] < b[1]); + } + size_t m_bead[2]; }; @@ -91,7 +98,7 @@ class Dynamic { public: Dynamic(CorpusType& corpus1, CorpusType& corpus2) : m_corpus1(corpus1), m_corpus2(corpus2), - m_seen(m_corpus1.size() + 1, std::vector<float>(m_corpus2.size() + 1, -100)), + m_seen(m_corpus1.size() + 1, std::vector<float>(m_corpus2.size() + 1, MIN)), m_prev(m_corpus1.size() + 1, std::vector<Bead>(m_corpus2.size() + 1)) {} @@ -100,19 +107,19 @@ class Dynamic { } float Align(int i, int j) { - if(i <= 0 && j <= 0) + if(i < 0 || j < 0 || (i == 0 && j == 0)) return 0; - if(m_seen[i][j] != -100) + if(m_seen[i][j] != MIN) return m_seen[i][j]; Beads allowedBeads = m_config.Search()(); - float bestScore = -1; + float bestScore = MIN; Bead bestBead = allowedBeads[0]; for(Bead& bead : allowedBeads) { - float score = -10; + float score = MIN; if(i >= bead[0] && j >= bead[1] && InCorridor(i - bead[0], j - bead[1])) { score = Align(i - bead[0], j - bead[1]) + m_config.Scorer()(m_corpus1(i - bead[0], i - 1), @@ -132,7 +139,6 @@ class Dynamic { bool InCorridor(size_t i, size_t j) { if(!m_corridor.empty()) { - //std::cout << "Corr: " << i << " " << j << std::endl; return m_corridor[i][j]; } return true; @@ -151,8 +157,8 @@ class Dynamic { return ladder; } - void BackTrack(size_t i, size_t j, Ladder& ladder) { - if(i == 0 && j == 0) + void BackTrack(int i, int j, Ladder& ladder) { + if(i < 0 || j < 0 || (i == 0 && j == 0)) return; Bead bead = m_prev[i][j]; @@ -163,10 +169,10 @@ class Dynamic { rung.j = j - bead[1]; rung.bead = bead; - if(m_seen[i - bead[0]][j - bead[1]] != -100) + if(m_seen[i - bead[0]][j - bead[1]] != MIN) rung.score = m_seen[i][j] - m_seen[i - bead[0]][j - bead[1]]; else - rung.score = 0; + rung.score = m_seen[i][j]; ladder.push_back(rung); } @@ -181,7 +187,7 @@ class Dynamic { int distance = width/2; m_corridor.resize(m + 1, std::vector<bool>(n + 1, false)); for(const Rung& r : ladder) { - for(int j = std::max(0, (int)r.j - distance); j < std::min((int)r.j + distance, (int)n); j++) + for(int j = std::max(0, (int)r.j - distance); j <= std::min((int)r.j + distance, (int)n); j++) m_corridor[r.i][j] = true; } } diff --git a/contrib/bleu-champ/Printer.hpp b/contrib/bleu-champ/Printer.hpp new file mode 100644 index 000000000..5fd007c5c --- /dev/null +++ b/contrib/bleu-champ/Printer.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include <vector> +#include <map> +#include <cstdio> + +#include "Dynamic.hpp" + +struct PrintParams { + bool printIds = false; + bool printBeads = false; + bool printScores = false; + bool printUnaligned = false; + bool print11 = false; + float printThreshold = 0; +}; + +struct TextFormat { + template <class Corpus> + static void Print(const Rung& r, const Corpus& source, const Corpus& target, + const PrintParams& params) { + if(r.i == source.size() && r.j == target.size()) + return; + + if(r.score < params.printThreshold) + return; + if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1)) + return; + if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0)) + return; + + const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1); + const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1); + + if(params.printIds) std::cout << r.i << " " << r.j << "\t"; + if(params.printBeads) std::cout << r.bead << "\t"; + if(params.printScores) std::cout << r.score << "\t"; + + std::cout << s1 << "\t" << s2 << std::endl; + } +}; + +struct LadderFormat { + template <class Corpus> + static void Print(const Rung& r, const Corpus& source, const Corpus& target, + const PrintParams& params) { + std::cout << r.i << "\t" << r.j << "\t" << r.score << std::endl; + } +}; + +template <class Format, class Corpus> +void Print(const Ladder& ladder, const Corpus& source, const Corpus& target, + const PrintParams& params) { + for(const Rung& rung : ladder) { + Format::Print(rung, source, target, params); + } +} + +void PrintStatistics(const Ladder& ladder) { + std::map<Bead, size_t> stats; + + size_t nonZero = 0; + float scoreSum = 0; + for(size_t i = 0; i < ladder.size()-1; i++) { + const Rung& r = ladder[i]; + stats[r.bead]++; + if(r.bead[0] > 0 && r.bead[1] > 0) { + scoreSum += r.score; + nonZero++; + } + } + + std::cerr << "Bead statistics: " << std::endl; + for(auto& item : stats) { + float percent = ((float)item.second/(ladder.size()-1)) * 100; + fprintf(stderr, " %lu-%lu : %4lu (%5.2f\%)\n", item.first[0], item.first[1], item.second, percent); + } + std::cerr << std::endl; + + std::cerr << "Quality: " << scoreSum/nonZero << std::endl; + std::cerr << "Quality including unaligned rungs: " << scoreSum/ladder.size() + << std::endl; + std::cerr << std::endl; +} + diff --git a/contrib/bleu-champ/bleu-champ.cpp b/contrib/bleu-champ/bleu-champ.cpp index bb54f4bca..531602d36 100644 --- a/contrib/bleu-champ/bleu-champ.cpp +++ b/contrib/bleu-champ/bleu-champ.cpp @@ -1,7 +1,6 @@ #include <iostream> #include <vector> -#include <map> #include <boost/program_options.hpp> #include <boost/timer/timer.hpp> @@ -11,84 +10,48 @@ #include "Dynamic.hpp" #include "Scorer.hpp" #include "Corpus.hpp" +#include "Printer.hpp" namespace po = boost::program_options; template <class Config, class Corpus> -Ladder FirstPass(Corpus &source, Corpus &target) { - std::cerr << " Pass 1: Tracing path with 1-1 beads:" << std::endl; - std::cerr << " Computing best path" << std::endl; +Ladder FirstPass(Corpus &source, Corpus &target, bool quiet) { + boost::timer::auto_cpu_timer t(std::cerr, 2, " Time: %t sec CPU, %w sec real\n"); + if(!quiet) std::cerr << "Pass 1: Tracing path with 1-1 beads:" << std::endl; + if(!quiet) std::cerr << " Computing best path" << std::endl; Dynamic<Config, Corpus> aligner(source, target); aligner.Align(); - std::cerr << " Back-tracking" << std::endl; + if(!quiet) std::cerr << " Back-tracking" << std::endl; Ladder path = aligner.BackTrack(); - std::cerr << " Done" << std::endl; - std::cerr << std::endl; + t.stop(); + if(!quiet) t.report(); + if(!quiet) std::cerr << std::endl; return path; } template <class Config, class Corpus> -Ladder SecondPass(Corpus &source, Corpus &target, const Ladder& path, size_t corridorWidth) { - std::cerr << " Pass 2: Tracing path with all beads:" << std::endl; - std::cerr << " Setting corridor width to " << corridorWidth << std::endl; +Ladder SecondPass(Corpus &source, Corpus &target, const Ladder& path, size_t corridorWidth, bool quiet) { + boost::timer::auto_cpu_timer t(std::cerr, 2, " Time: %t sec CPU, %w sec real\n"); + if(!quiet) std::cerr << "Pass 2: Tracing path with all beads:" << std::endl; + if(!quiet) std::cerr << " Setting corridor width to " << corridorWidth << std::endl; Dynamic<Config, Corpus> aligner(source, target); aligner.SetCorridor(path, corridorWidth); - std::cerr << " Computing best path within corridor" << std::endl; + if(!quiet) std::cerr << " Computing best path within corridor" << std::endl; aligner.Align(); - std::cerr << " Back-tracking" << std::endl; + if(!quiet) std::cerr << " Back-tracking" << std::endl; Ladder rungs = aligner.BackTrack(); - std::cerr << " Done" << std::endl; - std::cerr << std::endl; + t.stop(); + if(!quiet) t.report(); + if(!quiet) std::cerr << std::endl; return rungs; } -struct PrintParams { - bool printIds = false; - bool printBeads = false; - bool printScores = false; - bool printUnaligned = false; - bool print11 = false; - float printThreshold = 0; -}; - -struct TextFormat { - static void Print(const Rung& r, const Corpus& source, const Corpus& target, const PrintParams& params) { - if(r.score < params.printThreshold) - return; - if(params.print11 && (r.bead[0] != 1 || r.bead[1] != 1)) - return; - if(!params.printUnaligned && (r.bead[0] == 0 || r.bead[1] == 0)) - return; - - const Sentence& s1 = source(r.i, r.i + r.bead[0] - 1); - const Sentence& s2 = target(r.j, r.j + r.bead[1] - 1); - - if(params.printIds) std::cout << r.i << " " << r.j << "\t"; - if(params.printBeads) std::cout << r.bead << "\t"; - if(params.printScores) std::cout << r.score << "\t"; - - std::cout << s1 << "\t" << s2 << std::endl; - } -}; - -struct LadderFormat { - static void Print(const Rung& r, const Corpus& source, const Corpus& target, const PrintParams& params) { - std::cout << r.i << "\t" << r.j << "\t" << r.score << std::endl; - } -}; - -template <class Format, class Corpus> -void Print(const Ladder& ladder, const Corpus& source, const Corpus& target, const PrintParams& params) { - for(const Rung& rung : ladder) { - Format::Print(rung, source, target, params); - } -} - int main(int argc, char** argv) { - boost::timer::auto_cpu_timer t(std::cerr); - bool help; + bool skip1st; + bool skip2nd; + bool quiet; std::string sourceFileName; std::string targetFileName; @@ -101,44 +64,59 @@ int main(int argc, char** argv) PrintParams params; - po::options_description desc("Allowed options"); - desc.add_options() + po::options_description general("General options"); + general.add_options() ("source,s", po::value<std::string>(&sourceFileName)->required(), "Source language file, used for alignment computation") ("target,t", po::value<std::string>(&targetFileName)->required(), "Target language file, used for alignment computation") - - ("Source,S", po::value<std::string>(&sourceFileNameOrig), - "Substitute source language file, if given will replace output of --source") - ("Target,T", po::value<std::string>(&targetFileNameOrig), - "Substitute target language file, if given will replace output of --target") + ("help,h", po::value(&help)->zero_tokens()->default_value(false), + "Print this help message and exit") + ("quiet,q", po::value(&quiet)->zero_tokens()->default_value(false), + "Do not print anything to stderr") + ; + po::options_description algo("Alignment algorithm options"); + algo.add_options() ("width,w", po::value(&corridorWidth)->default_value(30), "Width of search corridor around 1-1 path") - + ("skip-1st", po::value(&skip1st)->zero_tokens()->default_value(false), + "Skip 1st pass. Can be very slow for larger files") + ("skip-2nd", po::value(&skip2nd)->zero_tokens()->default_value(false), + "Skip 2nd pass and output only 1-1 path") + ; + + po::options_description output("Output options"); + output.add_options() + ("ladder,l", po::value(&ladderFormat)->zero_tokens()->default_value(false), "Output in hunalign ladder format (not affected by other printing options)") + ("Source,S", po::value<std::string>(&sourceFileNameOrig), + "Substitute source language file, used only for output in text mode. " + "Has to be sentence aligned with --source arg") + ("Target,T", po::value<std::string>(&targetFileNameOrig), + "Substitute target language file, used only for output in text mode. " + "Has to be sentence aligned with --target arg") + + ("min-score,m", po::value(¶ms.printThreshold)->default_value(0), "Print rungs with scores of at least arg") ("print-beads,b", po::value(¶ms.printBeads)->zero_tokens()->default_value(false), - "Print column of beads") + "Print column with beads") ("print-ids,i", po::value(¶ms.printIds)->zero_tokens()->default_value(false), - "Print column of sentence ids") + "Print column with sentence ids") ("print-scores,p", po::value(¶ms.printScores)->zero_tokens()->default_value(false), - "Print column of scores") + "Print column with scores") ("print-1-1,1", po::value(¶ms.print11)->zero_tokens()->default_value(false), "Print only 1-1 rungs") ("print-unaligned,u", po::value(¶ms.printUnaligned)->zero_tokens()->default_value(false), "Print unaligned sentences") - - ("help,h", po::value(&help)->zero_tokens()->default_value(false), - "Print this help message and exit") ; - po::options_description cmdline_options; - cmdline_options.add(desc); + po::options_description cmdline_options("Allowed options"); + cmdline_options.add(general).add(algo).add(output); po::variables_map vm; try { @@ -150,47 +128,54 @@ int main(int argc, char** argv) std::cout << "Error: " << e.what() << std::endl << std::endl; std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl; - std::cout << desc << std::endl; + std::cout << cmdline_options << std::endl; exit(0); } if (help) { std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl; - std::cout << desc << std::endl; + std::cout << cmdline_options << std::endl; exit(0); } + + boost::timer::auto_cpu_timer t(std::cerr, 2, "Total time: %t sec CPU, %w sec real\n"); - std::cerr << std::endl; + if(!quiet) std::cerr << std::endl; + std::shared_ptr<Corpus> source(new Corpus(sourceFileName)); + if(!quiet) std::cerr << "Loaded " << source->size() << " source sentences" << std::endl; + std::shared_ptr<Corpus> target(new Corpus(targetFileName)); + if(!quiet) std::cerr << "Loaded " << target->size() << " target sentences" << std::endl; + if(!quiet) std::cerr << std::endl; - Corpus source(sourceFileName); - std::cerr << " Loaded " << source.size() << " source sentences" << std::endl; - Corpus target(targetFileName); - std::cerr << " Loaded " << target.size() << " target sentences" << std::endl; - std::cerr << std::endl; - - Ladder rungs11 = FirstPass<Config<BLEU<2>, Fast>, Corpus>(source, target); - Ladder rungsMN = SecondPass<Config<BLEU<2>, Full>, Corpus>(source, target, rungs11, corridorWidth); + Ladder rungsMN; + + if(skip2nd) { + rungsMN = FirstPass<Config<BLEU<2>, Fast>, Corpus>(*source, *target, quiet); + } + else if(skip1st) { + rungsMN = FirstPass<Config<BLEU<2>, Full>, Corpus>(*source, *target, quiet); + } + else { + Ladder rungs11 = FirstPass<Config<BLEU<2>, Fast>, Corpus>(*source, *target, quiet); + rungsMN = SecondPass<Config<BLEU<2>, Full>, Corpus>(*source, *target, rungs11, corridorWidth, quiet); + } + + t.stop(); + if(!quiet) t.report(); + if(!quiet) std::cerr << std::endl; + if(sourceFileNameOrig.size()) + source.reset(new Corpus(sourceFileNameOrig)); + if(targetFileNameOrig.size()) + target.reset(new Corpus(targetFileNameOrig)); + if(ladderFormat) { - Print<LadderFormat>(rungsMN, source, target, params); + Print<LadderFormat>(rungsMN, *source, *target, params); } else { - Print<TextFormat>(rungsMN, source, target, params); + Print<TextFormat>(rungsMN, *source, *target, params); } - //float scoreSum = 0; - //size_t keptRungs = 0; - // - //std::map<std::pair<size_t,size_t>, size_t> stats; - // - // stats[std::make_pair(r.bead[0], r.bead[1])]++; - // - //std::cerr << " Bead statistics: " << std::endl; - //for(auto& item : stats) - // std::cerr << " " << item.first.first << "-" << item.first.second << " : " << item.second << std::endl; - // - //std::cerr << std::endl; - //std::cerr << " Quality of aligned rungs: " << scoreSum/keptRungs << std::endl; - //std::cerr << " Quality: " << scoreSum/rungs.size() << std::endl; - //std::cerr << std::endl; + if(!quiet) + PrintStatistics(rungsMN); } |