Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/training/memscore/scorer.cpp')
-rw-r--r--scripts/training/memscore/scorer.cpp402
1 files changed, 0 insertions, 402 deletions
diff --git a/scripts/training/memscore/scorer.cpp b/scripts/training/memscore/scorer.cpp
deleted file mode 100644
index e8cf4ce49..000000000
--- a/scripts/training/memscore/scorer.cpp
+++ /dev/null
@@ -1,402 +0,0 @@
-// memscore - in-memory phrase scoring for Statistical Machine Translation
-// Christian Hardmeier, FBK-irst, Trento, 2010
-// $Id$
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-
-#include "scorer-impl.h"
-#include "timestamp.h"
-#include "lexdecom.h"
-
-#ifdef ENABLE_CHANNEL_SCORER
-#include "channel-scorer.h"
-#endif
-
-const std::vector<String> &PhraseScorerFactory::scorer_list()
-{
- static std::vector<String> list;
- if(list.size() == 0) {
- list.push_back("ml - maximum likelihood score (relative frequency)");
- list.push_back("wittenbell - Witten-Bell smoothing");
- list.push_back("absdiscount - absolute discounting");
- list.push_back("kndiscount1 - Knesser-Ney discounting");
- list.push_back("kndiscount3 - modified Knesser-Ney discounting");
- list.push_back("lexweights <weightfile> - lexical weights (Koehn et al., NAACL 2003)");
-#ifdef ENABLE_CHANNEL_SCORER
- list.push_back("channel <sigma> <srclm> <tgtlm> - channel adaptation");
-#endif
- list.push_back("const <constant> - constant phrase penalty");
- list.push_back("lexdecomp <weightfile> - lexical decomposition smoothing");
- }
-
- return list;
-}
-
-PhraseScorer *PhraseScorerFactory::create_scorer(const char *argv[], int &argp, bool reverse)
-{
- const char *arg = argv[argp++];
- if(arg == NULL)
- usage();
-
- if(!strcmp(arg, "ml"))
- return MLPhraseScorer::create_scorer(argv, argp, reverse, *this);
- else if(!strcmp(arg, "wittenbell"))
- return WittenBellPhraseScorer::create_scorer(argv, argp, reverse, *this);
- else if(!strcmp(arg, "absdiscount"))
- return AbsoluteDiscountPhraseScorer::create_scorer(argv, argp, reverse, *this);
- else if(!strcmp(arg, "kndiscount1"))
- return KNDiscount1PhraseScorer::create_scorer(argv, argp, reverse, *this);
- else if(!strcmp(arg, "kndiscount3"))
- return KNDiscount3PhraseScorer::create_scorer(argv, argp, reverse, *this);
- else if(!strcmp(arg, "lexweights"))
- return LexicalWeightPhraseScorer::create_scorer(argv, argp, reverse, *this);
-#ifdef ENABLE_CHANNEL_SCORER
- else if(!strcmp(arg, "channel"))
- return ChannelAdaptationPhraseScorer::create_scorer(argv, argp, reverse, *this);
-#endif
- else if(!strcmp(arg, "const"))
- return ConstantPhraseScorer::create_scorer(argv, argp, reverse, *this);
- else if (!strcmp(arg, "lexdecomp"))
- return LexicalDecompositionPhraseScorer::create_scorer(argv, argp, reverse, *this);
- else {
- std::cerr << "Unknown phrase scorer type: " << arg << std::endl << std::endl;
- usage();
- }
-}
-
-PhraseScorer *MLPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- return new MLPhraseScorer(ptf.get_phrase_table(), reverse);
-}
-
-#if 1
-void MLPhraseScorer::do_score_phrases() {}
-#else
-void MLPhraseScorer::do_score_phrases()
-{
- Score bla = 0;
- Timestamp t_it;
- for(Count i = 0; i < 200; i++) {
- for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
- PhrasePairInfo ppair = *it;
- Phrase tgt = ppair.get_tgt();
- bla += bla * ppair.get_count() / phrase_table_.get_tgt_phrase(tgt).get_count();
- }
- }
- std::cerr << "Time for 200 iterations with ML estimation: " << (t_it.elapsed_time() / 1000) << " ms" << std::endl;
- std::cerr << bla << std::endl;
-}
-#endif
-
-Score MLPhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
- return static_cast<Score>(it->get_count()) / tgt_phrase.get_count();
-}
-
-PhraseScorer *WittenBellPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- return new WittenBellPhraseScorer(ptf.get_phrase_table(), reverse);
-}
-
-Score WittenBellPhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
- return static_cast<Score>(it->get_count()) / (tgt_phrase.get_count() + tgt_phrase.get_distinct());
-}
-
-
-PhraseScorer *AbsoluteDiscountPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- return new AbsoluteDiscountPhraseScorer(ptf.get_phrase_table(), reverse);
-}
-// p(s|t) = (c(s,t) - beta) / c(t) <-- absolute discounting
-
-void AbsoluteDiscountPhraseScorer::do_score_phrases()
-{
- Count n1 = 0, n2 = 0;
-
- for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
- PhrasePairInfo ppinfo = *it;
- Count c = ppinfo.get_count();
- switch(c) {
- case 1:
- n1++;
- break;
- case 2:
- n2++;
- }
- }
-
- discount_ = static_cast<Score>(n1) / (n1 + 2*n2);
-}
-
-inline Score AbsoluteDiscountPhraseScorer::get_discount()
-{
- return discount_;
-}
-
-Score AbsoluteDiscountPhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
- return (it->get_count() - discount_) / tgt_phrase.get_count();
-}
-
-PhraseScorer *KNDiscount1PhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- return new KNDiscount1PhraseScorer(ptf.get_phrase_table(), reverse);
-}
-
-
-void KNDiscount1PhraseScorer::do_score_phrases()
-{
- Count n1 = 0, n2 = 0;
- Count total_count = 0;
-
- for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
- PhrasePairInfo ppinfo = *it;
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
- total_count += tgt_phrase.get_count();
-
- Count c = ppinfo.get_count();
- switch(c) {
- case 1:
- n1++;
- break;
- case 2:
- n2++;
- }
- }
-
- discount_ = static_cast<Score>(n1) / (n1 + 2*n2);
- total_count_ = static_cast<Count>(total_count);
-
-}
-
-Score KNDiscount1PhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
- PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());
- return ((it->get_count() - discount_) / tgt_phrase.get_count()) + (discount_ * tgt_phrase.get_distinct() / tgt_phrase.get_count())*(src_phrase.get_count() / total_count_);
-}
-
-PhraseScorer *KNDiscount3PhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- return new KNDiscount3PhraseScorer(ptf.get_phrase_table(), reverse);
-}
-
-
-void KNDiscount3PhraseScorer::do_score_phrases()
-{
- Count n1 = 0, n2 = 0, n3 = 0, n4 = 0;
- Count total_count = 0; //total number of source or target phrases (including repetitions)
- Count total_distinct_n1 = 0; //sum_{s} n1plus(s,*)
- Count total_distinct_n2 = 0;
- Count total_distinct_n3plus = 0;
- Score y;
-
- for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
- PhrasePairInfo ppinfo = *it;
- PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
-
- total_count += src_phrase.get_count();
-
- Count c = ppinfo.get_count();
- switch(c) {
- case 1:
- n1++;
- tgt_phrase.inc_n1();
- src_phrase.inc_n1();
- total_distinct_n1++;
- break;
- case 2:
- n2++;
- tgt_phrase.inc_n2();
- src_phrase.inc_n2();
- total_distinct_n2++;
- break;
- case 3:
- n3++;
- tgt_phrase.inc_n3plus();
- src_phrase.inc_n3plus();
- total_distinct_n3plus++;
- break;
- case 4:
- n4++;
- tgt_phrase.inc_n3plus();
- src_phrase.inc_n3plus();
- total_distinct_n3plus++;
- }
-
- }
-
- y = (Score)(n1) / (n1 + 2*n2);
- discount1_ = static_cast<Score> (1) - (2)*(y)*(n2 / n1);
- discount2_ = static_cast<Score> (2) - (3)*(y)*(n3 / n2);
- discount3plus_ = static_cast<Score> (3) - (4)*(y)*(n4 / n3);
- total_distinct_n1_ = static_cast<Count>(total_distinct_n1);
- total_distinct_n2_ = static_cast<Count>(total_distinct_n2);
- total_distinct_n3plus_ = static_cast<Count>(total_distinct_n3plus);
-}
-
-Score KNDiscount3PhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- PhrasePairInfo ppinfo = *it;
- PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
- PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());
-
- Score norm = (discount1_ * tgt_phrase.get_n1() + discount2_ * tgt_phrase.get_n2() + discount3plus_ * tgt_phrase.get_n3plus()) / tgt_phrase.get_count();
- Count c = ppinfo.get_count();
- switch(c) {
- case 1:
- return ((it->get_count() - discount1_) / tgt_phrase.get_count()) + \
- norm*(src_phrase.get_n1() / total_distinct_n1_);
- break;
- case 2:
- return ((it->get_count() - discount2_) / tgt_phrase.get_count()) + \
- norm*(src_phrase.get_n2() / total_distinct_n2_);
- break;
- default:
- return ((it->get_count() - discount3plus_) / tgt_phrase.get_count()) + \
- norm*(src_phrase.get_n3plus() / total_distinct_n3plus_);
- }
-}
-
-PhraseScorer *LexicalWeightPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- bool overall_max = true;
-
- if(argv[argp] == NULL)
- usage();
-
- if(!strcmp(argv[argp], "-AlignmentCount")) {
- overall_max = false;
- argp++;
- if(argv[argp] == NULL)
- usage();
- }
-
- String lwfile(argv[argp++]);
- return new LexicalWeightPhraseScorer(ptf.get_phrase_table(), reverse, lwfile, overall_max);
-}
-
-LexicalWeightPhraseScorer::LexicalWeightPhraseScorer(PhraseTable &pd, bool reverse, const String &weightfile, bool overall_max) :
- PhraseScorer(pd, reverse), overall_max_score_(overall_max), null_word_(PhraseText::index_word("NULL"))
-{
- std::ifstream wfile(weightfile.c_str());
-
- while(!wfile.eof()) {
- if(wfile.fail()) {
- std::cerr << "Problem reading file: " << weightfile << std::endl;
- exit(1);
- }
-
- String src, tgt;
- Score weight;
-
- wfile >> src >> tgt >> weight;
- Count src_id = PhraseText::index_word(src);
- Count tgt_id = PhraseText::index_word(tgt);
- weight_map_.insert(std::make_pair(std::make_pair(src_id, tgt_id), weight));
- }
-
- wfile.close();
-}
-
-Score LexicalWeightPhraseScorer::get_weight(const String &s_src, const String &s_tgt) const
-{
- Count src = PhraseText::index_word(s_src);
- Count tgt = PhraseText::index_word(s_tgt);
- return get_weight(src, tgt);
-}
-
-inline Score LexicalWeightPhraseScorer::get_weight(Count src, Count tgt) const
-{
- WeightMapType_::const_iterator it = weight_map_.find(std::make_pair(src, tgt));
- if(it == weight_map_.end())
- return 0.00001; // default value copied from Philipp Koehn's scorer
- return it->second;
-}
-
-#if 1
-void LexicalWeightPhraseScorer::do_score_phrases() {}
-#else
-void LexicalWeightPhraseScorer::do_score_phrases()
-{
- Score bla = 0;
- Timestamp t_it;
- for(Count i = 0; i < 200; i++) {
- for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
- PhrasePairInfo ppair = *it;
- Phrase src = ppair.get_src();
- Phrase tgt = ppair.get_tgt();
- bla += bla * get_score(src, tgt);
- }
- }
- std::cerr << "Time for 200 iterations with lexical weights: " << (t_it.elapsed_time() / 1000) << " ms" << std::endl;
- std::cerr << bla << std::endl;
-}
-#endif
-
-Score LexicalWeightPhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- const Phrase src = it->get_src();
- const Phrase tgt = it->get_tgt();
- const PhraseText &src_phrase = phrase_table_.get_src_phrase(src).get_phrase();
- const PhraseText &tgt_phrase = phrase_table_.get_tgt_phrase(tgt).get_phrase();
- const PhrasePairInfo &ppair = *it;
-
- Count max_count = 0;
-
- Score maxlex = 0;
- PhrasePairInfo::AlignmentVector av = ppair.get_alignments();
- for(PhrasePairInfo::AlignmentVector::const_iterator it = av.begin(); it != av.end(); ++it) {
- const PhraseAlignment &alig = it->first;
- const Count alig_cnt = it->second;
-
- assert(alig.get_source_length() == src_phrase.size() && alig.get_target_length() == tgt_phrase.size());
-
- if(!overall_max_score_ && alig_cnt < max_count)
- continue;
- max_count = alig_cnt;
-
- Score lex = 1;
- for(Count s = 0; s < src_phrase.size(); s++) {
- Score factor = 0;
- Count na = 0;
- for(Count t = 0; t < tgt_phrase.size(); t++)
- if(alig.is_aligned(s, t)) {
- const Score w = get_weight(src_phrase[s], tgt_phrase[t]);
- factor += w;
- na++;
- }
-
- if(na > 0)
- lex *= factor / na;
- else
- lex *= get_weight(src_phrase[s], null_word_);
- }
-
- if(lex > maxlex)
- maxlex = lex;
- }
-
- return maxlex;
-}
-
-PhraseScorer *ConstantPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
-{
- if(argv[argp] == NULL)
- usage();
- Score c = atof(argv[argp++]);
- return new ConstantPhraseScorer(ptf.get_phrase_table(), reverse, c);
-}
-
-Score ConstantPhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
-{
- return constant_;
-}
-