From 4d189eb14ddb92fec870026df4173a15d72ea800 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 30 Nov 2011 00:27:57 +0900 Subject: Fix a typedef for comparing N-grams. The declared const_iterator was not actually *const*. --- mert/BleuScorer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 7e0e18b53..a10b09a7a 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -56,7 +56,7 @@ private: typedef map,int,CompareNgrams> counts_t; typedef map,int,CompareNgrams>::iterator counts_iterator; - typedef map,int,CompareNgrams>::iterator counts_const_iterator; + typedef map,int,CompareNgrams>::const_iterator counts_const_iterator; typedef ScopedVector refcounts_t; /** -- cgit v1.2.3 From 30fa97e404d773a6359c1063405cbcab2db20f6a Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 1 Feb 2012 20:24:48 +0900 Subject: Move reference length type into a private member of BleuScorer. The reason is that the type is used for internal purposes only. 
--- mert/BleuScorer.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index a10b09a7a..f4d568639 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -12,9 +12,6 @@ using namespace std; -enum BleuReferenceLengthStrategy { BLEU_AVERAGE, BLEU_SHORTEST, BLEU_CLOSEST }; - - /** * Bleu scoring */ @@ -33,6 +30,12 @@ public: } private: + enum ReferenceLengthType { + AVERAGE, + SHORTEST, + CLOSEST, + }; + //Used to construct the ngram map struct CompareNgrams { bool operator()(const vector& a, const vector& b) const { @@ -67,7 +70,7 @@ private: void dump_counts(counts_t& counts) const; const int kLENGTH; - BleuReferenceLengthStrategy _refLengthStrategy; + ReferenceLengthType m_ref_length_type; // data extracted from reference files refcounts_t _refcounts; -- cgit v1.2.3 From b19e7777ce82bdadee894ae7989b171764172c81 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 1 Feb 2012 20:54:20 +0900 Subject: Add prefix 'm_' to private and protected members in Scorer classes. --- mert/BleuScorer.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index f4d568639..5f105add2 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -60,7 +60,6 @@ private: typedef map,int,CompareNgrams> counts_t; typedef map,int,CompareNgrams>::iterator counts_iterator; typedef map,int,CompareNgrams>::const_iterator counts_const_iterator; - typedef ScopedVector refcounts_t; /** * Count the ngrams of each type, up to the given length in the input line. 
@@ -73,8 +72,8 @@ private: ReferenceLengthType m_ref_length_type; // data extracted from reference files - refcounts_t _refcounts; - vector > _reflengths; + ScopedVector m_ref_counts; + vector > m_ref_lengths; // no copying allowed BleuScorer(const BleuScorer&); -- cgit v1.2.3 From 47ac8a474d2746d59615fff0e42819295d6fa195 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Mon, 20 Feb 2012 09:46:08 +0900 Subject: Change the naming conventions for the guard macros; Rename TER directory. This change might be useful to avoid duplicating the names. The reason is that although MERT programs are standalone applications, some header files such as data.h and point.h have common guard macro names like "DATA_H" and "POINT_H", and this is not good naming conventions when you want to include external headers. Some files actually include headers in Moses and KenLM's util. --- mert/BleuScorer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 5f105add2..9875b9c52 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -1,5 +1,5 @@ -#ifndef __BLEUSCORER_H__ -#define __BLEUSCORER_H__ +#ifndef MERT_BLEU_SCORER_H_ +#define MERT_BLEU_SCORER_H_ #include #include @@ -80,4 +80,4 @@ private: BleuScorer& operator=(const BleuScorer&); }; -#endif // __BLEUSCORER_H__ +#endif // MERT_BLEU_SCORER_H_ -- cgit v1.2.3 From 0c9023abc6c231e96cd11cc15502f0e1353fec2a Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sat, 25 Feb 2012 18:14:00 +0900 Subject: Clean up commented out code snippets for debugging purposes. 
--- mert/BleuScorer.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 9875b9c52..68c485684 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -24,16 +24,13 @@ public: virtual void setReferenceFiles(const vector& referenceFiles); virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); virtual float calculateScore(const vector& comps) const; - - virtual size_t NumberOfScores() const { - return 2 * kLENGTH + 1; - } + virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; } private: enum ReferenceLengthType { AVERAGE, SHORTEST, - CLOSEST, + CLOSEST }; //Used to construct the ngram map @@ -44,16 +41,14 @@ private: const size_t bs = b.size(); for (i = 0; i < as && i < bs; ++i) { if (a[i] < b[i]) { - //cerr << "true" << endl; return true; } if (a[i] > b[i]) { - //cerr << "false" << endl; return false; } } //entries are equal, shortest wins - return as < bs;; + return as < bs; } }; -- cgit v1.2.3 From 17f06a32501289daf11181e24454a2d3f6635ba0 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sun, 26 Feb 2012 01:11:56 +0900 Subject: Hide the implementation details of Ngram counts from the header. 
--- mert/BleuScorer.h | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 68c485684..4166d30e7 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -33,41 +33,25 @@ private: CLOSEST }; - //Used to construct the ngram map - struct CompareNgrams { - bool operator()(const vector& a, const vector& b) const { - size_t i; - const size_t as = a.size(); - const size_t bs = b.size(); - for (i = 0; i < as && i < bs; ++i) { - if (a[i] < b[i]) { - return true; - } - if (a[i] > b[i]) { - return false; - } - } - //entries are equal, shortest wins - return as < bs; - } - }; - - typedef map,int,CompareNgrams> counts_t; - typedef map,int,CompareNgrams>::iterator counts_iterator; - typedef map,int,CompareNgrams>::const_iterator counts_const_iterator; + /** + * A NgramCounts is a key-value store. + * Clients don't have to worry about the actual implementation + * since this type is used in internal only. + */ + class NgramCounts; /** * Count the ngrams of each type, up to the given length in the input line. */ - size_t countNgrams(const string& line, counts_t& counts, unsigned int n); + size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n); - void dump_counts(counts_t& counts) const; + void dump_counts(const NgramCounts& counts) const; const int kLENGTH; ReferenceLengthType m_ref_length_type; // data extracted from reference files - ScopedVector m_ref_counts; + ScopedVector m_ref_counts; vector > m_ref_lengths; // no copying allowed -- cgit v1.2.3 From 8e0a61d0d731bc1e2a4f427d48a5a26ff8f7c923 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sun, 26 Feb 2012 01:54:51 +0900 Subject: Clean up the calculation of the effective reference length. 
--- mert/BleuScorer.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 4166d30e7..6c1660fab 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -47,6 +47,14 @@ private: void dump_counts(const NgramCounts& counts) const; + // For calculating effective reference length. + void CalcAverage(size_t sentence_id, + vector& stats) const; + void CalcClosest(size_t sentence_id, size_t length, + vector& stats) const; + void CalcShortest(size_t sentence_id, + vector& stats) const; + const int kLENGTH; ReferenceLengthType m_ref_length_type; -- cgit v1.2.3 From 669b9d9c7aab3553df561b4b6e1f1328669d5ef2 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sun, 26 Feb 2012 02:01:03 +0900 Subject: Minor change to the logging utility for n-gram counts. Use std::ostream instead of directly using std::cerr. --- mert/BleuScorer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 6c1660fab..7ae19fa5f 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -1,7 +1,7 @@ #ifndef MERT_BLEU_SCORER_H_ #define MERT_BLEU_SCORER_H_ -#include +#include #include #include @@ -45,7 +45,7 @@ private: */ size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n); - void dump_counts(const NgramCounts& counts) const; + void dump_counts(std::ostream* os, const NgramCounts& counts) const; // For calculating effective reference length. void CalcAverage(size_t sentence_id, -- cgit v1.2.3 From ed6e6f00b1b73a99d9177b984835af063ccf690f Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sat, 10 Mar 2012 02:49:31 +0900 Subject: Minor change for calculating BLEU. This avoids defining similar variables twice when calculating document-wise BLEU and sentence-wise BLEU scores. 
--- mert/BleuScorer.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 7ae19fa5f..f3513e135 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -12,6 +12,8 @@ using namespace std; +const int kBleuNgramOrder = 4; + /** * Bleu scoring */ @@ -24,7 +26,7 @@ public: virtual void setReferenceFiles(const vector& referenceFiles); virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); virtual float calculateScore(const vector& comps) const; - virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; } + virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; } private: enum ReferenceLengthType { @@ -55,7 +57,6 @@ private: void CalcShortest(size_t sentence_id, vector& stats) const; - const int kLENGTH; ReferenceLengthType m_ref_length_type; // data extracted from reference files -- cgit v1.2.3 From fba01c7cdfb10f8283cff44d78b40abb7ad61cc0 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 14 Mar 2012 22:14:11 +0900 Subject: Create a header file for NgramCounts class. The reason is that we want to add the unit test. --- mert/BleuScorer.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index f3513e135..c35d4ad1d 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -14,6 +14,8 @@ using namespace std; const int kBleuNgramOrder = 4; +class NgramCounts; + /** * Bleu scoring */ @@ -35,13 +37,6 @@ private: CLOSEST }; - /** - * A NgramCounts is a key-value store. - * Clients don't have to worry about the actual implementation - * since this type is used in internal only. - */ - class NgramCounts; - /** * Count the ngrams of each type, up to the given length in the input line. 
*/ -- cgit v1.2.3 From 6b95a19eda818fb772767a0037c70a7bbb6c32e5 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sun, 18 Mar 2012 05:58:40 +0900 Subject: Create Reference class to clean up BleuScorer. - Add a unit test for Reference. - Move functions to calculate the reference length from BleuScorer to Reference. --- mert/BleuScorer.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index c35d4ad1d..d58277a41 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -15,6 +15,7 @@ using namespace std; const int kBleuNgramOrder = 4; class NgramCounts; +class Reference; /** * Bleu scoring @@ -30,6 +31,8 @@ public: virtual float calculateScore(const vector& comps) const; virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; } + int CalcReferenceLength(size_t sentence_id, size_t length); + private: enum ReferenceLengthType { AVERAGE, @@ -44,19 +47,10 @@ private: void dump_counts(std::ostream* os, const NgramCounts& counts) const; - // For calculating effective reference length. - void CalcAverage(size_t sentence_id, - vector& stats) const; - void CalcClosest(size_t sentence_id, size_t length, - vector& stats) const; - void CalcShortest(size_t sentence_id, - vector& stats) const; - ReferenceLengthType m_ref_length_type; - // data extracted from reference files - ScopedVector m_ref_counts; - vector > m_ref_lengths; + // reference translations. + ScopedVector m_references; // no copying allowed BleuScorer(const BleuScorer&); -- cgit v1.2.3 From f686e8771a5db09e32474ed0735dbdef275158d3 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Mon, 19 Mar 2012 22:45:15 +0900 Subject: Add some functions to BleuScorer for unit testing. This commit also includes - Fix typo. - Fix indentation. - Add 'const' to Scorer::applyFactors(). 
--- mert/BleuScorer.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index d58277a41..2f2c2a153 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -23,6 +23,12 @@ class Reference; class BleuScorer: public StatisticsBasedScorer { public: + enum ReferenceLengthType { + AVERAGE, + CLOSEST, + SHORTEST + }; + explicit BleuScorer(const string& config = ""); ~BleuScorer(); @@ -33,20 +39,19 @@ public: int CalcReferenceLength(size_t sentence_id, size_t length); -private: - enum ReferenceLengthType { - AVERAGE, - SHORTEST, - CLOSEST - }; + ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; } + void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; } + + const std::vector& GetReferences() const { return m_references.get(); } /** * Count the ngrams of each type, up to the given length in the input line. */ - size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n); + size_t CountNgrams(const string& line, NgramCounts& counts, unsigned int n); - void dump_counts(std::ostream* os, const NgramCounts& counts) const; + void DumpCounts(std::ostream* os, const NgramCounts& counts) const; +private: ReferenceLengthType m_ref_length_type; // reference translations. -- cgit v1.2.3 From eaa0ab486acfe50197cfd9efd79850a09a90794f Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 4 Apr 2012 22:33:30 +0900 Subject: Add a test case for BLEU's clipped counts. - Make BleuScorer::setReferenceFiles() more testable by adding OpenReference() and OpenReferenceStream(). 
--- mert/BleuScorer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 2f2c2a153..5fce47ad4 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -51,6 +51,11 @@ public: void DumpCounts(std::ostream* os, const NgramCounts& counts) const; + bool OpenReference(const char* filename, size_t file_id); + + // NOTE: this function is used for unit testing. + bool OpenReferenceStream(std::istream* is, size_t file_id); + private: ReferenceLengthType m_ref_length_type; -- cgit v1.2.3 From d034eeb703bd56776d0e83675ab4d105cda0e340 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sat, 7 Apr 2012 01:02:32 +0900 Subject: Add test cases for BLEU and sentence-level BLEU+1. - Move a definition of sentenceLevelBleuPlusOne() from pro.cpp to BleuScorer.cpp. - Add check for the length of an input vector. --- mert/BleuScorer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 5fce47ad4..b6503ba9b 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -67,4 +67,9 @@ private: BleuScorer& operator=(const BleuScorer&); }; +/** Computes sentence-level BLEU+1 score. + * This function is used in PRO. + */ +float sentenceLevelBleuPlusOne(const vector& stats); + #endif // MERT_BLEU_SCORER_H_ -- cgit v1.2.3 From 9c9d88a78a13d7bd5ad4f10b06c355519e4e41ad Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Thu, 10 May 2012 07:51:05 +0900 Subject: Avoid "using namespace std" in headers. 
--- mert/BleuScorer.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index b6503ba9b..1f568f744 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -10,8 +10,6 @@ #include "Scorer.h" #include "ScopedVector.h" -using namespace std; - const int kBleuNgramOrder = 4; class NgramCounts; @@ -29,15 +27,15 @@ public: SHORTEST }; - explicit BleuScorer(const string& config = ""); + explicit BleuScorer(const std::string& config = ""); ~BleuScorer(); - virtual void setReferenceFiles(const vector& referenceFiles); - virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); - virtual float calculateScore(const vector& comps) const; - virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; } + virtual void setReferenceFiles(const std::vector& referenceFiles); + virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); + virtual float calculateScore(const std::vector& comps) const; + virtual std::size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; } - int CalcReferenceLength(size_t sentence_id, size_t length); + int CalcReferenceLength(std::size_t sentence_id, std::size_t length); ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; } void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; } @@ -47,14 +45,14 @@ public: /** * Count the ngrams of each type, up to the given length in the input line. */ - size_t CountNgrams(const string& line, NgramCounts& counts, unsigned int n); + std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n); void DumpCounts(std::ostream* os, const NgramCounts& counts) const; - bool OpenReference(const char* filename, size_t file_id); + bool OpenReference(const char* filename, std::size_t file_id); // NOTE: this function is used for unit testing. 
- bool OpenReferenceStream(std::istream* is, size_t file_id); + bool OpenReferenceStream(std::istream* is, std::size_t file_id); private: ReferenceLengthType m_ref_length_type; @@ -70,6 +68,6 @@ private: /** Computes sentence-level BLEU+1 score. * This function is used in PRO. */ -float sentenceLevelBleuPlusOne(const vector& stats); +float sentenceLevelBleuPlusOne(const std::vector& stats); #endif // MERT_BLEU_SCORER_H_ -- cgit v1.2.3 From fd577d7a65cab923b9102d61873a032654d573a1 Mon Sep 17 00:00:00 2001 From: Colin Cherry Date: Tue, 29 May 2012 13:38:57 -0400 Subject: Batch k-best MIRA is written and integrated into mert-moses.pl Regression tests all check out, and kbmira seems to work fine on a Hansard French->English task. HypPackEnumerator class may be of interest to pro.cpp and future optimizers, as it abstracts a lot of the boilerplate involved in enumerating multiple k-best lists. MiraWeightVector is not really mira-specific - just a weight vector that enables efficient averaging. Could be useful to a perceptron as well. Same goes for MiraFeatureVector. Interaction with sparse features is written, but untested. --- mert/BleuScorer.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mert/BleuScorer.h') diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 1f568f744..8f1384f5a 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -70,4 +70,14 @@ private: */ float sentenceLevelBleuPlusOne(const std::vector& stats); +/** Computes sentence-level BLEU score given a background corpus. + * This function is used in batch MIRA. + */ +float sentenceLevelBackgroundBleu(const std::vector& sent, const std::vector& bg); + +/** + * Computes plain old BLEU from a vector of stats + */ +float unsmoothedBleu(const std::vector& stats); + #endif // MERT_BLEU_SCORER_H_ -- cgit v1.2.3