github.com/moses-smt/mosesdecoder.git
author     Barry Haddow <barry.haddow@gmail.com>  2011-11-16 18:54:23 +0400
committer  Barry Haddow <barry.haddow@gmail.com>  2011-11-16 18:54:23 +0400
commit     79de3c8699153aa759f584825f4f5a75e3f9c08c (patch)
tree       9e1f4b862ef7190b51093a5d51a6798e071d100e /mert
parent     0a2e0f44a6d5fa2755b6f3894a55aa608272987d (diff)
Complete initial version of pro extractor
Diffstat (limited to 'mert')
-rw-r--r--  mert/Data.cpp                 1
-rw-r--r--  mert/ScoreDataIterator.cpp    1
-rw-r--r--  mert/pro.cpp                186
3 files changed, 163 insertions, 25 deletions
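
The new mert/pro.cpp below implements the sampling step of PRO (pairwise ranking optimization, Hopkins & May, EMNLP 2011). For each sentence it pools the hypotheses from all supplied feature/score files, draws n_candidates = 5000 random hypothesis pairs (Gamma in Hopkins & May), discards pairs whose smoothed sentence-level BLEU scores differ by less than min_diff = 0.05, keeps at most n_samples = 50 pairs with the largest differences (Xi), and writes each kept pair out as two binary-classification examples built from feature differences. The smoothing in sentenceLevelBleuPlusOne is the add-one ("BLEU+1") variant of sentence-level BLEU; assuming the score-statistics layout implied by the indexing in the code (stats[2j] / stats[2j+1] are the matched / total n-gram counts for order j+1, stats[8] the reference length r, stats[1] the hypothesis length c), it computes

    \log \mathrm{BLEU{+}1} = \frac{1}{4} \sum_{n=1}^{4} \log \frac{m_n + 1}{t_n + 1} + \min\left(0,\ 1 - \frac{r}{c}\right)

i.e. the geometric mean of the add-one-smoothed n-gram precisions up to order 4, combined with the usual brevity penalty.
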
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 0acfbeac3..1fae0080b 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -196,6 +196,7 @@ void Data::sampleRankedPairs( const std::string &rankedpairfile ) {
unsigned int translation2 = rand() % n_translations;
float bleu2 = sentenceLevelBleuPlusOne(scoredata->get(S,translation2));
+ cerr << "Sampled " << translation1 << " " << translation2 << endl;
if (abs(bleu1-bleu2) < min_diff)
continue;
diff --git a/mert/ScoreDataIterator.cpp b/mert/ScoreDataIterator.cpp
index c062cc52d..4cac63c54 100644
--- a/mert/ScoreDataIterator.cpp
+++ b/mert/ScoreDataIterator.cpp
@@ -46,7 +46,6 @@ void ScoreDataIterator::readNext() {
m_in->ReadLine(); //ignore rest of line
for (size_t i = 0; i < count; ++i) {
StringPiece line = m_in->ReadLine();
- cerr << line << endl;
m_next.push_back(ScoreDataItem());
for (TokenIter<AnyCharacter, true> token(line,AnyCharacter(" \t")); token; ++token) {
float value = ParseFloat(*token);
diff --git a/mert/pro.cpp b/mert/pro.cpp
index 0a716abd8..a9bd8c134 100644
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@@ -27,6 +27,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* For details of PRO, refer to Hopkins & May (EMNLP 2011)
**/
+#include <cmath>
+#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
@@ -42,12 +44,74 @@ using namespace std;
namespace po = boost::program_options;
+class SampledPair {
+private:
+ pair<size_t,size_t> translation1;
+ pair<size_t,size_t> translation2;
+ float scoreDiff;
+public:
+ SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
+ if (diff > 0) {
+ translation1 = t1;
+ translation2 = t2;
+ scoreDiff = diff;
+ }
+ else {
+ translation1 = t2;
+ translation2 = t1;
+ scoreDiff = -diff;
+ }
+ }
+ float getDiff() const { return scoreDiff; }
+ const pair<size_t,size_t>& getTranslation1() const { return translation1; }
+ const pair<size_t,size_t>& getTranslation2() const { return translation2; }
+};
+
+
+static float sentenceLevelBleuPlusOne(const vector<float>& stats) {
+ float logbleu = 0.0;
+ const unsigned int bleu_order = 4;
+ for (unsigned int j=0; j<bleu_order; j++) {
+ //cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " ";
+ logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
+ }
+ logbleu /= bleu_order;
+ float brevity = 1.0 - (float)stats[(bleu_order*2)]/stats[1];
+ if (brevity < 0.0) {
+ logbleu += brevity;
+ }
+ //cerr << brevity << " -> " << exp(logbleu) << endl;
+ return exp(logbleu);
+}
+
+static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
+ // difference in score in regular features
+ for(unsigned int j=0; j<f1.dense.size(); j++)
+ if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
+ out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
+
+ if (f1.sparse.size() || f2.sparse.size()) {
+ out << " ";
+
+ // sparse features
+ const SparseVector &s1 = f1.sparse;
+ const SparseVector &s2 = f2.sparse;
+ SparseVector diff = s1 - s2;
+ diff.write(out);
+ }
+}
+
+
int main(int argc, char** argv)
{
bool help;
vector<string> scoreFiles;
vector<string> featureFiles;
int seed;
+ //TODO: options
+ const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
+ const unsigned int n_samples = 50; // Xi, in Hopkins & May
+ const float min_diff = 0.05;
po::options_description desc("Allowed options");
desc.add_options()
@@ -66,7 +130,7 @@ int main(int argc, char** argv)
if (help) {
cout << "Usage: " + string(argv[0]) + " [options]" << endl;
cout << desc << endl;
- return 0;
+ exit(0);
}
if (vm.count("random-seed")) {
@@ -77,33 +141,107 @@ int main(int argc, char** argv)
srand(time(NULL));
}
- FeatureDataIterator fi(featureFiles[0]);
- //cerr << featureFiles[0] << endl;
- for (; fi != FeatureDataIterator::end(); ++fi) {
- const vector<FeatureDataItem>& featureData = *fi;
- cerr << "Read " << featureData.size() << " items " << endl;
- for (size_t i = 0; i < featureData.size(); ++i) {
- cerr << "Dense: ";
- for (size_t j = 0; j < featureData[i].dense.size(); ++j) {
- cerr << featureData[i].dense[j] << " ";
- }
- cerr << "\n";
- }
- cerr << "\n";
+ if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
+ cerr << "No data to process" << endl;
+ exit(0);
+ }
+
+ if (featureFiles.size() != scoreFiles.size()) {
+ cerr << "Error: Number of feature files (" << featureFiles.size() <<
+ ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
+ exit(1);
}
- ScoreDataIterator si(scoreFiles[0]);
- for (; si != ScoreDataIterator::end(); ++si) {
- const vector<ScoreDataItem>& scoreData = *si;
- cerr << "Read " << scoreData.size() << " items " << endl;
- for (size_t i = 0; i < scoreData.size(); ++i) {
- cerr << "SD: ";
- for (size_t j = 0; j < scoreData[i].size(); ++j) {
- cerr << scoreData[i][j] << " ";
+
+ vector<FeatureDataIterator> featureDataIters;
+ vector<ScoreDataIterator> scoreDataIters;
+ for (size_t i = 0; i < featureFiles.size(); ++i) {
+ featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
+ scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
+ }
+
+ //loop through nbest lists
+ size_t sentenceId = 0;
+ while(1) {
+ vector<pair<size_t,size_t> > hypotheses;
+ //TODO: de-duping. Collect hashes of score, feature pairs and
+ //only add index if it's unique.
+ if (featureDataIters[0] == FeatureDataIterator::end()) {
+ break;
+ }
+ for (size_t i = 0; i < featureFiles.size(); ++i) {
+ if (featureDataIters[i] == FeatureDataIterator::end()) {
+ cerr << "Error: Feature file " << i << " ended prematurely" << endl;
+ exit(1);
+ }
+ if (scoreDataIters[i] == ScoreDataIterator::end()) {
+ cerr << "Error: Score file " << i << " ended prematurely" << endl;
+ exit(1);
}
- cerr << "\n";
+ if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
+ cerr << "Error: For sentence " << sentenceId << " features and scores have different size" << endl;
+ exit(1);
+ }
+ for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
+ hypotheses.push_back(pair<size_t,size_t>(i,j));
+ }
+ }
+
+ //collect the candidates
+ vector<SampledPair> samples;
+ vector<float> scores;
+ size_t n_translations = hypotheses.size();
+ for(size_t i=0; i<n_candidates; i++) {
+ size_t rand1 = rand() % n_translations;
+ pair<size_t,size_t> translation1 = hypotheses[rand1];
+ float bleu1 = sentenceLevelBleuPlusOne(scoreDataIters[translation1.first]->operator[](translation1.second));
+
+ size_t rand2 = rand() % n_translations;
+ pair<size_t,size_t> translation2 = hypotheses[rand2];
+ float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
+ cerr << "Sampled " << translation1.second<< " " << translation2.second << endl;
+
+ /*
+ cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<
+ " t(" << translation2.first << "," << translation2.second << ") = " <<
+ bleu2 << " diff = " << abs(bleu1-bleu2) << endl;
+ */
+ if (abs(bleu1-bleu2) < min_diff)
+ continue;
+
+ samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2));
+ scores.push_back(1.0-abs(bleu1-bleu2));
+ }
+
+ float sample_threshold = -1.0;
+ if (samples.size() > n_samples) {
+ nth_element(scores.begin(), scores.begin() + (n_samples-1), scores.end());
+ sample_threshold = 0.99999-scores[n_samples-1];
+ }
+
+ size_t collected = 0;
+ for (size_t i = 0; collected < n_samples && i < samples.size(); ++i) {
+ if (samples[i].getDiff() < sample_threshold) continue;
+ ++collected;
+ size_t file_id1 = samples[i].getTranslation1().first;
+ size_t hypo_id1 = samples[i].getTranslation1().second;
+ size_t file_id2 = samples[i].getTranslation2().first;
+ size_t hypo_id2 = samples[i].getTranslation2().second;
+ cout << "1";
+ outputSample(cout, featureDataIters[file_id1]->operator[](hypo_id1),
+ featureDataIters[file_id2]->operator[](hypo_id2));
+ cout << endl;
+ cout << "0";
+ outputSample(cout, featureDataIters[file_id2]->operator[](hypo_id2),
+ featureDataIters[file_id1]->operator[](hypo_id1));
+ cout << endl;
+ }
+ //advance all iterators
+ for (size_t i = 0; i < featureFiles.size(); ++i) {
+ ++featureDataIters[i];
+ ++scoreDataIters[i];
}
- cerr << "\n";
+ ++sentenceId;
}
}
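
Output format, as produced by outputSample in the main loop above: each kept pair yields two lines on stdout, one labelled 1 carrying the feature differences of the higher-BLEU hypothesis minus the lower-BLEU one, and a mirrored line labelled 0 with the signs flipped. Dense features are printed as " F<index> <difference>" and only where the two hypotheses differ by more than 1e-5 in that component; sparse features, if present, follow as the difference vector written by SparseVector::write. With purely hypothetical values for dense features 0 and 3, a kept pair would look roughly like

    1 F0 0.217 F3 -1.5
    0 F0 -0.217 F3 1.5
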