github.com/moses-smt/mosesdecoder.git
author      Barry Haddow <barry.haddow@gmail.com>    2011-11-23 17:34:55 +0400
committer   Barry Haddow <barry.haddow@gmail.com>    2011-11-23 17:34:55 +0400
commit      d4589bef35e9f705d52b67d7239dcc4e0617bec4 (patch)
tree        80fa48610e801e7f2840e03d3a8fc9c73026f501 /scripts
parent      1248bdfdbb5ed04fb69887652307c3519f6eecc8 (diff)
Not needed - mbr is in the *-cmd directories.
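The removed tool performed Minimum Bayes Risk (MBR) rescoring of an n-best list, functionality that the decoder front-ends in the *-cmd directories (for example moses-cmd) already provide, so the standalone copy under scripts/training/mbr is redundant. As a rough reference, this is the criterion the deleted mbr.cpp minimises, reconstructed from its calculate_probability() and process() functions; the notation below (N for the n-best list, h_i for feature values, w_i for weights, BLEU(e'; e) scoring e' against e as pseudo-reference) is mine, not from the source:

    \hat{e} = \operatorname*{arg\,min}_{e' \in N} \sum_{e \in N,\; e \neq e'} \bigl(1 - \mathrm{BLEU}(e';\, e)\bigr)\, \frac{P(e \mid f)}{\sum_{e'' \in N} P(e'' \mid f)},
    \qquad
    P(e \mid f) \propto \exp\Bigl(\mathrm{SCALE} \cdot \textstyle\sum_i w_i\, h_i(e, f)\Bigr)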
Diffstat (limited to 'scripts')
-rw-r--r--  scripts/Makefile               |   2
-rw-r--r--  scripts/released-files         |   1
-rwxr-xr-x  scripts/training/mbr/Makefile  |  14
-rw-r--r--  scripts/training/mbr/mbr.cpp   | 398
4 files changed, 1 insertion, 414 deletions
diff --git a/scripts/Makefile b/scripts/Makefile
index dfa0fd3a1..80460f36a 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -24,7 +24,7 @@ RELEASEDIR=$(TARGETDIR)/scripts-$(TS)
all: compile
-SUBDIRS=training/phrase-extract training/symal training/mbr training/lexical-reordering ems/biconcor
+SUBDIRS=training/phrase-extract training/symal training/lexical-reordering ems/biconcor
SUBDIRS_CLEAN=$(SUBDIRS) training/memscore training/eppex training/compact-rule-table
compile: compile-memscore compile-eppex compile-compact-rule-table compile-extract-ghkm
diff --git a/scripts/released-files b/scripts/released-files
index 5153a4e52..ddd6d3287 100644
--- a/scripts/released-files
+++ b/scripts/released-files
@@ -79,7 +79,6 @@ training/clone_moses_model.pl
training/compact-rule-table/tools/compactify
training/eppex/counter
training/eppex/eppex
-training/mbr/mbr
training/filter-model-given-input.pl
training/filter-rule-table.py
training/lexical-reordering/score
diff --git a/scripts/training/mbr/Makefile b/scripts/training/mbr/Makefile
deleted file mode 100755
index adb0b0e3b..000000000
--- a/scripts/training/mbr/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-CXXFLAGS=-O3
-LDFLAGS=
-LDLIBS=
-
-all: mbr
-
-clean:
- rm -f *.o
-
-mert: $(OBJS)
- $(G++) $(OBJS) $(LDLIBS) -o $@
-
-mert_p: $(OBJS)
- $(G++) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@
diff --git a/scripts/training/mbr/mbr.cpp b/scripts/training/mbr/mbr.cpp
deleted file mode 100644
index 8004620cc..000000000
--- a/scripts/training/mbr/mbr.cpp
+++ /dev/null
@@ -1,398 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <iomanip>
-#include <vector>
-#include <map>
-#include <stdlib.h>
-#include <math.h>
-#include <algorithm>
-#include <stdio.h>
-#include <unistd.h>
-#include <cstring>
-#include <time.h>
-
-using namespace std ;
-
-
-/* Input :
- 1. a sorted n-best list, with duplicates filtered out in the following format
- 0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
-
- 2. a weight vector
- 3. bleu order ( default = 4)
- 4. scaling factor to weigh the weight vector (default = 1.0)
-
- Output :
- translations that minimise the Bayes Risk of the n-best list
-
-
-*/
-
-int TABLE_LINE_MAX_LENGTH = 5000;
-vector<double> weights;
-float SCALE = 1.0;
-int BLEU_ORDER = 4;
-int SMOOTH = 1;
-int DEBUG = 0;
-double min_interval = 1e-4;
-
-#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();}
-
-typedef string WORD;
-typedef unsigned int WORD_ID;
-
-
-map<WORD, WORD_ID> lookup;
-vector< WORD > vocab;
-
-class candidate_t
-{
-public:
- vector<WORD_ID> translation;
- vector<double> features;
- int translation_size;
-} ;
-
-
-void usage(void)
-{
- fprintf(stderr,
- "usage: mbr -s SCALE -n BLEU_ORDER -w weights.txt -i nbest.txt");
-}
-
-
-char *strstrsep(char **stringp, const char *delim)
-{
- char *match, *save;
- save = *stringp;
- if (*stringp == NULL)
- return NULL;
- match = strstr(*stringp, delim);
- if (match == NULL) {
- *stringp = NULL;
- return save;
- }
- *match = '\0';
- *stringp = match + strlen(delim);
- return save;
-}
-
-
-
-vector<string> tokenize( const char input[] )
-{
- vector< string > token;
- bool betweenWords = true;
- int start;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- } else if (isSpace && !betweenWords) {
- token.push_back( string( input+start, i-start ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( string( input+start, i-start+1 ) );
- return token;
-}
-
-
-
-WORD_ID storeIfNew( WORD word )
-{
- if( lookup.find( word ) != lookup.end() )
- return lookup[ word ];
-
- WORD_ID id = vocab.size();
- vocab.push_back( word );
- lookup[ word ] = id;
- return id;
-}
-
-int count( string input, char delim )
-{
- int count = 0;
- for ( int i = 0; i < input.size(); i++) {
- if ( input[i] == delim)
- count++;
- }
- return count;
-}
-
-double calculate_probability(const vector<double> & feats, const vector<double> & weights,double SCALE)
-{
-
- if (feats.size() != weights.size())
- cerr << "ERROR : Number of features <> number of weights " << endl;
-
- double prob = 0;
- for ( int i = 0; i < feats.size(); i++) {
- prob += feats[i]*weights[i]*SCALE;
- }
- return exp(prob);
-}
-
-void extract_ngrams(const vector<WORD_ID>& sentence, map < vector < WORD_ID>, int > & allngrams)
-{
- vector< WORD_ID> ngram;
- for (int k = 0; k< BLEU_ORDER; k++) {
- for(int i =0; i < max((int)sentence.size()-k,0); i++) {
- for ( int j = i; j<= i+k; j++) {
- ngram.push_back(sentence[j]);
- }
- ++allngrams[ngram];
- ngram.clear();
- }
- }
-}
-
-
-double calculate_score(const vector<candidate_t*> & sents, int ref, int hyp, vector < map < vector < WORD_ID>, int > > & ngram_stats )
-{
- int comps_n = 2*BLEU_ORDER+1;
- int comps[comps_n];
- double logbleu = 0.0, brevity;
-
- int hyp_length = sents[hyp]->translation_size;
-
- for (int i =0; i<BLEU_ORDER; i++) {
- comps[2*i] = 0;
- comps[2*i+1] = max(hyp_length-i,0);
- }
-
- map< vector < WORD_ID > ,int > & hyp_ngrams = ngram_stats[hyp] ;
- map< vector < WORD_ID >, int > & ref_ngrams = ngram_stats[ref] ;
-
- for (map< vector< WORD_ID >, int >::iterator it = hyp_ngrams.begin();
- it != hyp_ngrams.end(); it++) {
- map< vector< WORD_ID >, int >::iterator ref_it = ref_ngrams.find(it->first);
- if(ref_it != ref_ngrams.end()) {
- comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
- }
- }
- comps[comps_n-1] = sents[ref]->translation_size;
-
- if (DEBUG) {
- for ( int i = 0; i < comps_n; i++)
- cerr << "Comp " << i << " : " << comps[i];
- }
-
- for (int i=0; i<BLEU_ORDER; i++) {
- if (comps[0] == 0)
- return 0.0;
- if ( i > 0 )
- logbleu += log(static_cast<double>(comps[2*i]+SMOOTH))-log(static_cast<double>(comps[2*i+1]+SMOOTH));
- else
- logbleu += log(static_cast<double>(comps[2*i]))-log(static_cast<double>(comps[2*i+1]));
- }
- logbleu /= BLEU_ORDER;
- brevity = 1.0-(double)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
- if (brevity < 0.0)
- logbleu += brevity;
- return exp(logbleu);
-}
-
-vector<double> read_weights(string fileName)
-{
- ifstream inFile;
- inFile.open(fileName.c_str());
- istream *inFileP = &inFile;
-
- char line[TABLE_LINE_MAX_LENGTH];
- int i=0;
- vector<double> weights;
-
- while(true) {
- i++;
- SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n');
- if (inFileP->eof()) break;
- vector<string> token = tokenize(line);
-
- for (int j = 0; j < token.size(); j++) {
- weights.push_back(atof(token[j].c_str()));
- }
- }
- cerr << endl;
- return weights;
-}
-
-int find_pos_of_min_element(const vector<double>& vec)
-{
-
- int min_pos = -1;
- double min_element = 10000;
- for ( int i = 0; i < vec.size(); i++) {
- if (vec[i] < min_element) {
- min_element = vec[i];
- min_pos = i;
- }
- }
- /* cerr << "Min pos is : " << min_pos << endl;
- cerr << "Min mbr loss is : " << min_element << endl;*/
- return min_pos;
-}
-
-void process(int sent, const vector<candidate_t*> & sents)
-{
-// cerr << "Sentence " << sent << " has " << sents.size() << " candidate translations" << endl;
- double marginal = 0;
-
- vector<double> joint_prob_vec;
- double joint_prob;
- vector< map < vector <WORD_ID>, int > > ngram_stats;
-
- for (int i = 0; i < sents.size(); i++) {
-// cerr << "Sents " << i << " has trans : " << sents[i]->translation << endl;
- //Calculate marginal and cache the posteriors
- joint_prob = calculate_probability(sents[i]->features,weights,SCALE);
- marginal += joint_prob;
- joint_prob_vec.push_back(joint_prob);
- //Cache ngram counts
- map < vector <WORD_ID>, int > counts;
- extract_ngrams(sents[i]->translation,counts);
- ngram_stats.push_back(counts);
- }
- //cerr << "Marginal is " << marginal;
-
- vector<double> mbr_loss;
- double bleu, weightedLoss;
- double weightedLossCumul = 0;
- double minMBRLoss = 1000000;
- int minMBRLossIdx = -1;
-
- /* Main MBR computation done here */
- for (int i = 0; i < sents.size(); i++) {
- weightedLossCumul = 0;
- for (int j = 0; j < sents.size(); j++) {
- if ( i != j) {
- bleu = calculate_score(sents, j, i,ngram_stats );
- weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
- weightedLossCumul += weightedLoss;
- if (weightedLossCumul > minMBRLoss)
- break;
- }
- }
- if (weightedLossCumul < minMBRLoss) {
- minMBRLoss = weightedLossCumul;
- minMBRLossIdx = i;
- }
- }
-// cerr << "Min pos is : " << minMBRLossIdx << endl;
-// cerr << "Min mbr loss is : " << minMBRLoss << endl;
- /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
- vector< WORD_ID > best_translation = sents[minMBRLossIdx]->translation;
- for (int i = 0; i < best_translation.size(); i++)
- cout << vocab[best_translation[i]] << " " ;
- cout << endl;
-}
-
-
-void read_nbest_data(string fileName)
-{
-
- FILE * fp;
- fp = fopen (fileName.c_str() , "r");
-
- static char buf[10000];
- char *rest, *tok;
- int field;
- int sent_i, cur_sent;
- candidate_t *cand = NULL;
- vector<candidate_t*> testsents;
-
- cur_sent = -1;
-
- while (fgets(buf, sizeof(buf), fp) != NULL) {
- field = 0;
- rest = buf;
- while ((tok = strstrsep(&rest, "|||")) != NULL) {
- if (field == 0) {
- sent_i = strtol(tok, NULL, 10);
- cand = new candidate_t;
- } else if (field == 2) {
- vector<double> features;
- char * subtok;
- subtok = strtok (tok," ");
-
- while (subtok != NULL) {
- features.push_back(atof(subtok));
- subtok = strtok (NULL, " ");
- }
- cand->features = features;
- } else if (field == 1) {
- vector<string> trans_str = tokenize(tok);
- vector<WORD_ID> trans_int;
- for (int j=0; j<trans_str.size(); j++) {
- trans_int.push_back( storeIfNew( trans_str[j] ) );
- }
- cand->translation= trans_int;
- cand->translation_size = cand->translation.size();
- } else if (field == 3) {
- continue;
- } else {
- fprintf(stderr, "too many fields in n-best list line\n");
- }
- field++;
- }
- if (sent_i != cur_sent) {
- if (cur_sent != - 1) {
- process(cur_sent,testsents);
- }
- cur_sent = sent_i;
- testsents.clear();
- }
- testsents.push_back(cand);
- }
- process(cur_sent,testsents);
- cerr << endl;
-}
-
-int main(int argc, char **argv)
-{
-
- time_t starttime = time(NULL);
- int c;
-
- string f_weight = "";
- string f_nbest = "";
-
- while ((c = getopt(argc, argv, "s:w:n:i:")) != -1) {
- switch (c) {
- case 's':
- SCALE = atof(optarg);
- break;
- case 'n':
- BLEU_ORDER = atoi(optarg);
- break;
- case 'w':
- f_weight = optarg;
- break;
- case 'i':
- f_nbest = optarg;
- break;
- default:
- usage();
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 2) {
- usage();
- }
-
-
- weights = read_weights(f_weight);
- read_nbest_data(f_nbest);
-
- time_t endtime = time(NULL);
- cerr << "Processed data in" << (endtime-starttime) << " seconds\n";
-}
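
For reference, below is a minimal, self-contained sketch of the selection the deleted mbr.cpp performed: pick from an n-best list the hypothesis with the lowest expected (1 - BLEU) loss under the model posterior. It is an illustration, not the Moses implementation; the identifiers (Candidate, sentenceBleu, mbrSelect) are invented, the n-best list is hard-coded rather than parsed from the "|||"-separated format described in the deleted header comment, and the per-feature dot product with the weight vector is collapsed into a single pre-computed model score per hypothesis.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <map>
#include <sstream>
#include <string>
#include <vector>

struct Candidate {
  std::vector<std::string> tokens;  // tokenised hypothesis
  double modelScore;                // SCALE * (weights . features), log domain
};

// n-gram counts of a token sequence for all orders 1..order.
static std::map<std::vector<std::string>, int>
ngramCounts(const std::vector<std::string>& toks, int order) {
  std::map<std::vector<std::string>, int> counts;
  for (int n = 1; n <= order; ++n)
    for (std::size_t i = 0; i + n <= toks.size(); ++i)
      ++counts[std::vector<std::string>(toks.begin() + i, toks.begin() + i + n)];
  return counts;
}

// Sentence-level BLEU of hyp against a single pseudo-reference ref,
// with +1 smoothing for orders above unigram (as in the deleted code).
static double sentenceBleu(const std::vector<std::string>& hyp,
                           const std::vector<std::string>& ref, int order = 4) {
  const auto hypCounts = ngramCounts(hyp, order);
  const auto refCounts = ngramCounts(ref, order);
  double logBleu = 0.0;
  for (int n = 1; n <= order; ++n) {
    double match = 0.0, total = 0.0;
    for (const auto& kv : hypCounts) {
      if (static_cast<int>(kv.first.size()) != n) continue;
      total += kv.second;
      const auto it = refCounts.find(kv.first);
      if (it != refCounts.end()) match += std::min(kv.second, it->second);
    }
    const double smooth = (n == 1) ? 0.0 : 1.0;  // no smoothing for unigrams
    if (match + smooth <= 0.0 || total + smooth <= 0.0) return 0.0;
    logBleu += std::log(match + smooth) - std::log(total + smooth);
  }
  logBleu /= order;
  // Brevity penalty: penalise hypotheses shorter than the pseudo-reference.
  const double brevity =
      1.0 - static_cast<double>(ref.size()) / std::max<std::size_t>(hyp.size(), 1);
  if (brevity < 0.0) logBleu += brevity;
  return std::exp(logBleu);
}

// Index of the hypothesis with minimum expected (1 - BLEU) loss under the
// posterior implied by the model scores.
static std::size_t mbrSelect(const std::vector<Candidate>& nbest) {
  std::vector<double> posterior(nbest.size());
  double marginal = 0.0;
  for (std::size_t i = 0; i < nbest.size(); ++i)
    marginal += posterior[i] = std::exp(nbest[i].modelScore);

  std::size_t best = 0;
  double bestLoss = std::numeric_limits<double>::max();
  for (std::size_t i = 0; i < nbest.size(); ++i) {
    double loss = 0.0;
    // Stop early once this hypothesis can no longer beat the current best.
    for (std::size_t j = 0; j < nbest.size() && loss < bestLoss; ++j) {
      if (i == j) continue;
      loss += (1.0 - sentenceBleu(nbest[i].tokens, nbest[j].tokens))
              * posterior[j] / marginal;
    }
    if (loss < bestLoss) { bestLoss = loss; best = i; }
  }
  return best;
}

static std::vector<std::string> split(const std::string& s) {
  std::istringstream in(s);
  std::vector<std::string> toks;
  std::string tok;
  while (in >> tok) toks.push_back(tok);
  return toks;
}

int main() {
  // Toy n-best list for one source sentence; the model scores are invented.
  const std::vector<Candidate> nbest = {
    {split("he goes to the house"), -4.5},
    {split("he is going to the house"), -4.7},
    {split("he goes house"), -5.1},
  };
  const std::size_t best = mbrSelect(nbest);
  for (const auto& tok : nbest[best].tokens) std::cout << tok << ' ';
  std::cout << '\n';
  return 0;
}

With any C++11 compiler (for instance g++ -std=c++11) this prints the minimum-risk hypothesis of the toy list. The removed tool ran the same two nested loops once per source sentence while streaming the n-best file, and likewise broke out of the inner loop as soon as a hypothesis could no longer beat the current minimum loss.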
-