diff options
Diffstat (limited to 'mert/extractor.cpp')
-rw-r--r-- | mert/extractor.cpp | 219 |
1 files changed, 219 insertions, 0 deletions
diff --git a/mert/extractor.cpp b/mert/extractor.cpp new file mode 100644 index 000000000..482bbf5fc --- /dev/null +++ b/mert/extractor.cpp @@ -0,0 +1,219 @@ +/** + * Extract features and score statistics from nvest file, optionally merging with + * those from the previous iteration. + * Developed during the 2nd MT marathon. + **/ + +#include <iostream> +#include <string> +#include <vector> + +#include <getopt.h> + +#include "Data.h" +#include "Scorer.h" +#include "ScorerFactory.h" +#include "Timer.h" +#include "Util.h" + +using namespace std; + +void usage() { + cerr<<"usage: extractor [options])"<<endl; + cerr<<"[--sctype|-s] the scorer type (default BLEU)"<<endl; + cerr<<"[--scconfig|-c] configuration string passed to scorer"<<endl; + cerr<<"\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc "<<endl; + cerr<<"[--reference|-r] comma separated list of reference files"<<endl; + cerr<<"[--binary|-b] use binary output format (default to text )"<<endl; + cerr<<"[--nbest|-n] the nbest file"<<endl; + cerr<<"[--scfile|-S] the scorer data output file"<<endl; + cerr<<"[--ffile|-F] the feature data output file"<<endl; +cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl; + cerr<<"[--prev-scfile|-R] comma separated list of previous scorer data"<<endl; + cerr<<"[-v] verbose level"<<endl; + cerr<<"[--help|-h] print this message and exit"<<endl; + exit(1); +} + + +static struct option long_options[] = + { + {"sctype",required_argument,0,'s'}, + {"scconfig",required_argument,0,'c'}, + {"reference",required_argument,0,'r'}, + {"binary",no_argument,0,'b'}, + {"nbest",required_argument,0,'n'}, + {"scfile",required_argument,0,'S'}, + {"ffile",required_argument,0,'F'}, + {"prev-scfile",required_argument,0,'R'}, + {"prev-ffile",required_argument,0,'E'}, + {"verbose",required_argument,0,'v'}, + {"help",no_argument,0,'h'}, + {0, 0, 0, 0} + }; +int option_index; + +int main(int argc, char** argv) { + + + ResetUserTime(); + + /* + Timer timer; + timer.start("Starting..."); + */ + + //defaults + string scorerType("BLEU"); + string scorerConfig(""); + string referenceFile(""); + string nbestFile(""); + string scoreDataFile("statscore.data"); + string featureDataFile("features.data"); + string prevScoreDataFile(""); + string prevFeatureDataFile(""); + bool binmode = false; + int verbosity = 0; + int c; + while ((c=getopt_long (argc,argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) { + switch(c) { + case 's': + scorerType = string(optarg); + break; + case 'c': + scorerConfig = string(optarg); + break; + case 'r': + referenceFile = string(optarg); + break; + case 'b': + binmode = true; + break; + case 'n': + nbestFile = string(optarg); + break; + case 'S': + scoreDataFile = string(optarg); + break; + case 'F': + featureDataFile = string(optarg); + break; + case 'E': + prevFeatureDataFile = string(optarg); + break; + case 'R': + prevScoreDataFile = string(optarg); + break; + case 'v': + verbosity = atoi(optarg); + break; + default: + usage(); + } + } + try { + +//check whether score statistics file is specified + if (scoreDataFile.length() == 0){ + throw runtime_error("Error: output score statistics file is not specified"); + } + +//check wheter feature file is specified + if (featureDataFile.length() == 0){ + throw runtime_error("Error: output feature file is not specified"); + } + +//check whether reference file is specified when nbest is specified + if ((nbestFile.length() > 0 && referenceFile.length() == 0)){ + throw runtime_error("Error: reference file is not specified; you can not score the nbest"); + } + + + vector<string> nbestFiles; + if (nbestFile.length() > 0){ + std::string substring; + while (!nbestFile.empty()){ + getNextPound(nbestFile, substring, ","); + nbestFiles.push_back(substring); + } + } + + vector<string> referenceFiles; + if (referenceFile.length() > 0){ + std::string substring; + while (!referenceFile.empty()){ + getNextPound(referenceFile, substring, ","); + referenceFiles.push_back(substring); + } + } + + vector<string> prevScoreDataFiles; + if (prevScoreDataFile.length() > 0){ + std::string substring; + while (!prevScoreDataFile.empty()){ + getNextPound(prevScoreDataFile, substring, ","); + prevScoreDataFiles.push_back(substring); + } + } + + vector<string> prevFeatureDataFiles; + if (prevFeatureDataFile.length() > 0){ + std::string substring; + while (!prevFeatureDataFile.empty()){ + getNextPound(prevFeatureDataFile, substring, ","); + prevFeatureDataFiles.push_back(substring); + } + } + + if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()){ + throw runtime_error("Error: there is a different number of previous score and feature files"); + } + + + if (binmode) cerr << "Binary write mode is selected" << endl; + else cerr << "Binary write mode is NOT selected" << endl; + + TRACE_ERR("Scorer type: " << scorerType << endl); + ScorerFactory sfactory; + Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig); + + //load references + if (referenceFiles.size() > 0) + scorer->setReferenceFiles(referenceFiles); + + PrintUserTime("References loaded"); + + Data data(*scorer); + + //load old data + for (size_t i=0;i < prevScoreDataFiles.size(); i++){ + data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i)); + } + + PrintUserTime("Previous data loaded"); + + //computing score statistics of each nbest file + for (size_t i=0;i < nbestFiles.size(); i++){ + data.loadnbest(nbestFiles.at(i)); + } + + PrintUserTime("Nbest entries loaded and scored"); + + if (binmode) + cerr << "Binary write mode is selected" << endl; + else + cerr << "Binary write mode is NOT selected" << endl; + + data.save(featureDataFile, scoreDataFile, binmode); + PrintUserTime("Stopping..."); +/* + timer.stop("Stopping..."); + */ + + return EXIT_SUCCESS; + } catch (const exception& e) { + cerr << "Exception: " << e.what() << endl; + return EXIT_FAILURE; + } + +} |