diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-11-12 18:17:48 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-11-12 18:17:48 +0400 |
commit | 62d37fa2b66bc6e28839ff054dcffd259a9088fb (patch) | |
tree | c578252c0763c81e2583e1a80d5996ea9c9f8bce /phrase-extract/statistics-main.cpp | |
parent | 4f8f864650c955e65536328bd70f385976ce9063 (diff) |
Refactor phrase-extract/Jamfile
Diffstat (limited to 'phrase-extract/statistics-main.cpp')
-rw-r--r-- | phrase-extract/statistics-main.cpp | 346 |
1 files changed, 346 insertions, 0 deletions
diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp new file mode 100644 index 000000000..67373ec93 --- /dev/null +++ b/phrase-extract/statistics-main.cpp @@ -0,0 +1,346 @@ +// $Id$ +// vim:tabstop=2 + +#include <sstream> +#include <cstdio> +#include <iostream> +#include <fstream> +#include <vector> +#include <string> +#include <stdlib.h> +#include <assert.h> +#include <time.h> + +#include "AlignmentPhrase.h" +#include "SafeGetline.h" +#include "tables-core.h" +#include "InputFileStream.h" + +using namespace std; +using namespace MosesTraining; + +#define LINE_MAX_LENGTH 10000 + +namespace MosesTraining +{ + +class PhraseAlignment +{ +public: + int english, foreign; + vector< vector<size_t> > alignedToE; + vector< vector<size_t> > alignedToF; + + bool create( char*, int ); + void clear(); + bool equals( const PhraseAlignment& ); +}; + +class LexicalTable +{ +public: + map< WORD_ID, map< WORD_ID, double > > ltable; + void load( const string &); +}; + +} + +void processPhrasePairs( vector< PhraseAlignment > & ); + +ofstream phraseTableFile; + +Vocabulary vcbE; +Vocabulary vcbF; +LexicalTable lexTable; +PhraseTable phraseTableE; +PhraseTable phraseTableF; +bool inverseFlag; +int phrasePairBase = 0; // only used for "proper" conditioning + +int main(int argc, char* argv[]) +{ + cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n" + << "modifying PhraseScore v1.4 written by Philipp Koehn\n" + << "It computes statistics for extracted phrase pairs\n" + << "if (direct):\n" + << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n" + << "if (inverse)\n" + << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n"; + time_t starttime = time(NULL); + + if (argc != 4 && argc != 5) { + cerr << "syntax: statistics extract lex phrase-table [inverse]\n"; + exit(1); + } + char* &fileNameExtract = argv[1]; + char* &fileNameLex = argv[2]; + char* &fileNamePhraseTable = argv[3]; + inverseFlag = false; + if (argc > 4) { + inverseFlag = true; + cerr << "using inverse mode\n"; + } + + // lexical translation table + lexTable.load( fileNameLex ); + + // sorted phrase extraction file + Moses::InputFileStream extractFile(fileNameExtract); + + if (extractFile.fail()) { + cerr << "ERROR: could not open extract file " << fileNameExtract << endl; + exit(1); + } + istream &extractFileP = extractFile; + + // output file: phrase translation table + phraseTableFile.open(fileNamePhraseTable); + if (phraseTableFile.fail()) { + cerr << "ERROR: could not open file phrase table file " + << fileNamePhraseTable << endl; + exit(1); + } + + // loop through all extracted phrase translations + int lastForeign = -1; + vector< PhraseAlignment > phrasePairsWithSameF; + int i=0; + int fileCount = 0; + while(true) { + if (extractFileP.eof()) break; + if (++i % 100000 == 0) cerr << "." << flush; + char line[LINE_MAX_LENGTH]; + SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); + // if (fileCount>0) + if (extractFileP.eof()) + break; + PhraseAlignment phrasePair; + bool isPhrasePair = phrasePair.create( line, i ); + if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { + processPhrasePairs( phrasePairsWithSameF ); + for(size_t j=0; j<phrasePairsWithSameF.size(); j++) + phrasePairsWithSameF[j].clear(); + phrasePairsWithSameF.clear(); + phraseTableE.clear(); + phraseTableF.clear(); + phrasePair.clear(); // process line again, since phrase tables flushed + phrasePair.create( line, i ); + phrasePairBase = 0; + } + lastForeign = phrasePair.foreign; + if (isPhrasePair) + phrasePairsWithSameF.push_back( phrasePair ); + else + phrasePairBase++; + } + processPhrasePairs( phrasePairsWithSameF ); + phraseTableFile.close(); +} + +void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) +{ + if (phrasePair.size() == 0) return; + map<int, int> countE; + map<int, int> alignmentE; + int totalCount = 0; + int currentCount = 0; + int maxSameCount = 0; + int maxSame = -1; + int old = -1; + for(size_t i=0; i<phrasePair.size(); i++) { + if (i>0) { + if (phrasePair[old].english == phrasePair[i].english) { + if (! phrasePair[i].equals( phrasePair[old] )) { + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = i-1; + } + currentCount = 0; + } + } else { + // wrap up old E + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = i-1; + } + + alignmentE[ phrasePair[old].english ] = maxSame; + // if (maxSameCount != totalCount) + // cout << "max count is " << maxSameCount << "/" << totalCount << endl; + + // get ready for new E + totalCount = 0; + currentCount = 0; + maxSameCount = 0; + maxSame = -1; + } + } + countE[ phrasePair[i].english ]++; + old = i; + currentCount++; + totalCount++; + } + + // wrap up old E + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = phrasePair.size()-1; + } + alignmentE[ phrasePair[old].english ] = maxSame; + // if (maxSameCount != totalCount) + // cout << "max count is " << maxSameCount << "/" << totalCount << endl; + + // output table + typedef map< int, int >::iterator II; + PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign ); + size_t index = 0; + for(II i = countE.begin(); i != countE.end(); i++) { + //cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n"; + //cerr << index << endl; + + // foreign phrase (unless inverse) + if (! inverseFlag) { + for(size_t j=0; j<phraseF.size(); j++) { + phraseTableFile << vcbF.getWord( phraseF[j] ); + phraseTableFile << " "; + } + phraseTableFile << "||| "; + } + + // english phrase + PHRASE phraseE = phraseTableE.getPhrase( i->first ); + for(size_t j=0; j<phraseE.size(); j++) { + phraseTableFile << vcbE.getWord( phraseE[j] ); + phraseTableFile << " "; + } + phraseTableFile << "||| "; + + // foreign phrase (if inverse) + if (inverseFlag) { + for(size_t j=0; j<phraseF.size(); j++) { + phraseTableFile << vcbF.getWord( phraseF[j] ); + phraseTableFile << " "; + } + phraseTableFile << "||| "; + } + + // phrase pair frequency + phraseTableFile << i->second; + + //source phrase pair frequency + phraseTableFile << " " << phrasePair.size(); + + // source phrase length + phraseTableFile << " " << phraseF.size(); + + // target phrase length + phraseTableFile << " " << phraseE.size(); + + phraseTableFile << endl; + + index += i->second; + } +} + +bool PhraseAlignment::create( char line[], int lineID ) +{ + vector< string > token = tokenize( line ); + int item = 1; + PHRASE phraseF, phraseE; + for (size_t j=0; j<token.size(); j++) { + if (token[j] == "|||") item++; + else { + if (item == 1) + phraseF.push_back( vcbF.storeIfNew( token[j] ) ); + else if (item == 2) + phraseE.push_back( vcbE.storeIfNew( token[j] ) ); + else if (item == 3) { + int e,f; + sscanf(token[j].c_str(), "%d-%d", &f, &e); + if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) { + cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; + } else { + if (alignedToE.size() == 0) { + vector< size_t > dummy; + for(size_t i=0; i<phraseE.size(); i++) + alignedToE.push_back( dummy ); + for(size_t i=0; i<phraseF.size(); i++) + alignedToF.push_back( dummy ); + foreign = phraseTableF.storeIfNew( phraseF ); + english = phraseTableE.storeIfNew( phraseE ); + } + alignedToE[e].push_back( f ); + alignedToF[f].push_back( e ); + } + } + } + } + return (item>2); // real phrase pair, not just foreign phrase +} + +void PhraseAlignment::clear() +{ + for(size_t i=0; i<alignedToE.size(); i++) + alignedToE[i].clear(); + for(size_t i=0; i<alignedToF.size(); i++) + alignedToF[i].clear(); + alignedToE.clear(); + alignedToF.clear(); +} + +bool PhraseAlignment::equals( const PhraseAlignment& other ) +{ + if (this == &other) return true; + if (other.english != english) return false; + if (other.foreign != foreign) return false; + PHRASE phraseE = phraseTableE.getPhrase( english ); + PHRASE phraseF = phraseTableF.getPhrase( foreign ); + for(size_t i=0; i<phraseE.size(); i++) { + if (alignedToE[i].size() != other.alignedToE[i].size()) return false; + for(size_t j=0; j<alignedToE[i].size(); j++) { + if (alignedToE[i][j] != other.alignedToE[i][j]) return false; + } + } + for(size_t i=0; i<phraseF.size(); i++) { + if (alignedToF[i].size() != other.alignedToF[i].size()) return false; + for(size_t j=0; j<alignedToF[i].size(); j++) { + if (alignedToF[i][j] != other.alignedToF[i][j]) return false; + } + } + return true; +} + +void LexicalTable::load( const string &filePath ) +{ + cerr << "Loading lexical translation table from " << filePath; + ifstream inFile; + inFile.open(filePath.c_str()); + if (inFile.fail()) { + cerr << " - ERROR: could not open file\n"; + exit(1); + } + istream *inFileP = &inFile; + + char line[LINE_MAX_LENGTH]; + + int i=0; + while(true) { + i++; + if (i%100000 == 0) cerr << "." << flush; + SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); + if (inFileP->eof()) break; + + vector<string> token = tokenize( line ); + if (token.size() != 3) { + cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << + token.size() << " " << token[0] << " " << line << endl; + continue; + } + + double prob = atof( token[2].c_str() ); + WORD_ID wordE = vcbE.storeIfNew( token[0] ); + WORD_ID wordF = vcbF.storeIfNew( token[1] ); + ltable[ wordF ][ wordE ] = prob; + } + cerr << endl; +} |