From 62d37fa2b66bc6e28839ff054dcffd259a9088fb Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 12 Nov 2012 14:17:48 +0000 Subject: Refactor phrase-extract/Jamfile --- phrase-extract/statistics-main.cpp | 346 +++++++++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) create mode 100644 phrase-extract/statistics-main.cpp (limited to 'phrase-extract/statistics-main.cpp') diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp new file mode 100644 index 000000000..67373ec93 --- /dev/null +++ b/phrase-extract/statistics-main.cpp @@ -0,0 +1,346 @@ +// $Id$ +// vim:tabstop=2 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "AlignmentPhrase.h" +#include "SafeGetline.h" +#include "tables-core.h" +#include "InputFileStream.h" + +using namespace std; +using namespace MosesTraining; + +#define LINE_MAX_LENGTH 10000 + +namespace MosesTraining +{ + +class PhraseAlignment +{ +public: + int english, foreign; + vector< vector > alignedToE; + vector< vector > alignedToF; + + bool create( char*, int ); + void clear(); + bool equals( const PhraseAlignment& ); +}; + +class LexicalTable +{ +public: + map< WORD_ID, map< WORD_ID, double > > ltable; + void load( const string &); +}; + +} + +void processPhrasePairs( vector< PhraseAlignment > & ); + +ofstream phraseTableFile; + +Vocabulary vcbE; +Vocabulary vcbF; +LexicalTable lexTable; +PhraseTable phraseTableE; +PhraseTable phraseTableF; +bool inverseFlag; +int phrasePairBase = 0; // only used for "proper" conditioning + +int main(int argc, char* argv[]) +{ + cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n" + << "modifying PhraseScore v1.4 written by Philipp Koehn\n" + << "It computes statistics for extracted phrase pairs\n" + << "if (direct):\n" + << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n" + << "if (inverse)\n" + << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n"; + time_t starttime = time(NULL); + + if (argc != 4 && argc != 5) { + cerr << "syntax: statistics extract lex phrase-table [inverse]\n"; + exit(1); + } + char* &fileNameExtract = argv[1]; + char* &fileNameLex = argv[2]; + char* &fileNamePhraseTable = argv[3]; + inverseFlag = false; + if (argc > 4) { + inverseFlag = true; + cerr << "using inverse mode\n"; + } + + // lexical translation table + lexTable.load( fileNameLex ); + + // sorted phrase extraction file + Moses::InputFileStream extractFile(fileNameExtract); + + if (extractFile.fail()) { + cerr << "ERROR: could not open extract file " << fileNameExtract << endl; + exit(1); + } + istream &extractFileP = extractFile; + + // output file: phrase translation table + phraseTableFile.open(fileNamePhraseTable); + if (phraseTableFile.fail()) { + cerr << "ERROR: could not open file phrase table file " + << fileNamePhraseTable << endl; + exit(1); + } + + // loop through all extracted phrase translations + int lastForeign = -1; + vector< PhraseAlignment > phrasePairsWithSameF; + int i=0; + int fileCount = 0; + while(true) { + if (extractFileP.eof()) break; + if (++i % 100000 == 0) cerr << "." << flush; + char line[LINE_MAX_LENGTH]; + SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); + // if (fileCount>0) + if (extractFileP.eof()) + break; + PhraseAlignment phrasePair; + bool isPhrasePair = phrasePair.create( line, i ); + if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { + processPhrasePairs( phrasePairsWithSameF ); + for(size_t j=0; j &phrasePair ) +{ + if (phrasePair.size() == 0) return; + map countE; + map alignmentE; + int totalCount = 0; + int currentCount = 0; + int maxSameCount = 0; + int maxSame = -1; + int old = -1; + for(size_t i=0; i0) { + if (phrasePair[old].english == phrasePair[i].english) { + if (! phrasePair[i].equals( phrasePair[old] )) { + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = i-1; + } + currentCount = 0; + } + } else { + // wrap up old E + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = i-1; + } + + alignmentE[ phrasePair[old].english ] = maxSame; + // if (maxSameCount != totalCount) + // cout << "max count is " << maxSameCount << "/" << totalCount << endl; + + // get ready for new E + totalCount = 0; + currentCount = 0; + maxSameCount = 0; + maxSame = -1; + } + } + countE[ phrasePair[i].english ]++; + old = i; + currentCount++; + totalCount++; + } + + // wrap up old E + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = phrasePair.size()-1; + } + alignmentE[ phrasePair[old].english ] = maxSame; + // if (maxSameCount != totalCount) + // cout << "max count is " << maxSameCount << "/" << totalCount << endl; + + // output table + typedef map< int, int >::iterator II; + PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign ); + size_t index = 0; + for(II i = countE.begin(); i != countE.end(); i++) { + //cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n"; + //cerr << index << endl; + + // foreign phrase (unless inverse) + if (! inverseFlag) { + for(size_t j=0; jfirst ); + for(size_t j=0; jsecond; + + //source phrase pair frequency + phraseTableFile << " " << phrasePair.size(); + + // source phrase length + phraseTableFile << " " << phraseF.size(); + + // target phrase length + phraseTableFile << " " << phraseE.size(); + + phraseTableFile << endl; + + index += i->second; + } +} + +bool PhraseAlignment::create( char line[], int lineID ) +{ + vector< string > token = tokenize( line ); + int item = 1; + PHRASE phraseF, phraseE; + for (size_t j=0; j= phraseE.size() || (size_t)f >= phraseF.size()) { + cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; + } else { + if (alignedToE.size() == 0) { + vector< size_t > dummy; + for(size_t i=0; i2); // real phrase pair, not just foreign phrase +} + +void PhraseAlignment::clear() +{ + for(size_t i=0; ieof()) break; + + vector token = tokenize( line ); + if (token.size() != 3) { + cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << + token.size() << " " << token[0] << " " << line << endl; + continue; + } + + double prob = atof( token[2].c_str() ); + WORD_ID wordE = vcbE.storeIfNew( token[0] ); + WORD_ID wordF = vcbF.storeIfNew( token[1] ); + ltable[ wordF ][ wordE ] = prob; + } + cerr << endl; +} -- cgit v1.2.3