diff options
author | bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-07 20:40:55 +0400 |
---|---|---|
committer | bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-07 20:40:55 +0400 |
commit | de51b69d030a02d3e3117d97774c398e0cdd333b (patch) | |
tree | 022a0e27fc674100a65f8c863e290c87b38774ad /scripts | |
parent | 41a184943720ddf85ac83339ecffa6db15ed8efb (diff) |
remove (temporarily)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4185 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/training/phrase-extract/score.cpp | 515 |
1 files changed, 0 insertions, 515 deletions
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp deleted file mode 100644 index 8a09c519e..000000000 --- a/scripts/training/phrase-extract/score.cpp +++ /dev/null @@ -1,515 +0,0 @@ -/*********************************************************************** - Moses - factored phrase-based language decoder - Copyright (C) 2009 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - ***********************************************************************/ - -#include <sstream> -#include <cstdio> -#include <iostream> -#include <fstream> -#include <vector> -#include <stdlib.h> -#include <assert.h> -#include <cstring> -#include <set> - -#include "SafeGetline.h" -#include "tables-core.h" -#include "PhraseAlignment.h" -#include "score.h" -#include "InputFileStream.h" - -using namespace std; - -#define LINE_MAX_LENGTH 100000 - -Vocabulary vcbT; -Vocabulary vcbS; - -class LexicalTable -{ -public: - map< WORD_ID, map< WORD_ID, double > > ltable; - void load( char[] ); - double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) { - // cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":"; - if (ltable.find( wordS ) == ltable.end()) return 1.0; - if (ltable[ wordS ].find( wordT ) == ltable[ wordS ].end()) return 1.0; - // cout << ltable[ wordS ][ wordT ]; - return ltable[ wordS ][ wordT ]; - } -}; - -vector<string> tokenize( const char [] ); - -void computeCountOfCounts( char* fileNameExtract, int maxLines ); -void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile); -PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection & ); -void outputPhrasePair(const PhraseAlignmentCollection &, float, ostream &phraseTableFile ); -double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * ); - -LexicalTable lexTable; -bool inverseFlag = false; -bool hierarchicalFlag = false; -bool wordAlignmentFlag = false; -bool goodTuringFlag = false; -#define GT_MAX 10 -bool logProbFlag = false; -int negLogProb = 1; -bool lexFlag = true; -int countOfCounts[GT_MAX+1]; -float discountFactor[GT_MAX+1]; -int maxLinesGTDiscount = -1; -bool phrasePairCountFlag = false; - -int main(int argc, char* argv[]) -{ - cerr << "Score v2.0 written by Philipp Koehn\n" - << "scoring methods for extracted rules\n"; - - if (argc < 4) { - cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment] [--MaxLinesGTDiscount num] [--PhrasePairCount]\n"; - exit(1); - } - char* fileNameExtract = argv[1]; - char* fileNameLex = argv[2]; - char* fileNamePhraseTable = argv[3]; - - for(int i=4; i<argc; i++) { - if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { - inverseFlag = true; - cerr << "using inverse mode\n"; - } else if (strcmp(argv[i],"--Hierarchical") == 0) { - hierarchicalFlag = true; - cerr << "processing hierarchical rules\n"; - } else if (strcmp(argv[i],"--WordAlignment") == 0) { - wordAlignmentFlag = true; - cerr << "outputing word alignment" << endl; - } else if (strcmp(argv[i],"--NoLex") == 0) { - lexFlag = false; - cerr << "not computing lexical translation score\n"; - } else if (strcmp(argv[i],"--GoodTuring") == 0) { - goodTuringFlag = true; - cerr << "using Good Turing discounting\n"; - } else if (strcmp(argv[i],"--LogProb") == 0) { - logProbFlag = true; - cerr << "using log-probabilities\n"; - } else if (strcmp(argv[i],"--NegLogProb") == 0) { - logProbFlag = true; - negLogProb = -1; - cerr << "using negative log-probabilities\n"; - } else if (strcmp(argv[i],"--MaxLinesGTDiscount") == 0) { - ++i; - maxLinesGTDiscount = atoi(argv[i]); - cerr << "maxLinesGTDiscount=" << maxLinesGTDiscount << endl; - } else if (strcmp(argv[i],"--PhrasePairCount") == 0) { - phrasePairCountFlag = true; - cerr << "outputting phrase pair counts" << endl; - } else { - cerr << "ERROR: unknown option " << argv[i] << endl; - exit(1); - } - } - - // lexical translation table - if (lexFlag) - lexTable.load( fileNameLex ); - - // compute count of counts for Good Turing discounting - if (goodTuringFlag) - computeCountOfCounts( fileNameExtract, maxLinesGTDiscount ); - - // sorted phrase extraction file - Moses::InputFileStream extractFile(fileNameExtract); - - if (extractFile.fail()) { - cerr << "ERROR: could not open extract file " << fileNameExtract << endl; - exit(1); - } - istream &extractFileP = extractFile; - - // output file: phrase translation table - ostream *phraseTableFile; - - if (strcmp(fileNamePhraseTable, "-") == 0) { - phraseTableFile = &cout; - } - else { - ofstream *outputFile = new ofstream(); - outputFile->open(fileNamePhraseTable); - if (outputFile->fail()) { - cerr << "ERROR: could not open file phrase table file " - << fileNamePhraseTable << endl; - exit(1); - } - phraseTableFile = outputFile; - } - - // loop through all extracted phrase translations - float lastCount = 0.0f; - vector< PhraseAlignment > phrasePairsWithSameF; - int i=0; - char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; - lastLine[0] = '\0'; - PhraseAlignment *lastPhrasePair = NULL; - while(true) { - if (extractFileP.eof()) break; - if (++i % 100000 == 0) - { - cerr << i << " " << flush; - } - - SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (extractFileP.eof()) break; - - // identical to last line? just add count - if (strcmp(line,lastLine) == 0) { - lastPhrasePair->count += lastCount; - continue; - } - strcpy( lastLine, line ); - - // create new phrase pair - PhraseAlignment phrasePair; - phrasePair.create( line, i ); - lastCount = phrasePair.count; - - // only differs in count? just add count - if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { - lastPhrasePair->count += phrasePair.count; - continue; - } - - // if new source phrase, process last batch - if (lastPhrasePair != NULL && - lastPhrasePair->GetSource() != phrasePair.GetSource()) { - - processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); - phrasePairsWithSameF.clear(); - lastPhrasePair = NULL; - } - - // add phrase pairs to list, it's now the last one - phrasePairsWithSameF.push_back( phrasePair ); - lastPhrasePair = &phrasePairsWithSameF.back(); - } - processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); - - phraseTableFile->flush(); - if (phraseTableFile != &cout) { - (dynamic_cast<ofstream*>(phraseTableFile))->close(); - delete phraseTableFile; - } -} - -void computeCountOfCounts( char* fileNameExtract, int maxLines ) -{ - cerr << "computing counts of counts"; - for(int i=1; i<=GT_MAX; i++) countOfCounts[i] = 0; - - Moses::InputFileStream extractFile(fileNameExtract); - if (extractFile.fail()) { - cerr << "ERROR: could not open extract file " << fileNameExtract << endl; - exit(1); - } - istream &extractFileP = extractFile; - - // loop through all extracted phrase translations - int lineNum = 0; - char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; - lastLine[0] = '\0'; - float lastCount = 0.0f; - PhraseAlignment *lastPhrasePair = NULL; - while(true) { - if (extractFileP.eof()) break; - if (maxLines > 0 && lineNum >= maxLines) break; - if (++lineNum % 100000 == 0) cerr << "." << flush; - SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (extractFileP.eof()) break; - - // identical to last line? just add count - if (strcmp(line,lastLine) == 0) { - lastPhrasePair->count += lastCount; - continue; - } - strcpy( lastLine, line ); - - // create new phrase pair - PhraseAlignment *phrasePair = new PhraseAlignment(); - phrasePair->create( line, lineNum ); - lastCount = phrasePair->count; - - if (lineNum == 1) { - lastPhrasePair = phrasePair; - continue; - } - - // only differs in count? just add count - if (lastPhrasePair->match( *phrasePair )) { - lastPhrasePair->count += phrasePair->count; - phrasePair->clear(); - delete(phrasePair); - continue; - } - - int count = lastPhrasePair->count + 0.99999; - if(count <= GT_MAX) - countOfCounts[ count ]++; - lastPhrasePair->clear(); - delete( lastPhrasePair ); - lastPhrasePair = phrasePair; - } - - delete lastPhrasePair; - - discountFactor[0] = 0.01; // floor - cerr << "\n"; - for(int i=1; i<GT_MAX; i++) { - discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); - cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; - // some smoothing... - if (discountFactor[i]>1) - discountFactor[i] = 1; - if (discountFactor[i]<discountFactor[i-1]) - discountFactor[i] = discountFactor[i-1]; - cerr << " -> " << discountFactor[i]*i << endl; - } -} - -void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile ) -{ - if (phrasePair.size() == 0) return; - - // group phrase pairs based on alignments that matter - // (i.e. that re-arrange non-terminals) - PhrasePairGroup phrasePairGroup; - float totalSource = 0; - - //cerr << "phrasePair.size() = " << phrasePair.size() << endl; - - // loop through phrase pairs - for(size_t i=0; i<phrasePair.size(); i++) { - // add to total count - PhraseAlignment &currPhrasePair = phrasePair[i]; - - totalSource += phrasePair[i].count; - - // check for matches - //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl; - - PhraseAlignmentCollection phraseAlignColl; - phraseAlignColl.push_back(&currPhrasePair); - pair<PhrasePairGroup::iterator, bool> retInsert; - retInsert = phrasePairGroup.insert(phraseAlignColl); - if (!retInsert.second) - { // already exist. Add to that collection instead - PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first); - existingColl.push_back(&currPhrasePair); - } - - } - - const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl(); - PhrasePairGroup::SortedColl::const_iterator iter; - - for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) - { - const PhraseAlignmentCollection &group = **iter; - outputPhrasePair( group, totalSource, phraseTableFile ); - } -} - -PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair ) -{ - float bestAlignmentCount = -1; - PhraseAlignment* bestAlignment; - - for(int i=0; i<phrasePair.size(); i++) { - if (phrasePair[i]->count > bestAlignmentCount) { - bestAlignmentCount = phrasePair[i]->count; - bestAlignment = phrasePair[i]; - } - } - - return bestAlignment; -} - -void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, ostream &phraseTableFile ) -{ - if (phrasePair.size() == 0) return; - - PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); - - // compute count - float count = 0; - for(size_t i=0; i<phrasePair.size(); i++) { - count += phrasePair[i]->count; - } - const float originalCount = count; - - const PHRASE &phraseS = phrasePair[0]->GetSource(); - const PHRASE &phraseT = phrasePair[0]->GetTarget(); - - // labels (if hierarchical) - - // source phrase (unless inverse) - if (! inverseFlag) { - for(int j=0; j<phraseS.size(); j++) { - phraseTableFile << vcbS.getWord( phraseS[j] ); - phraseTableFile << " "; - } - phraseTableFile << "||| "; - } - - // target phrase - for(int j=0; j<phraseT.size(); j++) { - phraseTableFile << vcbT.getWord( phraseT[j] ); - phraseTableFile << " "; - } - phraseTableFile << "||| "; - - // source phrase (if inverse) - if (inverseFlag) { - for(int j=0; j<phraseS.size(); j++) { - phraseTableFile << vcbS.getWord( phraseS[j] ); - phraseTableFile << " "; - } - phraseTableFile << "||| "; - } - - // phrase translation probability - if (goodTuringFlag && count<GT_MAX) - count *= discountFactor[(int)(count+0.99999)]; - double condScore = count / totalCount; - phraseTableFile << ( logProbFlag ? negLogProb*log(condScore) : condScore ); - - // lexical translation probability - if (lexFlag) { - double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment); - phraseTableFile << " " << ( logProbFlag ? negLogProb*log(lexScore) : lexScore ); - } - - phraseTableFile << " ||| "; - - // alignment info for non-terminals - if (! inverseFlag) { - if (hierarchicalFlag) { - // always output alignment if hiero style, but only for non-terms - assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); - for(int j = 0; j < phraseT.size() - 1; j++) { - if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { - if (bestAlignment->alignedToT[ j ].size() != 1) { - cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl; - phraseTableFile.flush(); - assert(bestAlignment->alignedToT[ j ].size() == 1); - } - int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); - phraseTableFile << sourcePos << "-" << j << " "; - } - } - } else if (wordAlignmentFlag) { - // alignment info in pb model - for(int j=0; j<bestAlignment->alignedToT.size(); j++) { - const set< size_t > &aligned = bestAlignment->alignedToT[j]; - for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { - phraseTableFile << *p << "-" << j << " "; - } - } - } - } - - phraseTableFile << " ||| " << totalCount; - if (phrasePairCountFlag) { - phraseTableFile << " " << originalCount; - } - phraseTableFile << endl; -} - -double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) -{ - // lexical translation probability - double lexScore = 1.0; - int null = vcbS.getWordID("NULL"); - // all target words have to be explained - for(int ti=0; ti<alignment->alignedToT.size(); ti++) { - const set< size_t > & srcIndices = alignment->alignedToT[ ti ]; - if (srcIndices.empty()) { - // explain unaligned word by NULL - lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] ); - } else { - // go through all the aligned words to compute average - double thisWordScore = 0; - for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) { - thisWordScore += lexTable.permissiveLookup( phraseS[ *p ], phraseT[ ti ] ); - } - lexScore *= thisWordScore / (double)srcIndices.size(); - } - } - return lexScore; -} - -void LexicalTable::load( char *fileName ) -{ - cerr << "Loading lexical translation table from " << fileName; - ifstream inFile; - inFile.open(fileName); - if (inFile.fail()) { - cerr << " - ERROR: could not open file\n"; - exit(1); - } - istream *inFileP = &inFile; - - char line[LINE_MAX_LENGTH]; - - int i=0; - while(true) { - i++; - if (i%100000 == 0) cerr << "." << flush; - SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (inFileP->eof()) break; - - vector<string> token = tokenize( line ); - if (token.size() != 3) { - cerr << "line " << i << " in " << fileName - << " has wrong number of tokens, skipping:\n" - << token.size() << " " << token[0] << " " << line << endl; - continue; - } - - double prob = atof( token[2].c_str() ); - WORD_ID wordT = vcbT.storeIfNew( token[0] ); - WORD_ID wordS = vcbS.storeIfNew( token[1] ); - ltable[ wordS ][ wordT ] = prob; - } - cerr << endl; -} - - -std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj ) -{ - std::pair<iterator,bool> ret = m_coll.insert(obj); - - if (ret.second) - { // obj inserted. Also add to sorted vector - const PhraseAlignmentCollection &insertedObj = *ret.first; - m_sortedColl.push_back(&insertedObj); - } - - return ret; -} - - |