/*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************//* consolidate: merges a direct and an indirect rule table into one consolidated table (see processFiles). NOTE(review): the seven bare #include directives below have lost their header names, and further down a loop in main, parts of loadCountOfCounts and two vector element types are missing as well, so this copy looks like it went through a tag-stripping step. Restore those spans from a clean copy before building. */ #include #include #include #include #include #include #include #include "tables-core.h" #include "SafeGetline.h" #include "InputFileStream.h" #include "OutputFileStream.h"/* Size of the shared input buffer named line; also passed to SAFE_GETLINE as the maximum line length. */ #define LINE_MAX_LENGTH 10000 using namespace std;/* Global option flags read by the functions below. Only phraseCountFlag starts out true. hierarchicalFlag is not read anywhere in the visible code. */ bool hierarchicalFlag = false; bool onlyDirectFlag = false; bool phraseCountFlag = true; bool lowCountFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; bool logProbFlag = false; bool outputNTLengths = false;/* Returns log(a) when logProbFlag is set, otherwise returns a unchanged. */ inline float maybeLogProb( float a ) { return logProbFlag ? 
log(a) : a; }/* Shared line buffer: filled by SAFE_GETLINE in getLine and loadCountOfCounts, then parsed by splitLine or atof. Because it is a single global, every read overwrites the previous line. */ char line[LINE_MAX_LENGTH]; void processFiles( char*, char*, char*, char* ); void loadCountOfCounts( char* ); bool getLine( istream &fileP, vector< string > &item ); vector< string > splitLine();/* Entry point: prints a banner to cerr, exits with status 1 and a syntax message when fewer than three file name arguments are given, binds argv[1] to argv[3] as the direct, indirect and consolidated file names, then starts a loop at i=4. NOTE(review): the body of that loop and everything up to the declaration of countOfCounts is missing in this copy, so the option handling and the call into processFiles are not visible here. NOTE(review): fileNameCountOfCounts is declared without an initializer; confirm it is always set before it is used. */ int main(int argc, char* argv[]) { cerr << "Consolidate v2.0 written by Philipp Koehn\n" << "consolidating direct and indirect rule tables\n"; if (argc < 4) { cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--OutputNTLengths] \n"; exit(1); } char* &fileNameDirect = argv[1]; char* &fileNameIndirect = argv[2]; char* &fileNameConsolidated = argv[3]; char* fileNameCountOfCounts; for(int i=4; i countOfCounts; vector< float > goodTuringDiscount;/* Kneser-Ney discounts D1 to D3 plus the total count. totalCount starts at -1 so that the first line read from the count-of-counts file is taken as the total. */ float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;/* Reads the count-of-counts file named by the argument. Exits with status 1 if the file cannot be opened. Index 0 of countOfCounts is a 0.0 placeholder; the first line read sets totalCount and every later line is appended to countOfCounts. When goodTuringFlag is set, goodTuringDiscount starts with a floor value of 0.01. NOTE(review): the rest of the Good-Turing loop and the computation of the Kneser-Ney discounts are missing in this copy; only the upper clamps of D1, D2 and D3 to 0.9, 1.9 and 2.9 are visible. */ void loadCountOfCounts( char* fileNameCountOfCounts ) { Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts); if (fileCountOfCounts.fail()) { cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl; exit(1); } istream &fileP = fileCountOfCounts; countOfCounts.push_back(0.0); while(1) { if (fileP.eof()) break; SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (fileP.eof()) break; if (totalCount < 0) totalCount = atof(line); // total number of distinct phrase pairs else countOfCounts.push_back( atof(line) ); } fileCountOfCounts.Close(); // compute Good Turing discounts if (goodTuringFlag) { goodTuringDiscount.push_back(0.01); // floor value for( size_t i=1; i1) goodTuringDiscount[i] = 1; if (goodTuringDiscount[i] 0.9) kneserNey_D1 = 0.9; if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9; if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9; }/* Reads the direct and the indirect table in lockstep and writes one consolidated line per pair of input lines. Loads the count-of-counts file first when goodTuringFlag or kneserNeyFlag is set. Exits with status 1 when a file cannot be opened or when field 0 or field 1 differs between the two tables. Each output line holds field 0, field 1, the scores, field 3 of the direct table, the two counts countE and countF and, with outputNTLengths, field 5 of the direct table. NOTE(review): fields 0 to 4 (and 5 with outputNTLengths) and the tokens of the count field are indexed without any size check, so a short or malformed line is not detected. */ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts ) { if (goodTuringFlag || kneserNeyFlag) loadCountOfCounts( fileNameCountOfCounts ); // open input files Moses::InputFileStream fileDirect(fileNameDirect); Moses::InputFileStream 
fileIndirect(fileNameIndirect); if (fileDirect.fail()) { cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl; exit(1); } istream &fileDirectP = fileDirect; if (fileIndirect.fail()) { cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl; exit(1); } istream &fileIndirectP = fileIndirect; // open output file: consolidated phrase table Moses::OutputFileStream fileConsolidated; bool success = fileConsolidated.Open(fileNameConsolidated); if (!success) { cerr << "ERROR: could not open output file " << fileNameConsolidated << endl; exit(1); } // loop through all extracted phrase translations int i=0; while(true) { i++; if (i%100000 == 0) cerr << "." << flush; vector< string > itemDirect, itemIndirect;/* The indirect table is read first and the loop ends as soon as either read fails. NOTE(review): a difference in length between the two tables therefore ends the loop silently instead of being reported. */ if (! getLine(fileIndirectP,itemIndirect) || ! getLine(fileDirectP, itemDirect )) break; // direct: target source alignment probabilities // indirect: source target probabilities // consistency checks if (itemDirect[0].compare( itemIndirect[0] ) != 0) { cerr << "ERROR: target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl; exit(1); } if (itemDirect[1].compare( itemIndirect[1] ) != 0) { cerr << "ERROR: source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl; exit(1); } // output hierarchical phrase pair (with separated labels) fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1]; // SCORES ... 
fileConsolidated << " |||";/* NOTE(review): the element type of the next two vectors is missing in this copy. The elements are used through c_str(), so presumably these are vectors of string as returned by tokenize - confirm against tables-core.h. Field 4 of each table holds the counts: countF is token 0 of the direct table, countE and countEF are tokens 0 and 1 of the indirect table, and token 2 of each is read only under kneserNeyFlag. */ vector directCounts = tokenize(itemDirect[4].c_str()); vector indirectCounts = tokenize(itemIndirect[4].c_str()); float countF = atof(directCounts[0].c_str()); float countE = atof(indirectCounts[0].c_str()); float countEF = atof(indirectCounts[1].c_str());/* n1_F and n1_E are assigned only when kneserNeyFlag is set and are read only under that same flag. */ float n1_F, n1_E; if (kneserNeyFlag) { n1_F = atof(directCounts[2].c_str()); n1_E = atof(indirectCounts[2].c_str()); } // Good Turing discounting float adjustedCountEF = countEF;/* The discount is looked up at countEF plus 0.99998 truncated to int, which rounds a fractional count up to the next index; counts beyond the end of the discount table are left unchanged. */ if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1) adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)]; float adjustedCountEF_indirect = adjustedCountEF; // Kneser Ney discounting [Foster et al, 2006] if (kneserNeyFlag) { float D = kneserNey_D3;/* NOTE(review): the next two ifs are not chained. Any countEF below 2 is first given D1 and is then overwritten with D2 by the second test, so D1 never takes effect. Probably the second test was meant to be an else branch - confirm before changing, since it alters the scores. */ if (countEF < 2) D = kneserNey_D1; if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct float alpha_F = D * n1_F / countF; // available mass adjustedCountEF = countEF - D + countF * alpha_F * p_b_E; // for indirect float p_b_F = n1_F / totalCount; // target phrase prob based on distinct float alpha_E = D * n1_E / countE; // available mass adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F; } // prob indirect if (!onlyDirectFlag) { fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE); fileConsolidated << " " << itemIndirect[2]; } // prob direct fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF); fileConsolidated << " " << itemDirect[2]; // phrase count feature if (phraseCountFlag) { fileConsolidated << " " << maybeLogProb(2.718); } // low count feature if (lowCountFlag) { fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF)); } // alignment fileConsolidated << " ||| " << itemDirect[3];/* NOTE(review): the separator written before the counts has no leading space, unlike the other separators in this function. Presumably field 3 already ends in a space - confirm against the input format. */ // counts, for debugging fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF; if (outputNTLengths) { fileConsolidated << " ||| " << itemDirect[5]; } fileConsolidated << 
endl; } fileDirect.Close(); fileIndirect.Close(); fileConsolidated.Close(); }/* Reads one line from fileP into the shared buffer and stores its fields in item. Returns false, leaving item untouched, when the stream reports end of file either before or after the read; returns true otherwise. NOTE(review): a final line without a trailing newline would presumably set eof during the read and be dropped by the second check - confirm against the SAFE_GETLINE macro. */ bool getLine( istream &fileP, vector< string > &item ) { if (fileP.eof()) return false; SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (fileP.eof()) return false; item = splitLine(); return true; }/* Splits the shared buffer on the five-character separator made of a space, three pipe characters and a space, and returns the fields in order. The result always has at least one element because the text after the last separator is pushed unconditionally. After a separator the index is advanced by 3 and the loop adds 1, which leaves it on the closing space of that separator; this lets the same space open an immediately following separator, and the start greater than i test then yields an empty field for it. NOTE(review): betweenWords is never read. */ vector< string > splitLine() { vector< string > item; bool betweenWords = true; int start=0; int i=0; for(; line[i] != '\0'; i++) { if (line[i] == ' ' && line[i+1] == '|' && line[i+2] == '|' && line[i+3] == '|' && line[i+4] == ' ') { if (start > i) start = i; // empty item item.push_back( string( line+start, i-start ) ); start = i+5; i += 3; } } item.push_back( string( line+start, i-start ) ); return item; }