From f69c1dab02e72f5e73c2099d126cd4be86c9224d Mon Sep 17 00:00:00 2001
From: Philipp Koehn
Date: Wed, 4 Feb 2015 09:18:09 +0000
Subject: more efficient default recaser training

---
 phrase-extract/consolidate-direct-main.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'phrase-extract/consolidate-direct-main.cpp')

diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index 576cdd568..423a3909b 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -28,6 +28,8 @@
 
 using namespace std;
 
+std::vector<std::string> tokenize( const char [] );
+
 vector< string > splitLine(const char *line)
 {
   vector< string > item;
@@ -58,7 +60,7 @@ bool getLine( istream &fileP, vector< string > &item )
   string line;
   if (getline(fileP, line)) {
     item = splitLine(line.c_str());
-    return false;
+    return true;
   } else {
     return false;
   }
@@ -107,17 +109,17 @@ int main(int argc, char* argv[])
     if (! getLine(fileDirectP, itemDirect ))
       break;
 
-    (*fileConsolidated) << itemDirect[0] << " ||| " << itemDirect[1] << " ||| ";
-
-    // output alignment and probabilities
-    (*fileConsolidated) << itemDirect[2] // prob direct
-                        << " 2.718" // phrase count feature
-                        << " ||| " << itemDirect[3]; // alignment
-
-    // counts
-    (*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect
-    (*fileConsolidated) << endl;
-
+    vector< string > count = tokenize( itemDirect[4].c_str() );
+    float countEF = atof(count[0].c_str());
+    float countF = atof(count[1].c_str());
+    float prob = countF/countEF;
+
+    (*fileConsolidated) << itemDirect[0] << " ||| " // source
+                        << itemDirect[1] << " ||| " // target
+                        << prob << " ||| " // prob
+                        << itemDirect[2] << "||| " // alignment
+                        << itemDirect[4] << " " << countEF // counts
+                        << " ||| " << endl;
   }
 
   fileConsolidated->flush();
-- 
cgit v1.2.3