Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2015-03-06 01:25:32 +0300
committer: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2015-03-06 01:25:32 +0300
commit: aa077ab66c802ff8053dc8d0c3741a14625e24a1 (patch)
tree: 65a960d8de7c65f4b4740838b0911d7555d25dbd /phrase-extract/consolidate-main.cpp
parent: cb2e1b8a4036b58f81b85e5c43a970832a0df660 (diff)
GHKM extraction / consolidate: write most frequent POS sequence from property to factor (for usage with a POS LM)
Diffstat (limited to 'phrase-extract/consolidate-main.cpp')
-rw-r--r-- phrase-extract/consolidate-main.cpp 51
1 files changed, 38 insertions, 13 deletions
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index e48009769..7ef146f64 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -25,7 +25,7 @@
#include <cstdlib>
#include <cstring>
-#include "tables-core.h"
+#include "moses/Util.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "PropertiesConsolidator.h"
@@ -46,7 +46,12 @@ float minScore2 = 0;
inline float maybeLogProb( float a )
{
- return logProbFlag ? log(a) : a;
+ return logProbFlag ? std::log(a) : a;
+}
+
+inline bool isNonTerminal( const std::string &word )
+{
+ return (word.length()>=3 && word[0] == '[' && word[word.length()-1] == ']');
}
void processFiles( char*, char*, char*, char*, char*, char* );
@@ -294,8 +299,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );
- vector<string> directCounts = tokenize(itemDirect[4].c_str());
- vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+ vector<string> directCounts = Moses::Tokenize(itemDirect[4]);
+ vector<string> indirectCounts = Moses::Tokenize(itemIndirect[4]);
float countF = atof(directCounts[0].c_str());
float countE = atof(indirectCounts[0].c_str());
float countEF = atof(indirectCounts[1].c_str());
@@ -334,8 +339,32 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
continue;
}
- // output hierarchical phrase pair (with separated labels)
- fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";
+ // output phrase pair
+ fileConsolidated << itemDirect[0] << " ||| ";
+
+ if (partsOfSpeechFlag) {
+ // write POS factor from property
+ std::vector<std::string> targetTokens = Moses::Tokenize(itemDirect[1]);
+ std::vector<std::string> propertyValuePOS;
+ propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
+ size_t targetTerminalIndex = 0;
+ for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin();
+ targetTokensIt!=targetTokens.end(); ++targetTokensIt) {
+ fileConsolidated << *targetTokensIt;
+ if (!isNonTerminal(*targetTokensIt)) {
+ assert(propertyValuePOS.size() > targetTerminalIndex);
+ fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex];
+ ++targetTerminalIndex;
+ }
+ fileConsolidated << " ";
+ }
+ fileConsolidated << "|||";
+
+ } else {
+
+ fileConsolidated << itemDirect[1] << " |||";
+ }
+
// prob indirect
if (!onlyDirectFlag) {
@@ -354,7 +383,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// low count feature
if (lowCountFlag) {
- fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
+ fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF));
}
// count bin feature (as a core feature)
@@ -405,11 +434,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// arbitrary key-value pairs
fileConsolidated << " |||";
if (itemDirect.size() >= 6) {
- //if (sourceLabelsFlag) {
- fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
- //} else {
- // fileConsolidated << itemDirect[5];
- //}
+ propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated);
}
fileConsolidated << endl;
@@ -423,7 +448,7 @@ void breakdownCoreAndSparse( string combined, string &core, string &sparse )
{
core = "";
sparse = "";
- vector<string> score = tokenize( combined.c_str() );
+ vector<string> score = Moses::Tokenize( combined );
for(size_t i=0; i<score.size(); i++) {
if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
core += " " + score[i];