Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author phikoehn <pkoehn@inf.ed.ac.uk> 2012-08-19 02:47:05 +0400
committer phikoehn <pkoehn@inf.ed.ac.uk> 2012-08-19 02:47:05 +0400
commit 366ab93f8aa53b7b065fe8366201bd59dafc51ba (patch)
tree da32d2aaaa424d633ad55574dac4257067fd73ab /phrase-extract
parent 388f5a2046d4c45f3b311bbe133be02c3fa560d8 (diff)
a lot of changes
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/Jamfile 5
-rw-r--r-- phrase-extract/PhraseAlignment.cpp 14
-rw-r--r-- phrase-extract/PhraseAlignment.h 5
-rw-r--r-- phrase-extract/consolidate.cpp 69
-rw-r--r-- phrase-extract/extract-rules.cpp 9
-rw-r--r-- phrase-extract/extract.cpp 23
-rw-r--r-- phrase-extract/score.cpp 128
7 files changed, 218 insertions, 35 deletions
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index d834674b8..50f03a739 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -2,6 +2,7 @@ obj InputFileStream.o : InputFileStream.cpp : <include>. ;
alias InputFileStream : InputFileStream.o ..//z ;
obj tables-core.o : tables-core.cpp : <include>. ;
+obj domain.o : domain.cpp : <include>. ;
obj AlignmentPhrase.o : AlignmentPhrase.cpp : <include>. ;
obj SentenceAlignment.o : SentenceAlignment.cpp : <include>. ;
obj SyntaxTree.o : SyntaxTree.cpp : <include>. ;
@@ -16,7 +17,7 @@ exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o Sen
exe extract-lex : extract-lex.cpp InputFileStream ;
-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe score : tables-core.o domain.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
@@ -24,7 +25,7 @@ exe consolidate-direct : consolidate-direct.cpp OutputFileStream.cpp InputFileSt
exe consolidate-reverse : consolidate-reverse.cpp tables-core.o InputFileStream ;
-exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp ;
+exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp InputFileStream ;
exe statistics : tables-core.o AlignmentPhrase.o statistics.cpp InputFileStream ;
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
index e432294b9..bdfead082 100644
--- a/phrase-extract/PhraseAlignment.cpp
+++ b/phrase-extract/PhraseAlignment.cpp
@@ -79,12 +79,11 @@ inline void Tokenize( std::vector<T> &output
}
// read in a phrase pair and store it
-void PhraseAlignment::create( char line[], int lineID )
+void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
{
assert(phraseS.empty());
assert(phraseT.empty());
- //cerr << "processing " << line;
vector< string > token = tokenize( line );
int item = 1;
for (size_t j=0; j<token.size(); j++) {
@@ -111,12 +110,13 @@ void PhraseAlignment::create( char line[], int lineID )
alignedToT[t].insert( s );
alignedToS[s].insert( t );
}
- } else if (item == 4) { // count
+ } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
+ sscanf(token[j].c_str(), "%d", &sentenceId);
+ } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
sscanf(token[j].c_str(), "%f", &count);
- }
- else if (item == 5) { // non-term lengths
+ } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // non-term lengths
addNTLength(token[j]);
- } else if (item == 6) { // target syntax PCFG score
+ } else if (item + (includeSentenceIdFlag?-1:0) == 6) { // target syntax PCFG score
float pcfgScore = std::atof(token[j].c_str());
pcfgSum = pcfgScore * count;
}
@@ -124,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
createAlignVec(phraseS.size(), phraseT.size());
- if (item == 3) {
+ if (item + (includeSentenceIdFlag?-1:0) == 3) {
count = 1.0;
}
if (item < 3 || item > 6) {
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
index 9763b7a52..ce8562a5c 100644
--- a/phrase-extract/PhraseAlignment.h
+++ b/phrase-extract/PhraseAlignment.h
@@ -30,10 +30,13 @@ protected:
public:
float pcfgSum;
float count;
+ size_t sentenceId;
+ std::string domain;
+
std::vector< std::set<size_t> > alignedToT;
std::vector< std::set<size_t> > alignedToS;
- void create( char*, int );
+ void create( char*, int, bool );
void clear();
bool equals( const PhraseAlignment& );
bool match( const PhraseAlignment& );
diff --git a/phrase-extract/consolidate.cpp b/phrase-extract/consolidate.cpp
index 1b724bd28..82807fd42 100644
--- a/phrase-extract/consolidate.cpp
+++ b/phrase-extract/consolidate.cpp
@@ -47,9 +47,11 @@ inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
char line[LINE_MAX_LENGTH];
void processFiles( char*, char*, char*, char* );
void loadCountOfCounts( char* );
+void breakdownCoreAndSparse( string combined, string &core, string &sparse );
bool getLine( istream &fileP, vector< string > &item );
vector< string > splitLine();
vector< int > countBin;
+bool sparseCountBinFeatureFlag = false;
int main(int argc, char* argv[])
{
@@ -94,8 +96,11 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--LowCountFeature") == 0) {
lowCountFlag = true;
cerr << "including the low count feature\n";
- } else if (strcmp(argv[i],"--CountBinFeature") == 0) {
- cerr << "include count bin feature:";
+ } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
+ strcmp(argv[i],"--SparseCountBinFeature") == 0) {
+ if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
+ sparseCountBinFeatureFlag = true;
+ cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
int prev = 0;
while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
int binCount = atoi(argv[++i]);
@@ -223,10 +228,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
}
// output hierarchical phrase pair (with separated labels)
- fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
+ fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";
// SCORES ...
- fileConsolidated << " |||";
+ string directScores, directSparseScores, indirectScores, indirectSparseScores;
+ breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores );
+ breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores );
+
vector<string> directCounts = tokenize(itemDirect[4].c_str());
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
float countF = atof(directCounts[0].c_str());
@@ -264,12 +272,12 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// prob indirect
if (!onlyDirectFlag) {
fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
- fileConsolidated << " " << itemIndirect[2];
+ fileConsolidated << " " << directScores;
}
// prob direct
fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
- fileConsolidated << " " << itemDirect[2];
+ fileConsolidated << " " << indirectScores;
// phrase count feature
if (phraseCountFlag) {
@@ -281,8 +289,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
}
- // count bin feature
- if (countBin.size()>0) {
+ // count bin feature (as a core feature)
+ if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
bool foundBin = false;
for(size_t i=0; i < countBin.size(); i++) {
if (!foundBin && countEF <= countBin[i]) {
@@ -307,6 +315,35 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " ||| " << itemDirect[5];
}
+ // count bin feature (as a sparse feature)
+ if (sparseCountBinFeatureFlag ||
+ directSparseScores.compare("") != 0 ||
+ indirectSparseScores.compare("") != 0)
+ {
+ fileConsolidated << " |||";
+ if (directSparseScores.compare("") != 0)
+ fileConsolidated << " " << directSparseScores;
+ if (indirectSparseScores.compare("") != 0)
+ fileConsolidated << " " << indirectSparseScores;
+ if (sparseCountBinFeatureFlag) {
+ bool foundBin = false;
+ for(size_t i=0; i < countBin.size(); i++) {
+ if (!foundBin && countEF <= countBin[i]) {
+ fileConsolidated << " cb_";
+ if (i == 0 && countBin[i] > 1)
+ fileConsolidated << "1_";
+ else if (i > 0 && countBin[i-1]+1 < countBin[i])
+ fileConsolidated << (countBin[i-1]+1) << "_";
+ fileConsolidated << countBin[i] << " 1";
+ foundBin = true;
+ }
+ }
+ if (!foundBin) {
+ fileConsolidated << " cb_max 1";
+ }
+ }
+ }
+
fileConsolidated << endl;
}
fileDirect.Close();
@@ -314,6 +351,22 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated.Close();
}
+// Returns true if the token is a plain numeric score (e.g. "0.5", "-2.3",
+// "-inf") rather than the name of a sparse feature. Tokens starting with a
+// digit are always accepted; anything else must be consumed entirely by
+// strtod, so that negative values are not mistaken for feature names.
+static bool isCoreScoreToken( const string &token )
+{
+  if (token.empty()) return false;
+  if (token[0] >= '0' && token[0] <= '9') return true;
+  const char *begin = token.c_str();
+  char *end = NULL;
+  strtod( begin, &end );
+  return end != begin && *end == '\0';
+}
+
+// Split a whitespace-separated score string into its core scores and its
+// sparse features.
+//   combined: the score field as read from a phrase table line
+//   core:     (out) the numeric tokens, joined by single spaces
+//   sparse:   (out) the "name value" pairs, joined by single spaces
+// A non-numeric token is taken as a sparse feature name and the token that
+// follows it as that feature's value. The last token is always appended to
+// core, so a name is never read without a following value.
+void breakdownCoreAndSparse( string combined, string &core, string &sparse )
+{
+  core = "";
+  sparse = "";
+  vector<string> score = tokenize( combined.c_str() );
+  for(size_t i=0; i<score.size(); i++) {
+    const bool isLast = (i+1 == score.size());
+    if (isLast || isCoreScoreToken( score[i] ))
+      core += " " + score[i];
+    else {
+      sparse += " " + score[i];
+      sparse += " " + score[++i]; // safe: a non-last token has a successor
+    }
+  }
+  // drop the leading separator added by the loop
+  if (core.size() > 0 ) core = core.substr(1);
+  if (sparse.size() > 0 ) sparse = sparse.substr(1);
+}
bool getLine( istream &fileP, vector< string > &item )
{
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index c333040f6..0abf548c3 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -120,6 +120,7 @@ int main(int argc, char* argv[])
<< "rule extraction from an aligned parallel corpus\n";
RuleExtractionOptions options;
+ int sentenceOffset = 0;
#ifdef WITH_THREADS
int thread_count = 1;
#endif
@@ -274,6 +275,12 @@ int main(int argc, char* argv[])
strcmp(argv[i],"--Threads") == 0) {
thread_count = atoi(argv[++i]);
#endif
+ } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+ if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+ cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+ exit(1);
+ }
+ sentenceOffset = atoi(argv[++i]);
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -314,7 +321,7 @@ int main(int argc, char* argv[])
#endif
// loop through all sentence pairs
- size_t i=0;
+ size_t i=sentenceOffset;
while(true) {
i++;
if (i%1000 == 0) cerr << "." << flush;
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 6583fd077..89c45a2e6 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -95,6 +95,8 @@ int maxPhraseLength;
bool orientationFlag = false;
bool translationFlag = true;
bool sentenceIdFlag = false; //create extract file with sentence id
+int sentenceOffset = 0;
+bool includeSentenceIdFlag = false; //include sentence id in extract file
bool onlyOutputSpanInfo = false;
bool gzOutput = false;
@@ -106,7 +108,7 @@ int main(int argc, char* argv[])
<< "phrase extraction from an aligned parallel corpus\n";
if (argc < 6) {
- cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+ cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --IncludeSentenceId | --SentenceOffset n ]\n";
exit(1);
}
char* &fileNameE = argv[1];
@@ -124,6 +126,14 @@ int main(int argc, char* argv[])
translationFlag = false;
} else if (strcmp(argv[i], "--SentenceId") == 0) {
sentenceIdFlag = true;
+ } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
+ includeSentenceIdFlag = true;
+ } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+ if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+ cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+ exit(1);
+ }
+ sentenceOffset = atoi(argv[++i]);
} else if (strcmp(argv[i], "--GZOutput") == 0) {
gzOutput = true;
} else if(strcmp(argv[i],"--model") == 0) {
@@ -135,7 +145,7 @@ int main(int argc, char* argv[])
char* modelName = strtok(modelParams, "-");
char* modelType = strtok(NULL, "-");
- REO_MODEL_TYPE intModelType;
+ // REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "wbe") == 0) {
wordModel = true;
@@ -217,7 +227,7 @@ int main(int argc, char* argv[])
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
}
- int i=0;
+ int i = sentenceOffset;
while(true) {
i++;
if (i%10000 == 0) cerr << "." << flush;
@@ -669,9 +679,11 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
if (orientationFlag)
extractFileOrientation << orientationInfo;
- if (sentenceIdFlag) {
+ if (sentenceIdFlag)
extractFileSentenceId << sentence.sentenceID;
- }
+
+ if (includeSentenceIdFlag)
+ extractFile << " ||| " << sentence.sentenceID;
if (translationFlag) extractFile << "\n";
if (translationFlag) extractFileInv << "\n";
@@ -708,4 +720,3 @@ void extractBase( SentenceAlignment &sentence )
}
}
-
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index bfc95de09..f02b6b3b0 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -29,6 +29,7 @@
#include "SafeGetline.h"
#include "tables-core.h"
+#include "domain.h"
#include "PhraseAlignment.h"
#include "score.h"
#include "InputFileStream.h"
@@ -53,6 +54,7 @@ bool kneserNeyFlag = false;
#define COC_MAX 10
bool logProbFlag = false;
int negLogProb = 1;
+// Returns a unchanged unless logProbFlag is set; in that case returns
+// log(a) multiplied by negLogProb.
+inline float maybeLogProb( float a )
+{
+  if (!logProbFlag) return a;
+  return negLogProb*log(a);
+}
bool lexFlag = true;
bool unalignedFlag = false;
bool unalignedFWFlag = false;
@@ -60,6 +62,12 @@ bool outputNTLengths = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
+bool domainFlag = false;
+bool domainRatioFlag = false;
+bool domainSubsetFlag = false;
+bool domainSparseFlag = false;
+Domain *domain;
+bool includeSentenceIdFlag = false;
Vocabulary vcbT;
Vocabulary vcbS;
@@ -89,14 +97,15 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS]\n";
+ cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]\n";
exit(1);
}
char* fileNameExtract = argv[1];
char* fileNameLex = argv[2];
char* fileNamePhraseTable = argv[3];
string fileNameCountOfCounts;
- char* fileNameFunctionWords;
+ char* fileNameFunctionWords = NULL;
+ char* fileNameDomain = NULL;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -139,6 +148,22 @@ int main(int argc, char* argv[])
}
fileNameFunctionWords = argv[++i];
cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+ } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 ||
+ strcmp(argv[i],"--SparseDomainRatio") == 0 ||
+ strcmp(argv[i],"--SparseDomainSubset") == 0 ||
+ strcmp(argv[i],"--DomainIndicator") == 0 ||
+ strcmp(argv[i],"--DomainRatio") == 0 ||
+ strcmp(argv[i],"--DomainSubset") == 0) {
+ includeSentenceIdFlag = true;
+ domainFlag = true;
+ domainSparseFlag = strstr( argv[i], "Sparse" );
+ domainRatioFlag = strstr( argv[i], "Ratio" );
+ domainSubsetFlag = strstr( argv[i], "Subset" );
+ if (i+1==argc) {
+ cerr << "ERROR: specify domain info file with " << argv[i] << endl;
+ exit(1);
+ }
+ fileNameDomain = argv[++i];
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
@@ -166,6 +191,18 @@ int main(int argc, char* argv[])
if (unalignedFWFlag)
loadFunctionWords( fileNameFunctionWords );
+ // load domain information
+ if (domainFlag) {
+ if (inverseFlag) {
+ domainFlag = false;
+ includeSentenceIdFlag = false;
+ }
+ else {
+ domain = new Domain;
+ domain->load( fileNameDomain );
+ }
+ }
+
// compute count of counts for Good Turing discounting
if (goodTuringFlag || kneserNeyFlag) {
for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
@@ -221,7 +258,7 @@ int main(int argc, char* argv[])
// create new phrase pair
PhraseAlignment phrasePair;
- phrasePair.create( line, i );
+ phrasePair.create( line, i, includeSentenceIdFlag );
lastCount = phrasePair.count;
lastPcfgSum = phrasePair.pcfgSum;
@@ -320,7 +357,6 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
{
const PhraseAlignmentCollection &group = **iter;
outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
-
}
}
@@ -328,7 +364,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
float bestAlignmentCount = -1;
- PhraseAlignment* bestAlignment;
+ PhraseAlignment* bestAlignment = NULL;
for(size_t i=0; i<phrasePair.size(); i++) {
if (phrasePair[i]->count > bestAlignmentCount) {
@@ -440,6 +476,18 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
count += phrasePair[i]->count;
}
+ // compute domain counts
+ map< string, float > domainCount;
+ if (domainFlag) {
+ for(size_t i=0; i<phrasePair.size(); i++) {
+ string d = domain->getDomainOfSentence( phrasePair[i]->sentenceId );
+ if (domainCount.find( d ) == domainCount.end())
+ domainCount[ d ] = phrasePair[i]->count;
+ else
+ domainCount[ d ] += phrasePair[i]->count;
+ }
+ }
+
// collect count of count statistics
if (goodTuringFlag || kneserNeyFlag) {
totalDistinct++;
@@ -449,7 +497,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
// compute PCFG score
- float pcfgScore;
+ float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
float pcfgSum = 0;
for(size_t i=0; i<phrasePair.size(); ++i) {
@@ -489,24 +537,84 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
// lexical translation probability
if (lexFlag) {
double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
- phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
+ phraseTableFile << maybeLogProb( lexScore );
}
// unaligned word penalty
if (unalignedFlag) {
double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
- phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+ phraseTableFile << " " << maybeLogProb( penalty );
}
// unaligned function word penalty
if (unalignedFWFlag) {
double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
- phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+ phraseTableFile << " " << maybeLogProb( penalty );
}
// target-side PCFG score
if (pcfgFlag && !inverseFlag) {
- phraseTableFile << " " << pcfgScore;
+ phraseTableFile << " " << maybeLogProb( pcfgScore );
+ }
+
+ // domain count features
+ if (domainFlag) {
+ if (domainSparseFlag) {
+ // sparse, subset
+ if (domainSubsetFlag) {
+ typedef vector< string >::const_iterator I;
+ phraseTableFile << " doms";
+ for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+ if (domainCount.find( *i ) != domainCount.end() ) {
+ phraseTableFile << "_" << *i;
+ }
+ }
+ phraseTableFile << " 1";
+ }
+ // sparse, indicator or ratio
+ else {
+ typedef map< string, float >::const_iterator I;
+ for (I i=domainCount.begin(); i != domainCount.end(); i++) {
+ if (domainRatioFlag) {
+ phraseTableFile << " domr_" << i->first << " " << (i->second / count);
+ }
+ else {
+ phraseTableFile << " dom_" << i->first << " 1";
+ }
+ }
+ }
+ }
+ // core, subset
+ else if (domainSubsetFlag) {
+ if (domain->list.size() > 6) {
+ cerr << "ERROR: too many domains for core domain subset features\n";
+ exit(1);
+ }
+ size_t bitmap = 0;
+ for(size_t bit = 0; bit < domain->list.size(); bit++) {
+ if (domainCount.find( domain->list[ bit ] ) != domainCount.end()) {
+ bitmap += 1 << bit;
+ }
+ }
+ for(size_t i = 1; i < (1 << domain->list.size()); i++) {
+ phraseTableFile << " " << maybeLogProb( (bitmap == i) ? 2.718 : 1 );
+ }
+ }
+ // core, indicator or ratio
+ else {
+ typedef vector< string >::const_iterator I;
+ for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+ if (domainCount.find( *i ) == domainCount.end() ) {
+ phraseTableFile << " " << maybeLogProb( 1 );
+ }
+ else if (domainRatioFlag) {
+ phraseTableFile << " " << maybeLogProb( exp( domainCount[ *i ] / count ) );
+ }
+ else {
+ phraseTableFile << " " << maybeLogProb( 2.718 );
+ }
+ }
+ }
}
phraseTableFile << " ||| ";