Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBarry Haddow <barry.haddow@gmail.com>2012-09-27 01:49:33 +0400
committerBarry Haddow <barry.haddow@gmail.com>2012-09-27 01:49:33 +0400
commit0a950ee9f4227c8afbbe58d03a854745479ffbc0 (patch)
tree3e4515adc6b3323f8742ff5addde2f29da2002c8 /phrase-extract
parent1ce788e2b83dc9b359f6132e7e82774f9d0777b1 (diff)
parentab60d1ad6f93a78e80e665bc6c7d32b61b7c1c52 (diff)
Merge remote branch 'github/master' into miramerge
Compiles, but not tested. Had to disable relent filter. Strangely, it seems to contain the whole of moses-cmd. Conflicts: Jamroot OnDiskPt/TargetPhrase.cpp moses-cmd/src/Main.cpp moses/src/AlignmentInfo.cpp moses/src/AlignmentInfo.h moses/src/ChartTranslationOptionCollection.cpp moses/src/ChartTranslationOptionCollection.h moses/src/GenerationDictionary.cpp moses/src/Jamfile moses/src/Parameter.cpp moses/src/PhraseDictionary.cpp moses/src/StaticData.cpp moses/src/StaticData.h moses/src/TargetPhrase.h moses/src/TranslationSystem.cpp moses/src/TranslationSystem.h moses/src/Word.cpp phrase-extract/score.cpp regression-testing/Jamfile scripts/ems/experiment.meta scripts/ems/experiment.perl scripts/training/train-model.perl
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/Jamfile9
-rw-r--r--phrase-extract/PhraseAlignment.cpp14
-rw-r--r--phrase-extract/PhraseAlignment.h5
-rw-r--r--phrase-extract/PhraseExtractionOptions.h152
-rw-r--r--phrase-extract/RuleExtractionOptions.h2
-rw-r--r--phrase-extract/SentenceAlignment.cpp38
-rw-r--r--phrase-extract/SentenceAlignment.h7
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.cpp8
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.h4
-rw-r--r--phrase-extract/consolidate.cpp69
-rw-r--r--phrase-extract/domain.cpp52
-rw-r--r--phrase-extract/domain.h32
-rw-r--r--phrase-extract/extract-rules.cpp269
-rw-r--r--phrase-extract/extract.cpp408
-rw-r--r--phrase-extract/score.cpp331
-rw-r--r--phrase-extract/score.h2
-rw-r--r--phrase-extract/statistics.cpp10
17 files changed, 1010 insertions, 402 deletions
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index d834674b8..e4f801089 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -2,6 +2,7 @@ obj InputFileStream.o : InputFileStream.cpp : <include>. ;
alias InputFileStream : InputFileStream.o ..//z ;
obj tables-core.o : tables-core.cpp : <include>. ;
+obj domain.o : domain.cpp : <include>. ;
obj AlignmentPhrase.o : AlignmentPhrase.cpp : <include>. ;
obj SentenceAlignment.o : SentenceAlignment.cpp : <include>. ;
obj SyntaxTree.o : SyntaxTree.cpp : <include>. ;
@@ -10,13 +11,13 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
-exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
+exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
exe extract-lex : extract-lex.cpp InputFileStream ;
-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe score : tables-core.o domain.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
@@ -24,7 +25,7 @@ exe consolidate-direct : consolidate-direct.cpp OutputFileStream.cpp InputFileSt
exe consolidate-reverse : consolidate-reverse.cpp tables-core.o InputFileStream ;
-exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp ;
+exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp InputFileStream ;
exe statistics : tables-core.o AlignmentPhrase.o statistics.cpp InputFileStream ;
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
index e432294b9..bdfead082 100644
--- a/phrase-extract/PhraseAlignment.cpp
+++ b/phrase-extract/PhraseAlignment.cpp
@@ -79,12 +79,11 @@ inline void Tokenize( std::vector<T> &output
}
// read in a phrase pair and store it
-void PhraseAlignment::create( char line[], int lineID )
+void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
{
assert(phraseS.empty());
assert(phraseT.empty());
- //cerr << "processing " << line;
vector< string > token = tokenize( line );
int item = 1;
for (size_t j=0; j<token.size(); j++) {
@@ -111,12 +110,13 @@ void PhraseAlignment::create( char line[], int lineID )
alignedToT[t].insert( s );
alignedToS[s].insert( t );
}
- } else if (item == 4) { // count
+ } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
+ sscanf(token[j].c_str(), "%d", &sentenceId);
+ } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
sscanf(token[j].c_str(), "%f", &count);
- }
- else if (item == 5) { // non-term lengths
+ } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // non-term lengths
addNTLength(token[j]);
- } else if (item == 6) { // target syntax PCFG score
+ } else if (item + (includeSentenceIdFlag?-1:0) == 6) { // target syntax PCFG score
float pcfgScore = std::atof(token[j].c_str());
pcfgSum = pcfgScore * count;
}
@@ -124,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
createAlignVec(phraseS.size(), phraseT.size());
- if (item == 3) {
+ if (item + (includeSentenceIdFlag?-1:0) == 3) {
count = 1.0;
}
if (item < 3 || item > 6) {
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
index 9763b7a52..35afb314b 100644
--- a/phrase-extract/PhraseAlignment.h
+++ b/phrase-extract/PhraseAlignment.h
@@ -30,10 +30,13 @@ protected:
public:
float pcfgSum;
float count;
+ int sentenceId;
+ std::string domain;
+
std::vector< std::set<size_t> > alignedToT;
std::vector< std::set<size_t> > alignedToS;
- void create( char*, int );
+ void create( char*, int, bool );
void clear();
bool equals( const PhraseAlignment& );
bool match( const PhraseAlignment& );
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
new file mode 100644
index 000000000..eeec39750
--- /dev/null
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -0,0 +1,152 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
+
+#pragma once
+#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+
+namespace MosesTraining
+{
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
+
+
+class PhraseExtractionOptions { // bundle of option values for phrase extraction; written via init*(), read via is*()
+
+ public:
+ const int maxPhraseLength; // set once by the constructor, cannot be changed afterwards
+ private:
+ bool allModelsOutputFlag;
+ bool wordModel;
+ REO_MODEL_TYPE wordType;
+ bool phraseModel;
+ REO_MODEL_TYPE phraseType;
+ bool hierModel;
+ REO_MODEL_TYPE hierType;
+ bool orientationFlag;
+ bool translationFlag;
+ bool sentenceIdFlag; //create extract file with sentence id
+ bool includeSentenceIdFlag; //include sentence id in extract file
+ bool onlyOutputSpanInfo;
+ bool gzOutput;
+
+public:
+ PhraseExtractionOptions(const int initmaxPhraseLength): // defaults: every flag false except translationFlag (true); every model type REO_MSD
+ maxPhraseLength(initmaxPhraseLength),
+ allModelsOutputFlag(false),
+ wordModel(false),
+ wordType(REO_MSD),
+ phraseModel(false),
+ phraseType(REO_MSD),
+ hierModel(false),
+ hierType(REO_MSD),
+ orientationFlag(false),
+ translationFlag(true),
+ sentenceIdFlag(false),
+ includeSentenceIdFlag(false),
+ onlyOutputSpanInfo(false),
+ gzOutput(false){}
+
+ // setters: each init* function overwrites the member of the same name with the given value
+ void initAllModelsOutputFlag(const bool initallModelsOutputFlag){
+ allModelsOutputFlag=initallModelsOutputFlag;
+ }
+ void initWordModel(const bool initwordModel){
+ wordModel=initwordModel;
+ }
+ void initWordType(REO_MODEL_TYPE initwordType ){
+ wordType=initwordType;
+ }
+ void initPhraseModel(const bool initphraseModel ){
+ phraseModel=initphraseModel;
+ }
+ void initPhraseType(REO_MODEL_TYPE initphraseType){
+ phraseType=initphraseType;
+ }
+ void initHierModel(const bool inithierModel){
+ hierModel=inithierModel;
+ }
+ void initHierType(REO_MODEL_TYPE inithierType){
+ hierType=inithierType;
+ }
+ void initOrientationFlag(const bool initorientationFlag){
+ orientationFlag=initorientationFlag;
+ }
+ void initTranslationFlag(const bool inittranslationFlag){
+ translationFlag=inittranslationFlag;
+ }
+ void initSentenceIdFlag(const bool initsentenceIdFlag){
+ sentenceIdFlag=initsentenceIdFlag;
+ }
+ void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag){
+ includeSentenceIdFlag=initincludeSentenceIdFlag;
+ }
+ void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){
+ onlyOutputSpanInfo= initonlyOutputSpanInfo;
+ }
+ void initGzOutput (const bool initgzOutput){
+ gzOutput= initgzOutput;
+ }
+ // getters: each is* function returns the current value of the member of the same name (the is*Type ones return a REO_MODEL_TYPE, not a bool)
+ bool isAllModelsOutputFlag() const {
+ return allModelsOutputFlag;
+ }
+ bool isWordModel() const {
+ return wordModel;
+ }
+ REO_MODEL_TYPE isWordType() const {
+ return wordType;
+ }
+ bool isPhraseModel() const {
+ return phraseModel;
+ }
+ REO_MODEL_TYPE isPhraseType() const {
+ return phraseType;
+ }
+ bool isHierModel() const {
+ return hierModel;
+ }
+ REO_MODEL_TYPE isHierType() const {
+ return hierType;
+ }
+ bool isOrientationFlag() const {
+ return orientationFlag;
+ }
+ bool isTranslationFlag() const {
+ return translationFlag;
+ }
+ bool isSentenceIdFlag() const {
+ return sentenceIdFlag;
+ }
+ bool isIncludeSentenceIdFlag() const {
+ return includeSentenceIdFlag;
+ }
+ bool isOnlyOutputSpanInfo() const {
+ return onlyOutputSpanInfo;
+ }
+ bool isGzOutput () const {
+ return gzOutput;
+ }
+};
+
+}
+
+#endif
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index bb2d97580..431be58b0 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -53,6 +53,7 @@ public:
bool gzOutput;
bool unpairedExtractFormat;
bool conditionOnTargetLhs;
+ bool boundaryRules;
RuleExtractionOptions()
: maxSpan(10)
@@ -85,6 +86,7 @@ public:
, gzOutput(false)
, unpairedExtractFormat(false)
, conditionOnTargetLhs(false)
+ , boundaryRules(false)
{}
};
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 8e44bddc4..af1cfa953 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -25,33 +25,45 @@
#include "tables-core.h"
+using namespace std;
+
namespace MosesTraining
{
SentenceAlignment::~SentenceAlignment() {}
-bool SentenceAlignment::processTargetSentence(const char * targetString, int)
+void addBoundaryWords(vector<string> &phrase) // wraps the token sequence in place: "<s>" is inserted at the front, "</s>" appended at the back
+{
+ phrase.insert(phrase.begin(), "<s>");
+ phrase.push_back("</s>");
+}
+
+bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
target = tokenize(targetString);
+ if (boundaryRules)
+ addBoundaryWords(target);
return true;
}
-bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
source = tokenize(sourceString);
+ if (boundaryRules)
+ addBoundaryWords(source);
return true;
}
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;
// process sentence strings and store in target and source members.
- if (!processTargetSentence(targetString, sentenceID)) {
+ if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
return false;
}
- if (!processSourceSentence(sourceString, sentenceID)) {
+ if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
return false;
}
@@ -81,6 +93,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
+
+ if (boundaryRules) {
+ ++s;
+ ++t;
+ }
+
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
@@ -90,6 +108,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
alignedToT[t].push_back( s );
alignedCountS[s]++;
}
+
+ if (boundaryRules) {
+ alignedToT[0].push_back(0);
+ alignedCountS[0]++;
+
+ alignedToT.back().push_back(alignedCountS.size() - 1);
+ alignedCountS.back()++;
+
+ }
+
return true;
}
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index b1fb5933a..7c2988780 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -38,12 +38,13 @@ public:
virtual ~SentenceAlignment();
- virtual bool processTargetSentence(const char *, int);
+ virtual bool processTargetSentence(const char *, int, bool boundaryRules);
- virtual bool processSourceSentence(const char *, int);
+ virtual bool processSourceSentence(const char *, int, bool boundaryRules);
bool create(char targetString[], char sourceString[],
- char alignmentString[], int sentenceID);
+ char alignmentString[], int sentenceID, bool boundaryRules);
+
};
}
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 83a048757..5d866edfb 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -32,10 +32,10 @@ using namespace std;
namespace MosesTraining
{
-bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
if (!m_options.targetSyntax) {
- return SentenceAlignment::processTargetSentence(targetString, sentenceID);
+ return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
}
string targetStringCPP(targetString);
@@ -52,10 +52,10 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
return true;
}
-bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
if (!m_options.sourceSyntax) {
- return SentenceAlignment::processSourceSentence(sourceString, sentenceID);
+ return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
}
string sourceStringCPP(sourceString);
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 38fa77907..28eef57b7 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -59,10 +59,10 @@ public:
virtual ~SentenceAlignmentWithSyntax() {}
bool
- processTargetSentence(const char *, int);
+ processTargetSentence(const char *, int, bool boundaryRules);
bool
- processSourceSentence(const char *, int);
+ processSourceSentence(const char *, int, bool boundaryRules);
};
}
diff --git a/phrase-extract/consolidate.cpp b/phrase-extract/consolidate.cpp
index 60285e6e7..43b3f32a1 100644
--- a/phrase-extract/consolidate.cpp
+++ b/phrase-extract/consolidate.cpp
@@ -47,9 +47,11 @@ inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
char line[LINE_MAX_LENGTH];
void processFiles( char*, char*, char*, char* );
void loadCountOfCounts( char* );
+void breakdownCoreAndSparse( string combined, string &core, string &sparse );
bool getLine( istream &fileP, vector< string > &item );
vector< string > splitLine();
vector< int > countBin;
+bool sparseCountBinFeatureFlag = false;
int main(int argc, char* argv[])
{
@@ -94,8 +96,11 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--LowCountFeature") == 0) {
lowCountFlag = true;
cerr << "including the low count feature\n";
- } else if (strcmp(argv[i],"--CountBinFeature") == 0) {
- cerr << "include count bin feature:";
+ } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
+ strcmp(argv[i],"--SparseCountBinFeature") == 0) {
+ if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
+ sparseCountBinFeatureFlag = true;
+ cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
int prev = 0;
while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
int binCount = atoi(argv[++i]);
@@ -223,10 +228,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
}
// output hierarchical phrase pair (with separated labels)
- fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
+ fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";
// SCORES ...
- fileConsolidated << " |||";
+ string directScores, directSparseScores, indirectScores, indirectSparseScores;
+ breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores );
+ breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores );
+
vector<string> directCounts = tokenize(itemDirect[4].c_str());
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
float countF = atof(directCounts[0].c_str());
@@ -264,12 +272,12 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// prob indirect
if (!onlyDirectFlag) {
fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
- fileConsolidated << " " << itemIndirect[2];
+ fileConsolidated << " " << directScores;
}
// prob direct
fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
- fileConsolidated << " " << itemDirect[2];
+ fileConsolidated << " " << indirectScores;
// phrase count feature
if (phraseCountFlag) {
@@ -281,8 +289,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
}
- // count bin feature
- if (countBin.size()>0) {
+ // count bin feature (as a core feature)
+ if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
bool foundBin = false;
for(size_t i=0; i < countBin.size(); i++) {
if (!foundBin && countEF <= countBin[i]) {
@@ -307,6 +315,35 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " ||| " << itemDirect[5];
}
+ // count bin feature (as a sparse feature)
+ if (sparseCountBinFeatureFlag ||
+ directSparseScores.compare("") != 0 ||
+ indirectSparseScores.compare("") != 0)
+ {
+ fileConsolidated << " |||";
+ if (directSparseScores.compare("") != 0)
+ fileConsolidated << " " << directSparseScores;
+ if (indirectSparseScores.compare("") != 0)
+ fileConsolidated << " " << indirectSparseScores;
+ if (sparseCountBinFeatureFlag) {
+ bool foundBin = false;
+ for(size_t i=0; i < countBin.size(); i++) {
+ if (!foundBin && countEF <= countBin[i]) {
+ fileConsolidated << " cb_";
+ if (i == 0 && countBin[i] > 1)
+ fileConsolidated << "1_";
+ else if (i > 0 && countBin[i-1]+1 < countBin[i])
+ fileConsolidated << (countBin[i-1]+1) << "_";
+ fileConsolidated << countBin[i] << " 1";
+ foundBin = true;
+ }
+ }
+ if (!foundBin) {
+ fileConsolidated << " cb_max 1";
+ }
+ }
+ }
+
fileConsolidated << endl;
}
fileDirect.Close();
@@ -314,6 +351,22 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated.Close();
}
+void breakdownCoreAndSparse( string combined, string &core, string &sparse ) // splits the tokens of 'combined' into space-joined 'core' values and 'sparse' name/value pairs; both outputs are overwritten
+{
+ core = "";
+ sparse = "";
+ vector<string> score = tokenize( combined.c_str() );
+ for(size_t i=0; i<score.size(); i++) {
+ if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size()) // core: first character is a digit, or this is the last token so no value could follow it
+ core += " " + score[i]; // NOTE(review): a score starting with '-' or '.' would not be recognised as core here - confirm inputs never have that form
+ else { // anything else is taken as a sparse feature: a name token followed by its value token
+ sparse += " " + score[i];
+ sparse += " " + score[++i]; // consumes the value token; the i+1 == size() test above guarantees it exists
+ }
+ }
+ if (core.size() > 0 ) core = core.substr(1); // drop the leading separator space
+ if (sparse.size() > 0 ) sparse = sparse.substr(1); // drop the leading separator space
+}
bool getLine( istream &fileP, vector< string > &item )
{
diff --git a/phrase-extract/domain.cpp b/phrase-extract/domain.cpp
new file mode 100644
index 000000000..aacb7160d
--- /dev/null
+++ b/phrase-extract/domain.cpp
@@ -0,0 +1,52 @@
+// $Id$
+//#include "beammain.h"
+#include "domain.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "SafeGetline.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+
+using namespace std;
+
+namespace MosesTraining
+{
+
+// handling of domain names: read the domain file line by line; each line must hold exactly two tokens, an integer and a domain name, which are appended to spec; names not seen before are also registered in list / name2id
+void Domain::load( const std::string &domainFileName ) {
+ Moses::InputFileStream fileS( domainFileName );
+ istream *fileP = &fileS;
+ while(true) {
+ char line[TABLE_LINE_MAX_LENGTH];
+ SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
+ if (fileP->eof()) break; // stop as soon as the stream reports end-of-file after the read attempt
+ // read: split the line into tokens and parse the first one as an integer
+ vector< string > domainSpecLine = tokenize( line );
+ int lineNumber;
+ if (domainSpecLine.size() != 2 ||
+ ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
+ cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+ exit(1); // a malformed line terminates the whole program
+ }
+ // store: keep the (number, name) pair; a first-time name gets the next free index as its id
+ string &name = domainSpecLine[1];
+ spec.push_back( make_pair( lineNumber, name ));
+ if (name2id.find( name ) == name2id.end()) {
+ name2id[ name ] = list.size();
+ list.push_back( name );
+ }
+ }
+}
+
+// get domain name based on sentence number: returns the name of the first spec entry (in stored order) whose number is >= sentenceId, or "undefined" if there is none
+string Domain::getDomainOfSentence( int sentenceId ) {
+ for(size_t i=0; i<spec.size(); i++) {
+ if (sentenceId <= spec[i].first) { // NOTE(review): presumably each number is the last sentence id of its domain and entries are ascending - confirm against the domain file format
+ return spec[i].second;
+ }
+ }
+ return "undefined";
+}
+
+}
+
diff --git a/phrase-extract/domain.h b/phrase-extract/domain.h
new file mode 100644
index 000000000..cf675c17e
--- /dev/null
+++ b/phrase-extract/domain.h
@@ -0,0 +1,32 @@
+// $Id$
+
+#ifndef _DOMAIN_H
+#define _DOMAIN_H
+
+#include <iostream>
+#include <fstream>
+#include <assert.h>
+#include <stdlib.h>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+
+extern std::vector<std::string> tokenize( const char*);
+
+namespace MosesTraining
+{
+
+class Domain // holds (number, domain name) specifications plus an index of the distinct domain names
+{
+public:
+ std::vector< std::pair< int, std::string > > spec; // (integer, domain name) pairs, one per specification entry
+ std::vector< std::string > list; // domain names
+ std::map< std::string, int > name2id; // domain name -> integer id
+ void load( const std::string &fileName ); // fills the members from the named file
+ std::string getDomainOfSentence( int sentenceId ); // looks up the domain name for a sentence id
+};
+
+}
+
+#endif
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index c333040f6..5c308fd9b 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -46,8 +46,6 @@
#include "XmlTree.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#include "../moses/src/ThreadPool.h"
-#include "../moses/src/OutputCollector.h"
#define LINE_MAX_LENGTH 500000
@@ -57,55 +55,53 @@ using namespace MosesTraining;
typedef vector< int > LabelIndex;
typedef map< int, int > WordIndex;
-class ExtractTask : public Moses::Task {
+class ExtractTask
+{
private:
- size_t m_id;
- SentenceAlignmentWithSyntax *m_sentence;
- RuleExtractionOptions &m_options;
- Moses::OutputCollector* m_extractCollector;
- Moses::OutputCollector* m_extractCollectorInv;
+ SentenceAlignmentWithSyntax &m_sentence;
+ const RuleExtractionOptions &m_options;
+ Moses::OutputFileStream& m_extractFile;
+ Moses::OutputFileStream& m_extractFileInv;
+
+ vector< ExtractedRule > m_extractedRules;
+
+ // main functions
+ void extractRules();
+ void addRuleToCollection(ExtractedRule &rule);
+ void consolidateRules();
+ void writeRulesToFile();
+
+ // subs
+ void addRule( int, int, int, int, int, RuleExist &ruleExist);
+ void addHieroRule( int startT, int endT, int startS, int endS
+ , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
+ void printHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
+ string printTargetHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
+ string printSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, const LabelIndex &labelIndex);
+ void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+ void printHieroAlignment( int startT, int endT, int startS, int endS
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+ void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
+
+ inline string IntToString( int i )
+ {
+ stringstream out;
+ out << i;
+ return out.str();
+ }
public:
- ExtractTask(size_t id, SentenceAlignmentWithSyntax *sentence, RuleExtractionOptions &options, Moses::OutputCollector* extractCollector, Moses::OutputCollector* extractCollectorInv):
- m_id(id),
+ ExtractTask(SentenceAlignmentWithSyntax &sentence, const RuleExtractionOptions &options, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv):
m_sentence(sentence),
m_options(options),
- m_extractCollector(extractCollector),
- m_extractCollectorInv(extractCollectorInv) {}
- ~ExtractTask() { delete m_sentence; }
+ m_extractFile(extractFile),
+ m_extractFileInv(extractFileInv) {}
void Run();
-private:
-vector< ExtractedRule > m_extractedRules;
-
-// main functions
-void extractRules();
-void addRuleToCollection(ExtractedRule &rule);
-void consolidateRules();
-void writeRulesToFile();
-
-// subs
-void addRule( int, int, int, int, RuleExist &ruleExist);
-void addHieroRule( int startT, int endT, int startS, int endS
- , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
-void printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex);
-string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
-string printSourceHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, const LabelIndex &labelIndex);
-void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
-void printHieroAlignment( int startT, int endT, int startS, int endS
- , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
-void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
-
-inline string IntToString( int i )
-{
- stringstream out;
- out << i;
- return out.str();
-}
};
// stats for glue grammar and unknown word label probabilities
@@ -120,15 +116,14 @@ int main(int argc, char* argv[])
<< "rule extraction from an aligned parallel corpus\n";
RuleExtractionOptions options;
+ int sentenceOffset = 0;
#ifdef WITH_THREADS
int thread_count = 1;
#endif
if (argc < 5) {
cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
-#ifdef WITH_THREADS
- << " --threads NUM |"
-#endif
- << " --GlueGrammar FILE"
+
+ << " --GlueGrammar FILE"
<< " | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --OutputNTLengths"
@@ -143,7 +138,9 @@ int main(int argc, char* argv[])
<< " | --SourceSyntax | --TargetSyntax"
<< " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
<< " | --UnpairedExtractFormat"
- << " | --ConditionOnTargetLHS ]\n";
+ << " | --ConditionOnTargetLHS"
+ << " | --BoundaryRules[" << options.boundaryRules << "] ]\n"; // close the option list after the last option and end the usage line, as the removed line did
+
exit(1);
}
char* &fileNameT = argv[1];
@@ -268,12 +265,23 @@ int main(int argc, char* argv[])
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
options.conditionOnTargetLhs = true;
-#ifdef WITH_THREADS
} else if (strcmp(argv[i],"-threads") == 0 ||
strcmp(argv[i],"--threads") == 0 ||
strcmp(argv[i],"--Threads") == 0) {
+#ifdef WITH_THREADS
thread_count = atoi(argv[++i]);
+#else
+ cerr << "thread support not compiled in." << '\n';
+ exit(1);
#endif
+ } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+ if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+ cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+ exit(1);
+ }
+ sentenceOffset = atoi(argv[++i]);
+ } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
+ options.boundaryRules = true;
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -299,27 +307,17 @@ int main(int argc, char* argv[])
if (!options.onlyDirectFlag)
extractFileInv.Open(fileNameExtractInv.c_str());
- // output into file
- Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
- Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);
// stats on labels for glue grammar and unknown word label probabilities
set< string > targetLabelCollection, sourceLabelCollection;
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
-#ifdef WITH_THREADS
- // set up thread pool
- Moses::ThreadPool pool(thread_count);
- pool.SetQueueLimit(1000);
-#endif
-
// loop through all sentence pairs
- size_t i=0;
+ size_t i=sentenceOffset;
while(true) {
i++;
- if (i%1000 == 0) cerr << "." << flush;
- if (i%10000 == 0) cerr << ":" << flush;
- if (i%100000 == 0) cerr << "!" << flush;
+ if (i%1000 == 0) cerr << i << " " << flush;
+
char targetString[LINE_MAX_LENGTH];
char sourceString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
@@ -328,7 +326,7 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
- SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
+ SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection, options);
//az: output src, tgt, and alingment line
@@ -339,32 +337,17 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence->create(targetString, sourceString, alignmentString, i)) {
+ if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
if (options.unknownWordLabelFlag) {
- collectWordLabelCounts(*sentence);
- }
- ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
-#ifdef WITH_THREADS
- if (thread_count == 1) {
- task->Run();
- delete task;
+ collectWordLabelCounts(sentence);
}
- else {
- pool.Submit(task);
- }
-#else
+ ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv);
task->Run();
delete task;
-#endif
}
if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
-#ifdef WITH_THREADS
- // wait for all threads to finish
- pool.Stop(true);
-#endif
-
tFile.Close();
sFile.Close();
aFile.Close();
@@ -390,8 +373,8 @@ void ExtractTask::Run() {
void ExtractTask::extractRules()
{
- int countT = m_sentence->target.size();
- int countS = m_sentence->source.size();
+ int countT = m_sentence.target.size();
+ int countS = m_sentence.source.size();
// phrase repository for creating hiero phrases
RuleExist ruleExist(countT);
@@ -406,17 +389,17 @@ void ExtractTask::extractRules()
int endT = startT + lengthT - 1;
// if there is target side syntax, there has to be a node
- if (m_options.targetSyntax && !m_sentence->targetTree.HasNode(startT,endT))
+ if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
continue;
// find find aligned source words
// first: find minimum and maximum source word
int minS = 9999;
int maxS = -1;
- vector< int > usedS = m_sentence->alignedCountS;
+ vector< int > usedS = m_sentence.alignedCountS;
for(int ti=startT; ti<=endT; ti++) {
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
if (si<minS) {
minS = si;
}
@@ -451,22 +434,22 @@ void ExtractTask::extractRules()
for(int startS=minS;
(startS>=0 &&
startS>maxS - m_options.maxSpan && // within length limit
- (startS==minS || m_sentence->alignedCountS[startS]==0)); // unaligned
+ (startS==minS || m_sentence.alignedCountS[startS]==0)); // unaligned
startS--) {
// end point of source phrase may advance over unaligned
for(int endS=maxS;
(endS<countS && endS<startS + m_options.maxSpan && // within length limit
- (endS==maxS || m_sentence->alignedCountS[endS]==0)); // unaligned
+ (endS==maxS || m_sentence.alignedCountS[endS]==0)); // unaligned
endS++) {
// if there is source side syntax, there has to be a node
- if (m_options.sourceSyntax && !m_sentence->sourceTree.HasNode(startS,endS))
+ if (m_options.sourceSyntax && !m_sentence.sourceTree.HasNode(startS,endS))
continue;
// TODO: loop over all source and target syntax labels
// if within length limits, add as fully-lexical phrase pair
if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
- addRule(startT,endT,startS,endS, ruleExist);
+ addRule(startT,endT,startS,endS, countS, ruleExist);
}
// take note that this is a valid phrase alignment
@@ -508,7 +491,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
int labelI = labelIndex[ 2+holeCount+holeTotal ];
string label = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
hole.SetLabel(label, 0);
currPos = hole.GetEnd(0);
@@ -526,7 +509,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
}
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
+ , int countS)
{
HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
assert(iterHoleList != holeColl.GetHoles().end());
@@ -548,8 +532,15 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
assert(sourceLabel != "");
int labelI = labelIndex[ 2+holeCount ];
- string targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+ string targetLabel;
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
+
hole.SetLabel(targetLabel, 1);
if (m_options.unpairedExtractFormat) {
@@ -559,7 +550,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
}
if (m_options.pcfgScore) {
- double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+ double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
logPCFGScore -= score;
}
@@ -569,7 +560,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
holeCount++;
} else {
indexT[currPos] = outPos;
- out += m_sentence->target[currPos] + " ";
+ out += m_sentence.target[currPos] + " ";
}
outPos++;
@@ -613,7 +604,7 @@ string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, in
++iterHoleList;
++holeCount;
} else {
- out += m_sentence->source[currPos] + " ";
+ out += m_sentence.source[currPos] + " ";
}
outPos++;
@@ -630,8 +621,8 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
for(int ti=startT; ti<=endT; ti++) {
WordIndex::const_iterator p = indexT.find(ti);
if (p != indexT.end()) { // does word still exist?
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
std::string targetSymbolIndex = IntToString(p->second);
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -663,30 +654,37 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
}
void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex)
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
{
WordIndex indexS, indexT; // to keep track of word positions in rule
ExtractedRule rule( startT, endT, startS, endS );
// phrase labels
- string targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+ string targetLabel;
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
+
string sourceLabel = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
// create non-terms on the source side
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
} else {
double logPCFGScore = 0.0f;
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
}
@@ -704,24 +702,24 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
addRuleToCollection( rule );
}
-void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl)
+void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
{
LabelIndex labelIndex,labelCount;
// number of target head labels
- int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(startT,endT).size() : 1;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
// number of source head labels
- numLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS).size() : 1;
+ numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
// number of target hole labels
for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
hole != holeColl.GetHoles().end(); hole++ ) {
- int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -731,7 +729,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
i != holeColl.GetSortedSourceHoles().end(); i++ ) {
const Hole &hole = **i;
- int numLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+ int numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -739,7 +737,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
// loop through the holes
bool done = false;
while(!done) {
- printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex );
+ printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
for(unsigned int i=0; i<labelIndex.size(); i++) {
labelIndex[i]++;
if(labelIndex[i] == labelCount[i]) {
@@ -843,7 +841,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
}
// covered by word? check if it is aligned
else {
- if (m_sentence->alignedToT[pos].size() > 0)
+ if (m_sentence.alignedToT[pos].size() > 0)
foundAlignedWord = true;
}
}
@@ -867,7 +865,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
// passed all checks...
if (allowablePhrase)
- printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl);
+ printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl, wordCountS);
// recursively search for next hole
int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
@@ -879,10 +877,15 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
}
}
-void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist &ruleExist)
+void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
{
- // source
-
+ // contains only <s> or </s>. Don't output
+ if (m_options.boundaryRules
+ && ( (startS == 0 && endS == 0)
+ || (startS == countS-1 && endS == countS-1))) {
+ return;
+ }
+
if (m_options.onlyOutputSpanInfo) {
cout << startS << " " << endS << " " << startT << " " << endT << endl;
return;
@@ -893,36 +896,42 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
- sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
}
else {
sourceLabel = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
- targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
}
// source
rule.source = "";
for(int si=startS; si<=endS; si++)
- rule.source += m_sentence->source[si] + " ";
+ rule.source += m_sentence.source[si] + " ";
rule.source += "[" + sourceLabel + "]";
// target
rule.target = "";
for(int ti=startT; ti<=endT; ti++)
- rule.target += m_sentence->target[ti] + " ";
+ rule.target += m_sentence.target[ti] + " ";
rule.target += "[" + targetLabel + "]";
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+ double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
rule.pcfgScore = std::exp(logPCFGScore);
}
// alignment
for(int ti=startT; ti<=endT; ti++) {
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
std::string sourceSymbolIndex = IntToString(si-startS);
std::string targetSymbolIndex = IntToString(ti-startT);
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -1015,8 +1024,8 @@ void ExtractTask::writeRulesToFile()
<< rule->count << "\n";
}
}
- m_extractCollector->Write( m_id, out.str() );
- m_extractCollectorInv->Write( m_id, outInv.str() );;
+ m_extractFile << out.str();
+ m_extractFileInv << outInv.str();
}
void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 6583fd077..6a1ee77ab 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -1,6 +1,7 @@
/*
* extract.cpp
- *
+ * Modified by: Rohit Gupta CDAC, Mumbai, India
+ * on July 15, 2012 to implement parallel processing
* Modified by: Nadi Tomeh - LIMSI/CNRS
* Machine Translation Marathon 2010, Dublin
*/
@@ -13,7 +14,7 @@
#include <stdlib.h>
#include <assert.h>
#include <cstring>
-
+#include <sstream>
#include <map>
#include <set>
#include <vector>
@@ -23,14 +24,16 @@
#include "tables-core.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
+#include "PhraseExtractionOptions.h"
using namespace std;
using namespace MosesTraining;
-#define LINE_MAX_LENGTH 500000
+namespace MosesTraining {
+
+
+const long int LINE_MAX_LENGTH = 500000 ;
-namespace MosesTraining
-{
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
@@ -46,58 +49,64 @@ typedef vector < HPhrase > HPhraseVector;
// The key of the map is the English index and the value is a set of the source ones
typedef map <int, set<int> > HSentenceVertices;
-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
-enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
-
-REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &,
const HSentenceVertices &, const HSentenceVertices &,
REO_POS);
-void insertVertex(HSentenceVertices &, int, int);
-void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+ void insertVertex(HSentenceVertices &, int, int);
+ void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
int, int, int, int);
-string getOrientString(REO_POS, REO_MODEL_TYPE);
-
-bool ge(int, int);
-bool le(int, int);
-bool lt(int, int);
-
-void extractBase(SentenceAlignment &);
-void extract(SentenceAlignment &);
-void addPhrase(SentenceAlignment &, int, int, int, int, string &);
-bool isAligned (SentenceAlignment &, int, int);
-
-bool allModelsOutputFlag = false;
-
-bool wordModel = false;
-REO_MODEL_TYPE wordType = REO_MSD;
-bool phraseModel = false;
-REO_MODEL_TYPE phraseType = REO_MSD;
-bool hierModel = false;
-REO_MODEL_TYPE hierType = REO_MSD;
-
-
-Moses::OutputFileStream extractFile;
-Moses::OutputFileStream extractFileInv;
-Moses::OutputFileStream extractFileOrientation;
-Moses::OutputFileStream extractFileSentenceId;
-int maxPhraseLength;
-bool orientationFlag = false;
-bool translationFlag = true;
-bool sentenceIdFlag = false; //create extract file with sentence id
-bool onlyOutputSpanInfo = false;
-bool gzOutput = false;
+ string getOrientString(REO_POS, REO_MODEL_TYPE);
+
+ bool ge(int, int);
+ bool le(int, int);
+ bool lt(int, int);
+
+ bool isAligned (SentenceAlignment &, int, int);
+ int sentenceOffset = 0;
+
+}
+namespace MosesTraining{
+
+class ExtractTask
+{
+public:
+ ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation,Moses::OutputFileStream &extractFileSentenceId ):
+ m_sentence(sentence),
+ m_options(initoptions),
+ m_extractFile(extractFile),
+ m_extractFileInv(extractFileInv),
+ m_extractFileOrientation(extractFileOrientation),
+ m_extractFileSentenceId(extractFileSentenceId) {}
+void Run();
+private:
+ vector< string > m_extractedPhrases;
+ vector< string > m_extractedPhrasesInv;
+ vector< string > m_extractedPhrasesOri;
+ vector< string > m_extractedPhrasesSid;
+ void extractBase(SentenceAlignment &);
+ void extract(SentenceAlignment &);
+ void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+ void writePhrasesToFile();
+
+ SentenceAlignment &m_sentence;
+ const PhraseExtractionOptions &m_options;
+ Moses::OutputFileStream &m_extractFile;
+ Moses::OutputFileStream &m_extractFileInv;
+ Moses::OutputFileStream &m_extractFileOrientation;
+ Moses::OutputFileStream &m_extractFileSentenceId;
+};
}
int main(int argc, char* argv[])
@@ -105,70 +114,84 @@ int main(int argc, char* argv[])
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
- if (argc < 6) {
- cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+ if (argc < 6) {
+ cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
+ cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n";
exit(1);
}
- char* &fileNameE = argv[1];
- char* &fileNameF = argv[2];
- char* &fileNameA = argv[3];
- string fileNameExtract = string(argv[4]);
- maxPhraseLength = atoi(argv[5]);
+
+ Moses::OutputFileStream extractFile;
+ Moses::OutputFileStream extractFileInv;
+ Moses::OutputFileStream extractFileOrientation;
+ Moses::OutputFileStream extractFileSentenceId;
+ const char* const &fileNameE = argv[1];
+ const char* const &fileNameF = argv[2];
+ const char* const &fileNameA = argv[3];
+ const string fileNameExtract = string(argv[4]);
+ PhraseExtractionOptions options(atoi(argv[5]));
for(int i=6; i<argc; i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
- onlyOutputSpanInfo = true;
+ options.initOnlyOutputSpanInfo(true);
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
- orientationFlag = true;
+ options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--NoTTable") == 0) {
- translationFlag = false;
+ options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--SentenceId") == 0) {
- sentenceIdFlag = true;
+ options.initSentenceIdFlag(true);
+ } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
+ options.initIncludeSentenceIdFlag(true);
+ } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+ if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+ cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+ exit(1);
+ }
+ sentenceOffset = atoi(argv[++i]);
} else if (strcmp(argv[i], "--GZOutput") == 0) {
- gzOutput = true;
+ options.initGzOutput(true);
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
exit(1);
}
- char* modelParams = argv[++i];
- char* modelName = strtok(modelParams, "-");
- char* modelType = strtok(NULL, "-");
+ char* modelParams = argv[++i];
+ char* modelName = strtok(modelParams, "-");
+ char* modelType = strtok(NULL, "-");
- REO_MODEL_TYPE intModelType;
+ // REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "wbe") == 0) {
- wordModel = true;
+ options.initWordModel(true);
if(strcmp(modelType, "msd") == 0)
- wordType = REO_MSD;
+ options.initWordType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
- wordType = REO_MSLR;
+ options.initWordType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- wordType = REO_MONO;
+ options.initWordType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
} else if(strcmp(modelName, "phrase") == 0) {
- phraseModel = true;
+ options.initPhraseModel(true);
if(strcmp(modelType, "msd") == 0)
- phraseType = REO_MSD;
+ options.initPhraseType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
- phraseType = REO_MSLR;
+ options.initPhraseType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- phraseType = REO_MONO;
+ options.initPhraseType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
} else if(strcmp(modelName, "hier") == 0) {
- hierModel = true;
+ options.initHierModel(true);
if(strcmp(modelType, "msd") == 0)
- hierType = REO_MSD;
+ options.initHierType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
- hierType = REO_MSLR;
+ options.initHierType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- hierType = REO_MONO;
+ options.initHierType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
@@ -178,7 +201,8 @@ int main(int argc, char* argv[])
exit(1);
}
- allModelsOutputFlag = true;
+ options.initAllModelsOutputFlag(true);
+
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -187,9 +211,9 @@ int main(int argc, char* argv[])
// default reordering model if no model selected
// allows for the old syntax to be used
- if(orientationFlag && !allModelsOutputFlag) {
- wordModel = true;
- wordType = REO_MSD;
+ if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
+ options.initWordModel(true);
+ options.initWordType(REO_MSD);
}
// open input files
@@ -202,22 +226,22 @@ int main(int argc, char* argv[])
istream *aFileP = &aFile;
// open output files
- if (translationFlag) {
- string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
- extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+ if (options.isTranslationFlag()) {
+ string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
+ extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
}
- if (orientationFlag) {
- string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+ if (options.isOrientationFlag()) {
+ string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
}
- if (sentenceIdFlag) {
- string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+ if (options.isSentenceIdFlag()) {
+ string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
}
- int i=0;
+ int i = sentenceOffset;
while(true) {
i++;
if (i%10000 == 0) cerr << "." << flush;
@@ -229,31 +253,38 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
SentenceAlignment sentence;
- // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+ // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alingment line
- if (onlyOutputSpanInfo) {
+ if (options.isOnlyOutputSpanInfo()) {
cout << "LOG: SRC: " << foreignString << endl;
cout << "LOG: TGT: " << englishString << endl;
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
+ if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
+ ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
+ task->Run();
+ delete task;
- if (sentence.create( englishString, foreignString, alignmentString, i)) {
- extract(sentence);
}
- if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+ if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
+
eFile.Close();
fFile.Close();
aFile.Close();
+
//az: only close if we actually opened it
- if (!onlyOutputSpanInfo) {
- if (translationFlag) {
+ if (!options.isOnlyOutputSpanInfo()) {
+ if (options.isTranslationFlag()) {
extractFile.Close();
extractFileInv.Close();
+
}
- if (orientationFlag) extractFileOrientation.Close();
- if (sentenceIdFlag) {
+ if (options.isOrientationFlag()){
+ extractFileOrientation.Close();
+ }
+ if (options.isSentenceIdFlag()) {
extractFileSentenceId.Close();
}
}
@@ -261,8 +292,17 @@ int main(int argc, char* argv[])
namespace MosesTraining
{
+void ExtractTask::Run() {
+ extract(m_sentence);
+ writePhrasesToFile();
+ m_extractedPhrases.clear();
+ m_extractedPhrasesInv.clear();
+ m_extractedPhrasesOri.clear();
+ m_extractedPhrasesSid.clear();
+
+}
-void extract(SentenceAlignment &sentence)
+void ExtractTask::extract(SentenceAlignment &sentence)
{
int countE = sentence.target.size();
int countF = sentence.source.size();
@@ -281,14 +321,14 @@ void extract(SentenceAlignment &sentence)
HSentenceVertices::const_iterator it;
- bool relaxLimit = hierModel;
- bool buildExtraStructure = phraseModel || hierModel;
+ bool relaxLimit = m_options.isHierModel();
+ bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
// check alignments for target phrase startE...endE
// loop over extracted phrases which are compatible with the word-alignments
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
- (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+ (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
endE++) {
int minF = 9999;
@@ -308,7 +348,7 @@ void extract(SentenceAlignment &sentence)
}
if (maxF >= 0 && // aligned to any source words at all
- (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+ (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
@@ -323,17 +363,17 @@ void extract(SentenceAlignment &sentence)
// start point of source phrase may retreat over unaligned
for(int startF=minF;
(startF>=0 &&
- (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+ (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
startF--)
// end point of source phrase may advance over unaligned
for(int endF=maxF;
(endF<countF &&
- (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+ (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
endF++) { // at this point we have extracted a phrase
if(buildExtraStructure) { // phrase || hier
- if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+ if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
HPhraseVertex(endF,endE)));
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@@ -343,16 +383,16 @@ void extract(SentenceAlignment &sentence)
startF, startE, endF, endE);
} else {
string orientationInfo = "";
- if(wordModel) {
+ if(m_options.isWordModel()) {
REO_POS wordPrevOrient, wordNextOrient;
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
- wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
- wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
- orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
- if(allModelsOutputFlag)
+ wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+ wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+ orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
+ if(m_options.isAllModelsOutputFlag())
" | | ";
}
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@@ -378,38 +418,38 @@ void extract(SentenceAlignment &sentence)
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
- if(wordModel) {
- wordPrevOrient = getOrientWordModel(sentence, wordType,
+ if(m_options.isWordModel()) {
+ wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF, 0, 1,
&ge, &lt);
- wordNextOrient = getOrientWordModel(sentence, wordType,
+ wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF, -1,
&lt, &ge);
}
- if (phraseModel) {
- phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+ if (m_options.isPhraseModel()) {
+ phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
- phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+ phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
} else {
phrasePrevOrient = phraseNextOrient = UNKNOWN;
}
- if(hierModel) {
- hierPrevOrient = getOrientHierModel(sentence, hierType,
+ if(m_options.isHierModel()) {
+ hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
- hierNextOrient = getOrientHierModel(sentence, hierType,
+ hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
}
- orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
- ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
- ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+ orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
+ ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
+ ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
}
@@ -617,95 +657,141 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
return "";
}
-void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
{
// source
- // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+ // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+ ostringstream outextractstr;
+ ostringstream outextractstrInv;
+ ostringstream outextractstrOrientation;
+ ostringstream outextractstrSentenceId;
- if (onlyOutputSpanInfo) {
+ if (m_options.isOnlyOutputSpanInfo()) {
cout << startF << " " << endF << " " << startE << " " << endE << endl;
return;
}
- for(int fi=startF; fi<=endF; fi++) {
- if (translationFlag) extractFile << sentence.source[fi] << " ";
- if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
- if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
+for(int fi=startF; fi<=endF; fi++) {
+ if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
}
- if (translationFlag) extractFile << "||| ";
- if (orientationFlag) extractFileOrientation << "||| ";
- if (sentenceIdFlag) extractFileSentenceId << "||| ";
+ if (m_options.isTranslationFlag()) outextractstr << "||| ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
// target
for(int ei=startE; ei<=endE; ei++) {
- if (translationFlag) extractFile << sentence.target[ei] << " ";
- if (translationFlag) extractFileInv << sentence.target[ei] << " ";
- if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
- if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
+ if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
+ if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
}
- if (translationFlag) extractFile << "|||";
- if (translationFlag) extractFileInv << "||| ";
- if (orientationFlag) extractFileOrientation << "||| ";
- if (sentenceIdFlag) extractFileSentenceId << "||| ";
+ if (m_options.isTranslationFlag()) outextractstr << "|||";
+ if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
// source (for inverse)
- if (translationFlag) {
+
+ if (m_options.isTranslationFlag()) {
for(int fi=startF; fi<=endF; fi++)
- extractFileInv << sentence.source[fi] << " ";
- extractFileInv << "|||";
+ outextractstrInv << sentence.source[fi] << " ";
+ outextractstrInv << "|||";
}
-
// alignment
- if (translationFlag) {
+ if (m_options.isTranslationFlag()) {
for(int ei=startE; ei<=endE; ei++) {
- for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+ for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
int fi = sentence.alignedToT[ei][i];
- extractFile << " " << fi-startF << "-" << ei-startE;
- extractFileInv << " " << ei-startE << "-" << fi-startF;
+ outextractstr << " " << fi-startF << "-" << ei-startE;
+ outextractstrInv << " " << ei-startE << "-" << fi-startF;
}
}
}
- if (orientationFlag)
- extractFileOrientation << orientationInfo;
+ if (m_options.isOrientationFlag())
+ outextractstrOrientation << orientationInfo;
- if (sentenceIdFlag) {
- extractFileSentenceId << sentence.sentenceID;
+ if (m_options.isSentenceIdFlag()) {
+ outextractstrSentenceId << sentence.sentenceID;
+ }
+ if (m_options.isIncludeSentenceIdFlag()) {
+ outextractstr << " ||| " << sentence.sentenceID;
}
- if (translationFlag) extractFile << "\n";
- if (translationFlag) extractFileInv << "\n";
- if (orientationFlag) extractFileOrientation << "\n";
- if (sentenceIdFlag) extractFileSentenceId << "\n";
+ if (m_options.isTranslationFlag()) outextractstr << "\n";
+ if (m_options.isTranslationFlag()) outextractstrInv << "\n";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
+
+
+ m_extractedPhrases.push_back(outextractstr.str());
+ m_extractedPhrasesInv.push_back(outextractstrInv.str());
+ m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+ m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
+}
+
+
+void ExtractTask::writePhrasesToFile(){
+
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+ ostringstream outextractFileOrientation;
+ ostringstream outextractFileSentenceId;
+
+ for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
+ outextractFile<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
+ outextractFileInv<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
+ outextractFileOrientation<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
+ outextractFileSentenceId<<phrase->data();
+ }
+
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
+ m_extractFileOrientation << outextractFileOrientation.str();
+ m_extractFileSentenceId << outextractFileSentenceId.str();
}
// if proper conditioning, we need the number of times a source phrase occured
-void extractBase( SentenceAlignment &sentence )
+
+void ExtractTask::extractBase( SentenceAlignment &sentence )
{
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+
int countF = sentence.source.size();
for(int startF=0; startF<countF; startF++) {
for(int endF=startF;
- (endF<countF && endF<startF+maxPhraseLength);
+ (endF<countF && endF<startF+m_options.maxPhraseLength);
endF++) {
for(int fi=startF; fi<=endF; fi++) {
- extractFile << sentence.source[fi] << " ";
- }
- extractFile << "|||" << endl;
+ outextractFile << sentence.source[fi] << " ";
+ }
+ outextractFile << "|||" << endl;
}
}
int countE = sentence.target.size();
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
- (endE<countE && endE<startE+maxPhraseLength);
+ (endE<countE && endE<startE+m_options.maxPhraseLength);
endE++) {
for(int ei=startE; ei<=endE; ei++) {
- extractFileInv << sentence.target[ei] << " ";
+ outextractFileInv << sentence.target[ei] << " ";
}
- extractFileInv << "|||" << endl;
+ outextractFileInv << "|||" << endl;
}
}
-}
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
}
+}
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index e3025ac08..f764beef7 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -30,6 +30,7 @@
#include "SafeGetline.h"
#include "tables-core.h"
+#include "domain.h"
#include "PhraseAlignment.h"
#include "score.h"
#include "InputFileStream.h"
@@ -54,13 +55,22 @@ bool kneserNeyFlag = false;
#define COC_MAX 10
bool logProbFlag = false;
int negLogProb = 1;
+inline float maybeLogProb( float a ) { return logProbFlag ? negLogProb*log(a) : a; }
bool lexFlag = true;
bool unalignedFlag = false;
bool unalignedFWFlag = false;
bool outputNTLengths = false;
+bool singletonFeature = false;
+bool crossedNonTerm = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
+bool domainFlag = false;
+bool domainRatioFlag = false;
+bool domainSubsetFlag = false;
+bool domainSparseFlag = false;
+Domain *domain;
+bool includeSentenceIdFlag = false;
Vocabulary vcbT;
Vocabulary vcbS;
@@ -70,14 +80,14 @@ Vocabulary vcbS;
vector<string> tokenize( const char [] );
void writeCountOfCounts( const string &fileNameCountOfCounts );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
set<string> functionWordList;
-void loadFunctionWords( const char* fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void loadFunctionWords( const string &fileNameFunctionWords );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
, map<size_t, map<size_t, float> > &sourceProb
, map<size_t, map<size_t, float> > &targetProb);
@@ -90,14 +100,15 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS]\n";
+ cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n";
exit(1);
}
- char* fileNameExtract = argv[1];
- char* fileNameLex = argv[2];
- char* fileNamePhraseTable = argv[3];
+ string fileNameExtract = argv[1];
+ string fileNameLex = argv[2];
+ string fileNamePhraseTable = argv[3];
string fileNameCountOfCounts;
- char* fileNameFunctionWords;
+ char* fileNameFunctionWords = NULL;
+ char* fileNameDomain = NULL;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -140,6 +151,22 @@ int main(int argc, char* argv[])
}
fileNameFunctionWords = argv[++i];
cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+ } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 ||
+ strcmp(argv[i],"--SparseDomainRatio") == 0 ||
+ strcmp(argv[i],"--SparseDomainSubset") == 0 ||
+ strcmp(argv[i],"--DomainIndicator") == 0 ||
+ strcmp(argv[i],"--DomainRatio") == 0 ||
+ strcmp(argv[i],"--DomainSubset") == 0) {
+ includeSentenceIdFlag = true;
+ domainFlag = true;
+ domainSparseFlag = strstr( argv[i], "Sparse" );
+ domainRatioFlag = strstr( argv[i], "Ratio" );
+ domainSubsetFlag = strstr( argv[i], "Subset" );
+ if (i+1==argc) {
+ cerr << "ERROR: specify domain info file with " << argv[i] << endl;
+ exit(1);
+ }
+ fileNameDomain = argv[++i];
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
@@ -153,6 +180,12 @@ int main(int argc, char* argv[])
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
outputNTLengths = true;
+ } else if (strcmp(argv[i],"--Singleton") == 0) {
+ singletonFeature = true;
+ cerr << "binary singleton feature\n";
+ } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+ crossedNonTerm = true;
+ cerr << "crossed non-term reordering feature\n";
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
@@ -167,6 +200,18 @@ int main(int argc, char* argv[])
if (unalignedFWFlag)
loadFunctionWords( fileNameFunctionWords );
+ // load domain information
+ if (domainFlag) {
+ if (inverseFlag) {
+ domainFlag = false;
+ includeSentenceIdFlag = false;
+ }
+ else {
+ domain = new Domain;
+ domain->load( fileNameDomain );
+ }
+ }
+
// compute count of counts for Good Turing discounting
if (goodTuringFlag || kneserNeyFlag) {
for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
@@ -184,7 +229,7 @@ int main(int argc, char* argv[])
// output file: phrase translation table
ostream *phraseTableFile;
- if (strcmp(fileNamePhraseTable, "-") == 0) {
+ if (fileNamePhraseTable == "-") {
phraseTableFile = &cout;
}
else {
@@ -202,6 +247,7 @@ int main(int argc, char* argv[])
float lastCount = 0.0f;
float lastPcfgSum = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
+ bool isSingleton = true;
int i=0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
lastLine[0] = '\0';
@@ -222,30 +268,40 @@ int main(int argc, char* argv[])
// create new phrase pair
PhraseAlignment phrasePair;
- phrasePair.create( line, i );
+ phrasePair.create( line, i, includeSentenceIdFlag );
lastCount = phrasePair.count;
lastPcfgSum = phrasePair.pcfgSum;
// only differs in count? just add count
- if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
+ if (lastPhrasePair != NULL
+ && lastPhrasePair->equals( phrasePair )
+ && (!domainFlag
+ || domain->getDomainOfSentence( lastPhrasePair->sentenceId )
+ == domain->getDomainOfSentence( phrasePair.sentenceId ) )) {
lastPhrasePair->count += phrasePair.count;
lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
continue;
}
-
+
// if new source phrase, process last batch
if (lastPhrasePair != NULL &&
lastPhrasePair->GetSource() != phrasePair.GetSource()) {
- processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+ processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
+
phrasePairsWithSameF.clear();
+ isSingleton = false;
lastPhrasePair = NULL;
}
+ else
+ {
+ isSingleton = true;
+ }
// add phrase pairs to list, it's now the last one
phrasePairsWithSameF.push_back( phrasePair );
lastPhrasePair = &phrasePairsWithSameF.back();
}
- processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+ processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
phraseTableFile->flush();
if (phraseTableFile != &cout) {
@@ -279,7 +335,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
countOfCountsFile.Close();
}
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton )
{
if (phrasePair.size() == 0) return;
@@ -320,16 +376,15 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter)
{
const PhraseAlignmentCollection &group = **iter;
- outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
-
+ outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton );
}
}
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
float bestAlignmentCount = -1;
- PhraseAlignment* bestAlignment;
+ PhraseAlignment* bestAlignment = NULL;
for(size_t i=0; i<phrasePair.size(); i++) {
size_t alignInd;
@@ -347,7 +402,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
}
}
- return bestAlignment;
+ return *bestAlignment;
}
@@ -438,11 +493,65 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
}
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
+{
+ for (int currSource = 0; currSource < alignedToS.size(); ++currSource)
+ {
+ if (currSource == sourcePos)
+ { // skip
+ }
+ else
+ {
+ const std::set<size_t> &targetSet = alignedToS[currSource];
+ std::set<size_t>::const_iterator iter;
+ for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+ {
+ size_t currTarget = *iter;
+
+ if ((currSource < sourcePos && currTarget > targetPos)
+ || (currSource > sourcePos && currTarget < targetPos)
+ )
+ {
+ return true;
+ }
+ }
+
+ }
+ }
+
+ return false;
+}
+
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
+{
+ const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+
+ for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
+ {
+ const std::set<size_t> &targetSet = alignedToS[sourcePos];
+
+ WORD_ID wordId = phraseS[sourcePos];
+ const WORD &word = vcbS.getWord(wordId);
+ bool isNonTerm = isNonTerminal(word);
+
+ if (isNonTerm)
+ {
+ assert(targetSet.size() == 1);
+ int targetPos = *targetSet.begin();
+ bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
+ if (ret)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
{
if (phrasePair.size() == 0) return;
- PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+ const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
// compute count
float count = 0;
@@ -450,6 +559,18 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
count += phrasePair[i]->count;
}
+ // compute domain counts
+ map< string, float > domainCount;
+ if (domainFlag) {
+ for(size_t i=0; i<phrasePair.size(); i++) {
+ string d = domain->getDomainOfSentence( phrasePair[i]->sentenceId );
+ if (domainCount.find( d ) == domainCount.end())
+ domainCount[ d ] = phrasePair[i]->count;
+ else
+ domainCount[ d ] += phrasePair[i]->count;
+ }
+ }
+
// collect count of count statistics
if (goodTuringFlag || kneserNeyFlag) {
totalDistinct++;
@@ -459,7 +580,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
// compute PCFG score
- float pcfgScore;
+ float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
float pcfgSum = 0;
for(size_t i=0; i<phrasePair.size(); ++i) {
@@ -482,41 +603,109 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
// source phrase (unless inverse)
if (! inverseFlag) {
- printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
}
// target phrase
- printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
// source phrase (if inverse)
if (inverseFlag) {
- printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
}
// lexical translation probability
if (lexFlag) {
double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
- phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
+ phraseTableFile << maybeLogProb( lexScore );
}
// unaligned word penalty
if (unalignedFlag) {
double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
- phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+ phraseTableFile << " " << maybeLogProb( penalty );
}
// unaligned function word penalty
if (unalignedFWFlag) {
double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
- phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+ phraseTableFile << " " << maybeLogProb( penalty );
}
+ if (singletonFeature) {
+ phraseTableFile << " " << (isSingleton ? 1 : 0);
+ }
+
+ if (crossedNonTerm && !inverseFlag) {
+ phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
+ }
+
// target-side PCFG score
if (pcfgFlag && !inverseFlag) {
- phraseTableFile << " " << pcfgScore;
+ phraseTableFile << " " << maybeLogProb( pcfgScore );
+ }
+
+ // domain count features
+ if (domainFlag) {
+ if (domainSparseFlag) {
+ // sparse, subset
+ if (domainSubsetFlag) {
+ typedef vector< string >::const_iterator I;
+ phraseTableFile << " doms";
+ for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+ if (domainCount.find( *i ) != domainCount.end() ) {
+ phraseTableFile << "_" << *i;
+ }
+ }
+ phraseTableFile << " 1";
+ }
+ // sparse, indicator or ratio
+ else {
+ typedef map< string, float >::const_iterator I;
+ for (I i=domainCount.begin(); i != domainCount.end(); i++) {
+ if (domainRatioFlag) {
+ phraseTableFile << " domr_" << i->first << " " << (i->second / count);
+ }
+ else {
+ phraseTableFile << " dom_" << i->first << " 1";
+ }
+ }
+ }
+ }
+ // core, subset
+ else if (domainSubsetFlag) {
+ if (domain->list.size() > 6) {
+ cerr << "ERROR: too many domains for core domain subset features\n";
+ exit(1);
+ }
+ size_t bitmap = 0;
+ for(size_t bit = 0; bit < domain->list.size(); bit++) {
+ if (domainCount.find( domain->list[ bit ] ) != domainCount.end()) {
+ bitmap += 1 << bit;
+ }
+ }
+ for(size_t i = 1; i < (1 << domain->list.size()); i++) {
+ phraseTableFile << " " << maybeLogProb( (bitmap == i) ? 2.718 : 1 );
+ }
+ }
+ // core, indicator or ratio
+ else {
+ typedef vector< string >::const_iterator I;
+ for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+ if (domainCount.find( *i ) == domainCount.end() ) {
+ phraseTableFile << " " << maybeLogProb( 1 );
+ }
+ else if (domainRatioFlag) {
+ phraseTableFile << " " << maybeLogProb( exp( domainCount[ *i ] / count ) );
+ }
+ else {
+ phraseTableFile << " " << maybeLogProb( 2.718 );
+ }
+ }
+ }
}
phraseTableFile << " ||| ";
@@ -526,41 +715,40 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (hierarchicalFlag) {
// always output alignment if hiero style, but only for non-terms
// (eh: output all alignments, needed for some feature functions)
- assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+ assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
std::vector<std::string> alignment;
for(size_t j = 0; j < phraseT.size() - 1; j++) {
if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
- if (bestAlignment->alignedToT[ j ].size() != 1) {
+ if (bestAlignment.alignedToT[ j ].size() != 1) {
cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
phraseTableFile.flush();
- assert(bestAlignment->alignedToT[ j ].size() == 1);
+ assert(bestAlignment.alignedToT[ j ].size() == 1);
}
- int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+ int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
//phraseTableFile << sourcePos << "-" << j << " ";
- std::stringstream point;
- point << sourcePos << "-" << j;
- alignment.push_back(point.str());
- }
- else {
+ std::stringstream point;
+ point << sourcePos << "-" << j;
+ alignment.push_back(point.str());
+ } else {
set<size_t>::iterator setIter;
- for(setIter = (bestAlignment->alignedToT[j]).begin(); setIter != (bestAlignment->alignedToT[j]).end(); setIter++) {
+ for(setIter = (bestAlignment.alignedToT[j]).begin(); setIter != (bestAlignment.alignedToT[j]).end(); setIter++) {
int sourcePos = *setIter;
//phraseTableFile << sourcePos << "-" << j << " ";
- std::stringstream point;
- point << sourcePos << "-" << j;
- alignment.push_back(point.str());
- }
- }
- }
- // now print all alignments, sorted by source index
- sort(alignment.begin(), alignment.end());
- for (size_t i = 0; i < alignment.size(); ++i) {
- phraseTableFile << alignment[i] << " ";
- }
- } else if (wordAlignmentFlag) {
+ std::stringstream point;
+ point << sourcePos << "-" << j;
+ alignment.push_back(point.str());
+ }
+ }
+ }
+ // now print all alignments, sorted by source index
+ sort(alignment.begin(), alignment.end());
+ for (size_t i = 0; i < alignment.size(); ++i) {
+ phraseTableFile << alignment[i] << " ";
+ }
+ } else if (wordAlignmentFlag) {
// alignment info in pb model
- for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
- const set< size_t > &aligned = bestAlignment->alignedToT[j];
+ for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+ const set< size_t > &aligned = bestAlignment.alignedToT[j];
for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
phraseTableFile << *p << "-" << j << " ";
}
@@ -568,6 +756,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
}
+
// counts
phraseTableFile << " ||| " << totalCount << " " << count;
@@ -594,13 +783,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << endl;
}
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty()) {
unaligned *= 2.718;
}
@@ -608,13 +797,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
return unaligned;
}
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
unaligned *= 2.718;
}
@@ -622,11 +811,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT,
return unaligned;
}
-void loadFunctionWords( const char *fileName )
+void loadFunctionWords( const string &fileName )
{
cerr << "Loading function word list from " << fileName;
ifstream inFile;
- inFile.open(fileName);
+ inFile.open(fileName.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
@@ -647,14 +836,14 @@ void loadFunctionWords( const char *fileName )
inFile.close();
}
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// lexical translation probability
double lexScore = 1.0;
int null = vcbS.getWordID("NULL");
// all target words have to be explained
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
@@ -670,11 +859,11 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT,
return lexScore;
}
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &fileName )
{
cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
- inFile.open(fileName);
+ inFile.open(fileName.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index ed9adc18c..f720a32d2 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -65,7 +65,7 @@ class LexicalTable
{
public:
std::map< WORD_ID, std::map< WORD_ID, double > > ltable;
- void load( char[] );
+ void load( const std::string &filePath );
double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
// cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
if (ltable.find( wordS ) == ltable.end()) return 1.0;
diff --git a/phrase-extract/statistics.cpp b/phrase-extract/statistics.cpp
index d39a05d3b..67373ec93 100644
--- a/phrase-extract/statistics.cpp
+++ b/phrase-extract/statistics.cpp
@@ -40,7 +40,7 @@ class LexicalTable
{
public:
map< WORD_ID, map< WORD_ID, double > > ltable;
- void load( char[] );
+ void load( const string &);
};
}
@@ -310,11 +310,11 @@ bool PhraseAlignment::equals( const PhraseAlignment& other )
return true;
}
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &filePath )
{
- cerr << "Loading lexical translation table from " << fileName;
+ cerr << "Loading lexical translation table from " << filePath;
ifstream inFile;
- inFile.open(fileName);
+ inFile.open(filePath.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
@@ -332,7 +332,7 @@ void LexicalTable::load( char *fileName )
vector<string> token = tokenize( line );
if (token.size() != 3) {
- cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+ cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
token.size() << " " << token[0] << " " << line << endl;
continue;
}