Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <fishandfrolick@gmail.com>2012-07-31 22:40:43 +0400
committerHieu Hoang <fishandfrolick@gmail.com>2012-07-31 22:40:43 +0400
commit3b65a8c62612652df6d4c36482b5de81db5ae8f5 (patch)
tree8c2aafb103a9fae46a1f954a85136bc96f0f103e /phrase-extract
parentfd0fefcf6ee910ed2d7b7545f3ffe6bfe6fcc663 (diff)
get rid of threading
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/extract-rules.cpp201
1 files changed, 81 insertions, 120 deletions
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index c333040f6..b8cadea26 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -46,8 +46,6 @@
#include "XmlTree.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#include "../moses/src/ThreadPool.h"
-#include "../moses/src/OutputCollector.h"
#define LINE_MAX_LENGTH 500000
@@ -57,55 +55,53 @@ using namespace MosesTraining;
typedef vector< int > LabelIndex;
typedef map< int, int > WordIndex;
-class ExtractTask : public Moses::Task {
+class ExtractTask
+{
private:
- size_t m_id;
- SentenceAlignmentWithSyntax *m_sentence;
- RuleExtractionOptions &m_options;
- Moses::OutputCollector* m_extractCollector;
- Moses::OutputCollector* m_extractCollectorInv;
+ SentenceAlignmentWithSyntax &m_sentence;
+ const RuleExtractionOptions &m_options;
+ Moses::OutputFileStream& m_extractCollector;
+ Moses::OutputFileStream& m_extractCollectorInv;
+
+ vector< ExtractedRule > m_extractedRules;
+
+ // main functions
+ void extractRules();
+ void addRuleToCollection(ExtractedRule &rule);
+ void consolidateRules();
+ void writeRulesToFile();
+
+ // subs
+ void addRule( int, int, int, int, RuleExist &ruleExist);
+ void addHieroRule( int startT, int endT, int startS, int endS
+ , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
+ void printHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, LabelIndex &labelIndex);
+ string printTargetHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+ string printSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, const LabelIndex &labelIndex);
+ void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+ void printHieroAlignment( int startT, int endT, int startS, int endS
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+ void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+
+ inline string IntToString( int i )
+ {
+ stringstream out;
+ out << i;
+ return out.str();
+ }
public:
- ExtractTask(size_t id, SentenceAlignmentWithSyntax *sentence, RuleExtractionOptions &options, Moses::OutputCollector* extractCollector, Moses::OutputCollector* extractCollectorInv):
- m_id(id),
+ ExtractTask(SentenceAlignmentWithSyntax &sentence, const RuleExtractionOptions &options, Moses::OutputFileStream &extractCollector, Moses::OutputFileStream &extractCollectorInv):
m_sentence(sentence),
m_options(options),
m_extractCollector(extractCollector),
m_extractCollectorInv(extractCollectorInv) {}
- ~ExtractTask() { delete m_sentence; }
void Run();
-private:
-vector< ExtractedRule > m_extractedRules;
-
-// main functions
-void extractRules();
-void addRuleToCollection(ExtractedRule &rule);
-void consolidateRules();
-void writeRulesToFile();
-
-// subs
-void addRule( int, int, int, int, RuleExist &ruleExist);
-void addHieroRule( int startT, int endT, int startS, int endS
- , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
-void printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex);
-string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
-string printSourceHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, const LabelIndex &labelIndex);
-void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
-void printHieroAlignment( int startT, int endT, int startS, int endS
- , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
-void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
-
-inline string IntToString( int i )
-{
- stringstream out;
- out << i;
- return out.str();
-}
};
// stats for glue grammar and unknown word label probabilities
@@ -120,15 +116,11 @@ int main(int argc, char* argv[])
<< "rule extraction from an aligned parallel corpus\n";
RuleExtractionOptions options;
-#ifdef WITH_THREADS
- int thread_count = 1;
-#endif
+
if (argc < 5) {
cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
-#ifdef WITH_THREADS
- << " --threads NUM |"
-#endif
- << " --GlueGrammar FILE"
+
+ << " --GlueGrammar FILE"
<< " | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --OutputNTLengths"
@@ -268,12 +260,6 @@ int main(int argc, char* argv[])
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
options.conditionOnTargetLhs = true;
-#ifdef WITH_THREADS
- } else if (strcmp(argv[i],"-threads") == 0 ||
- strcmp(argv[i],"--threads") == 0 ||
- strcmp(argv[i],"--Threads") == 0) {
- thread_count = atoi(argv[++i]);
-#endif
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -299,27 +285,17 @@ int main(int argc, char* argv[])
if (!options.onlyDirectFlag)
extractFileInv.Open(fileNameExtractInv.c_str());
- // output into file
- Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
- Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);
// stats on labels for glue grammar and unknown word label probabilities
set< string > targetLabelCollection, sourceLabelCollection;
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
-#ifdef WITH_THREADS
- // set up thread pool
- Moses::ThreadPool pool(thread_count);
- pool.SetQueueLimit(1000);
-#endif
-
// loop through all sentence pairs
size_t i=0;
while(true) {
i++;
- if (i%1000 == 0) cerr << "." << flush;
- if (i%10000 == 0) cerr << ":" << flush;
- if (i%100000 == 0) cerr << "!" << flush;
+ if (i%1000 == 0) cerr << i << " " << flush;
+
char targetString[LINE_MAX_LENGTH];
char sourceString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
@@ -328,7 +304,7 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
- SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
+ SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection, options);
//az: output src, tgt, and alingment line
@@ -339,32 +315,17 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence->create(targetString, sourceString, alignmentString, i)) {
+ if (sentence.create(targetString, sourceString, alignmentString, i)) {
if (options.unknownWordLabelFlag) {
- collectWordLabelCounts(*sentence);
+ collectWordLabelCounts(sentence);
}
- ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
-#ifdef WITH_THREADS
- if (thread_count == 1) {
- task->Run();
- delete task;
- }
- else {
- pool.Submit(task);
- }
-#else
+ ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv);
task->Run();
delete task;
-#endif
}
if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
-#ifdef WITH_THREADS
- // wait for all threads to finish
- pool.Stop(true);
-#endif
-
tFile.Close();
sFile.Close();
aFile.Close();
@@ -390,8 +351,8 @@ void ExtractTask::Run() {
void ExtractTask::extractRules()
{
- int countT = m_sentence->target.size();
- int countS = m_sentence->source.size();
+ int countT = m_sentence.target.size();
+ int countS = m_sentence.source.size();
// phrase repository for creating hiero phrases
RuleExist ruleExist(countT);
@@ -406,17 +367,17 @@ void ExtractTask::extractRules()
int endT = startT + lengthT - 1;
// if there is target side syntax, there has to be a node
- if (m_options.targetSyntax && !m_sentence->targetTree.HasNode(startT,endT))
+ if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
continue;
// find find aligned source words
// first: find minimum and maximum source word
int minS = 9999;
int maxS = -1;
- vector< int > usedS = m_sentence->alignedCountS;
+ vector< int > usedS = m_sentence.alignedCountS;
for(int ti=startT; ti<=endT; ti++) {
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
if (si<minS) {
minS = si;
}
@@ -451,15 +412,15 @@ void ExtractTask::extractRules()
for(int startS=minS;
(startS>=0 &&
startS>maxS - m_options.maxSpan && // within length limit
- (startS==minS || m_sentence->alignedCountS[startS]==0)); // unaligned
+ (startS==minS || m_sentence.alignedCountS[startS]==0)); // unaligned
startS--) {
// end point of source phrase may advance over unaligned
for(int endS=maxS;
(endS<countS && endS<startS + m_options.maxSpan && // within length limit
- (endS==maxS || m_sentence->alignedCountS[endS]==0)); // unaligned
+ (endS==maxS || m_sentence.alignedCountS[endS]==0)); // unaligned
endS++) {
// if there is source side syntax, there has to be a node
- if (m_options.sourceSyntax && !m_sentence->sourceTree.HasNode(startS,endS))
+ if (m_options.sourceSyntax && !m_sentence.sourceTree.HasNode(startS,endS))
continue;
// TODO: loop over all source and target syntax labels
@@ -508,7 +469,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
int labelI = labelIndex[ 2+holeCount+holeTotal ];
string label = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
hole.SetLabel(label, 0);
currPos = hole.GetEnd(0);
@@ -549,7 +510,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
int labelI = labelIndex[ 2+holeCount ];
string targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+ m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
hole.SetLabel(targetLabel, 1);
if (m_options.unpairedExtractFormat) {
@@ -559,7 +520,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
}
if (m_options.pcfgScore) {
- double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+ double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
logPCFGScore -= score;
}
@@ -569,7 +530,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
holeCount++;
} else {
indexT[currPos] = outPos;
- out += m_sentence->target[currPos] + " ";
+ out += m_sentence.target[currPos] + " ";
}
outPos++;
@@ -613,7 +574,7 @@ string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, in
++iterHoleList;
++holeCount;
} else {
- out += m_sentence->source[currPos] + " ";
+ out += m_sentence.source[currPos] + " ";
}
outPos++;
@@ -630,8 +591,8 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
for(int ti=startT; ti<=endT; ti++) {
WordIndex::const_iterator p = indexT.find(ti);
if (p != indexT.end()) { // does word still exist?
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
std::string targetSymbolIndex = IntToString(p->second);
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -671,16 +632,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
// phrase labels
string targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+ m_sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
string sourceLabel = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
// create non-terms on the source side
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+ double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
@@ -709,19 +670,19 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
LabelIndex labelIndex,labelCount;
// number of target head labels
- int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(startT,endT).size() : 1;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
// number of source head labels
- numLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS).size() : 1;
+ numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
// number of target hole labels
for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
hole != holeColl.GetHoles().end(); hole++ ) {
- int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -731,7 +692,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
i != holeColl.GetSortedSourceHoles().end(); i++ ) {
const Hole &hole = **i;
- int numLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+ int numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -843,7 +804,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
}
// covered by word? check if it is aligned
else {
- if (m_sentence->alignedToT[pos].size() > 0)
+ if (m_sentence.alignedToT[pos].size() > 0)
foundAlignedWord = true;
}
}
@@ -893,36 +854,36 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
- sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
}
else {
sourceLabel = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+ m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
}
// source
rule.source = "";
for(int si=startS; si<=endS; si++)
- rule.source += m_sentence->source[si] + " ";
+ rule.source += m_sentence.source[si] + " ";
rule.source += "[" + sourceLabel + "]";
// target
rule.target = "";
for(int ti=startT; ti<=endT; ti++)
- rule.target += m_sentence->target[ti] + " ";
+ rule.target += m_sentence.target[ti] + " ";
rule.target += "[" + targetLabel + "]";
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+ double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
rule.pcfgScore = std::exp(logPCFGScore);
}
// alignment
for(int ti=startT; ti<=endT; ti++) {
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
std::string sourceSymbolIndex = IntToString(si-startS);
std::string targetSymbolIndex = IntToString(ti-startT);
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -1015,8 +976,8 @@ void ExtractTask::writeRulesToFile()
<< rule->count << "\n";
}
}
- m_extractCollector->Write( m_id, out.str() );
- m_extractCollectorInv->Write( m_id, outInv.str() );;
+ m_extractCollector << out.str() << endl;
+ m_extractCollectorInv << outInv.str() << endl;
}
void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )