Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorphikoehn <pkoehn@inf.ed.ac.uk>2012-08-19 02:48:26 +0400
committerphikoehn <pkoehn@inf.ed.ac.uk>2012-08-19 02:48:26 +0400
commit4a1a995878ed069dd4d77e0ac6c1727dc223ebe6 (patch)
tree6f774d4ddc8c6a6fc3d5b5b619c356a833b03c90 /phrase-extract
parent366ab93f8aa53b7b065fe8366201bd59dafc51ba (diff)
parentb317522563feb4ca7ff978a0de661ec2189934ea (diff)
a lot of changes
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/Jamfile4
-rw-r--r--phrase-extract/PhraseExtractionOptions.h146
-rw-r--r--phrase-extract/extract-rules.cpp202
-rw-r--r--phrase-extract/extract.cpp370
-rw-r--r--phrase-extract/score.cpp41
-rw-r--r--phrase-extract/score.h2
-rw-r--r--phrase-extract/statistics.cpp10
7 files changed, 518 insertions, 257 deletions
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index 50f03a739..e4f801089 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -11,9 +11,9 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
-exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
+exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
exe extract-lex : extract-lex.cpp InputFileStream ;
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
new file mode 100644
index 000000000..d541144b7
--- /dev/null
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -0,0 +1,146 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
+
+#pragma once
+#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+
+namespace MosesTraining
+{
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; // reordering model variant; extract.cpp selects these from the --model type suffixes msd, mslr and mono/monotonicity
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; // orientation value; return type of the getOrient*Model functions declared in extract.cpp
+
+
+class PhraseExtractionOptions {
+ // Bundle of the settings that main() in extract.cpp fills in from its command-line arguments.
+ public:
+ const int maxPhraseLength; // fixed at construction; main() passes atoi(argv[5]), the max-length argument
+ private:
+ bool allModelsOutputFlag; // set to true by main() inside its --model handling
+ bool wordModel; // wbe model enabled (--model wbe-*, or by default when orientation is requested without --model)
+ REO_MODEL_TYPE wordType; // variant selected for the wbe model
+ bool phraseModel; // phrase model enabled (--model phrase-*)
+ REO_MODEL_TYPE phraseType; // variant selected for the phrase model
+ bool hierModel; // hier model enabled (--model hier-*)
+ REO_MODEL_TYPE hierType; // variant selected for the hier model
+ bool orientationFlag; // orientation / --Orientation; when set main() opens the .o output file
+ bool translationFlag; // cleared by --NoTTable; when set main() opens the extract and .inv output files
+ bool sentenceIdFlag; // --SentenceId; when set main() opens the .sid output file (extract file with sentence ids)
+ bool onlyOutputSpanInfo; // set by --OnlyOutputSpanInfo
+ bool gzOutput; // --GZOutput; when set main() appends .gz to the output file names
+
+public:
+ PhraseExtractionOptions(const int initmaxPhraseLength): // stores the length; flags start false except translationFlag, model types start as REO_MSD
+ maxPhraseLength(initmaxPhraseLength),
+ allModelsOutputFlag(false),
+ wordModel(false),
+ wordType(REO_MSD),
+ phraseModel(false),
+ phraseType(REO_MSD),
+ hierModel(false),
+ hierType(REO_MSD),
+ orientationFlag(false),
+ translationFlag(true), // the only flag that defaults to true
+ sentenceIdFlag(false),
+ onlyOutputSpanInfo(false),
+ gzOutput(false){}
+ // NOTE(review): the constructor is not declared explicit, so an int converts implicitly to PhraseExtractionOptions.
+
+
+ // Setters: each one assigns its argument to the option field of the same name.
+ void initAllModelsOutputFlag(const bool initallModelsOutputFlag){
+ allModelsOutputFlag=initallModelsOutputFlag;
+ }
+ void initWordModel(const bool initwordModel){
+ wordModel=initwordModel;
+ }
+ void initWordType(REO_MODEL_TYPE initwordType ){
+ wordType=initwordType;
+ }
+ void initPhraseModel(const bool initphraseModel ){
+ phraseModel=initphraseModel;
+ }
+ void initPhraseType(REO_MODEL_TYPE initphraseType){
+ phraseType=initphraseType;
+ }
+ void initHierModel(const bool inithierModel){
+ hierModel=inithierModel;
+ }
+ void initHierType(REO_MODEL_TYPE inithierType){
+ hierType=inithierType;
+ }
+ void initOrientationFlag(const bool initorientationFlag){
+ orientationFlag=initorientationFlag;
+ }
+ void initTranslationFlag(const bool inittranslationFlag){
+ translationFlag=inittranslationFlag;
+ }
+ void initSentenceIdFlag(const bool initsentenceIdFlag){
+ sentenceIdFlag=initsentenceIdFlag;
+ }
+ void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){
+ onlyOutputSpanInfo= initonlyOutputSpanInfo;
+ }
+ void initGzOutput (const bool initgzOutput){
+ gzOutput= initgzOutput;
+ }
+ // Getters: each one returns the current value of the option field of the same name; all are const.
+ bool isAllModelsOutputFlag() const {
+ return allModelsOutputFlag;
+ }
+ bool isWordModel() const {
+ return wordModel;
+ }
+ REO_MODEL_TYPE isWordType() const { // returns the enum value despite the is* name
+ return wordType;
+ }
+ bool isPhraseModel() const {
+ return phraseModel;
+ }
+ REO_MODEL_TYPE isPhraseType() const { // returns the enum value despite the is* name
+ return phraseType;
+ }
+ bool isHierModel() const {
+ return hierModel;
+ }
+ REO_MODEL_TYPE isHierType() const { // returns the enum value despite the is* name
+ return hierType;
+ }
+ bool isOrientationFlag() const {
+ return orientationFlag;
+ }
+ bool isTranslationFlag() const {
+ return translationFlag;
+ }
+ bool isSentenceIdFlag() const {
+ return sentenceIdFlag;
+ }
+ bool isOnlyOutputSpanInfo() const {
+ return onlyOutputSpanInfo;
+ }
+ bool isGzOutput () const {
+ return gzOutput;
+ }
+};
+
+}
+
+#endif
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index 0abf548c3..252547557 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -46,8 +46,6 @@
#include "XmlTree.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#include "../moses/src/ThreadPool.h"
-#include "../moses/src/OutputCollector.h"
#define LINE_MAX_LENGTH 500000
@@ -57,55 +55,53 @@ using namespace MosesTraining;
typedef vector< int > LabelIndex;
typedef map< int, int > WordIndex;
-class ExtractTask : public Moses::Task {
+class ExtractTask // new version: no Moses::Task base; bound to one sentence, the options and two output streams
+{
private:
- size_t m_id;
- SentenceAlignmentWithSyntax *m_sentence;
- RuleExtractionOptions &m_options;
- Moses::OutputCollector* m_extractCollector;
- Moses::OutputCollector* m_extractCollectorInv;
+ SentenceAlignmentWithSyntax &m_sentence; // held by reference now; the old version held a pointer and deleted it in its destructor
+ const RuleExtractionOptions &m_options; // read-only view of the options
+ Moses::OutputFileStream& m_extractFile; // writeRulesToFile() streams its out buffer here
+ Moses::OutputFileStream& m_extractFileInv; // writeRulesToFile() streams its outInv buffer here
+
+ vector< ExtractedRule > m_extractedRules; // presumably filled by addRuleToCollection() - definition is outside this view, confirm
+
+ // top-level steps; their definitions live further down in extract-rules.cpp
+ void extractRules();
+ void addRuleToCollection(ExtractedRule &rule);
+ void consolidateRules();
+ void writeRulesToFile();
+
+ // helper routines used while building rules
+ void addRule( int, int, int, int, RuleExist &ruleExist);
+ void addHieroRule( int startT, int endT, int startS, int endS
+ , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
+ void printHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, LabelIndex &labelIndex);
+ string printTargetHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+ string printSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, const LabelIndex &labelIndex);
+ void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+ void printHieroAlignment( int startT, int endT, int startS, int endS
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+ void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+
+ inline string IntToString( int i ) // formats i as text by streaming it through a stringstream
+ {
+ stringstream out;
+ out << i;
+ return out.str();
+ }
public:
- ExtractTask(size_t id, SentenceAlignmentWithSyntax *sentence, RuleExtractionOptions &options, Moses::OutputCollector* extractCollector, Moses::OutputCollector* extractCollectorInv):
- m_id(id),
+ ExtractTask(SentenceAlignmentWithSyntax &sentence, const RuleExtractionOptions &options, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv): // only binds references; every argument must outlive the task
m_sentence(sentence),
m_options(options),
- m_extractCollector(extractCollector),
- m_extractCollectorInv(extractCollectorInv) {}
- ~ExtractTask() { delete m_sentence; }
+ m_extractFile(extractFile),
+ m_extractFileInv(extractFileInv) {}
void Run();
-private:
-vector< ExtractedRule > m_extractedRules;
-
-// main functions
-void extractRules();
-void addRuleToCollection(ExtractedRule &rule);
-void consolidateRules();
-void writeRulesToFile();
-
-// subs
-void addRule( int, int, int, int, RuleExist &ruleExist);
-void addHieroRule( int startT, int endT, int startS, int endS
- , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
-void printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex);
-string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
-string printSourceHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, const LabelIndex &labelIndex);
-void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
-void printHieroAlignment( int startT, int endT, int startS, int endS
- , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
-void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
-
-inline string IntToString( int i )
-{
- stringstream out;
- out << i;
- return out.str();
-}
};
// stats for glue grammar and unknown word label probabilities
@@ -120,16 +116,18 @@ int main(int argc, char* argv[])
<< "rule extraction from an aligned parallel corpus\n";
RuleExtractionOptions options;
+<<<<<<< HEAD
int sentenceOffset = 0;
#ifdef WITH_THREADS
int thread_count = 1;
#endif
+=======
+
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
if (argc < 5) {
cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
-#ifdef WITH_THREADS
- << " --threads NUM |"
-#endif
- << " --GlueGrammar FILE"
+
+ << " --GlueGrammar FILE"
<< " | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --OutputNTLengths"
@@ -269,6 +267,7 @@ int main(int argc, char* argv[])
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
options.conditionOnTargetLhs = true;
+<<<<<<< HEAD
#ifdef WITH_THREADS
} else if (strcmp(argv[i],"-threads") == 0 ||
strcmp(argv[i],"--threads") == 0 ||
@@ -281,6 +280,8 @@ int main(int argc, char* argv[])
exit(1);
}
sentenceOffset = atoi(argv[++i]);
+=======
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -306,27 +307,17 @@ int main(int argc, char* argv[])
if (!options.onlyDirectFlag)
extractFileInv.Open(fileNameExtractInv.c_str());
- // output into file
- Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
- Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);
// stats on labels for glue grammar and unknown word label probabilities
set< string > targetLabelCollection, sourceLabelCollection;
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
-#ifdef WITH_THREADS
- // set up thread pool
- Moses::ThreadPool pool(thread_count);
- pool.SetQueueLimit(1000);
-#endif
-
// loop through all sentence pairs
size_t i=sentenceOffset;
while(true) {
i++;
- if (i%1000 == 0) cerr << "." << flush;
- if (i%10000 == 0) cerr << ":" << flush;
- if (i%100000 == 0) cerr << "!" << flush;
+ if (i%1000 == 0) cerr << i << " " << flush;
+
char targetString[LINE_MAX_LENGTH];
char sourceString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
@@ -335,7 +326,7 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
- SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
+ SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection, options);
//az: output src, tgt, and alingment line
@@ -346,32 +337,17 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence->create(targetString, sourceString, alignmentString, i)) {
+ if (sentence.create(targetString, sourceString, alignmentString, i)) {
if (options.unknownWordLabelFlag) {
- collectWordLabelCounts(*sentence);
- }
- ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
-#ifdef WITH_THREADS
- if (thread_count == 1) {
- task->Run();
- delete task;
- }
- else {
- pool.Submit(task);
+ collectWordLabelCounts(sentence);
}
-#else
+ ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv);
task->Run();
delete task;
-#endif
}
if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
-#ifdef WITH_THREADS
- // wait for all threads to finish
- pool.Stop(true);
-#endif
-
tFile.Close();
sFile.Close();
aFile.Close();
@@ -397,8 +373,8 @@ void ExtractTask::Run() {
void ExtractTask::extractRules()
{
- int countT = m_sentence->target.size();
- int countS = m_sentence->source.size();
+ int countT = m_sentence.target.size();
+ int countS = m_sentence.source.size();
// phrase repository for creating hiero phrases
RuleExist ruleExist(countT);
@@ -413,17 +389,17 @@ void ExtractTask::extractRules()
int endT = startT + lengthT - 1;
// if there is target side syntax, there has to be a node
- if (m_options.targetSyntax && !m_sentence->targetTree.HasNode(startT,endT))
+ if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
continue;
// find find aligned source words
// first: find minimum and maximum source word
int minS = 9999;
int maxS = -1;
- vector< int > usedS = m_sentence->alignedCountS;
+ vector< int > usedS = m_sentence.alignedCountS;
for(int ti=startT; ti<=endT; ti++) {
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
if (si<minS) {
minS = si;
}
@@ -458,15 +434,15 @@ void ExtractTask::extractRules()
for(int startS=minS;
(startS>=0 &&
startS>maxS - m_options.maxSpan && // within length limit
- (startS==minS || m_sentence->alignedCountS[startS]==0)); // unaligned
+ (startS==minS || m_sentence.alignedCountS[startS]==0)); // unaligned
startS--) {
// end point of source phrase may advance over unaligned
for(int endS=maxS;
(endS<countS && endS<startS + m_options.maxSpan && // within length limit
- (endS==maxS || m_sentence->alignedCountS[endS]==0)); // unaligned
+ (endS==maxS || m_sentence.alignedCountS[endS]==0)); // unaligned
endS++) {
// if there is source side syntax, there has to be a node
- if (m_options.sourceSyntax && !m_sentence->sourceTree.HasNode(startS,endS))
+ if (m_options.sourceSyntax && !m_sentence.sourceTree.HasNode(startS,endS))
continue;
// TODO: loop over all source and target syntax labels
@@ -515,7 +491,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
int labelI = labelIndex[ 2+holeCount+holeTotal ];
string label = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
hole.SetLabel(label, 0);
currPos = hole.GetEnd(0);
@@ -556,7 +532,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
int labelI = labelIndex[ 2+holeCount ];
string targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+ m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
hole.SetLabel(targetLabel, 1);
if (m_options.unpairedExtractFormat) {
@@ -566,7 +542,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
}
if (m_options.pcfgScore) {
- double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+ double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
logPCFGScore -= score;
}
@@ -576,7 +552,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
holeCount++;
} else {
indexT[currPos] = outPos;
- out += m_sentence->target[currPos] + " ";
+ out += m_sentence.target[currPos] + " ";
}
outPos++;
@@ -620,7 +596,7 @@ string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, in
++iterHoleList;
++holeCount;
} else {
- out += m_sentence->source[currPos] + " ";
+ out += m_sentence.source[currPos] + " ";
}
outPos++;
@@ -637,8 +613,8 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
for(int ti=startT; ti<=endT; ti++) {
WordIndex::const_iterator p = indexT.find(ti);
if (p != indexT.end()) { // does word still exist?
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
std::string targetSymbolIndex = IntToString(p->second);
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -678,16 +654,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
// phrase labels
string targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+ m_sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
string sourceLabel = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
// create non-terms on the source side
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+ double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
@@ -716,19 +692,19 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
LabelIndex labelIndex,labelCount;
// number of target head labels
- int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(startT,endT).size() : 1;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
// number of source head labels
- numLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS).size() : 1;
+ numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
// number of target hole labels
for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
hole != holeColl.GetHoles().end(); hole++ ) {
- int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -738,7 +714,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
i != holeColl.GetSortedSourceHoles().end(); i++ ) {
const Hole &hole = **i;
- int numLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+ int numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -850,7 +826,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
}
// covered by word? check if it is aligned
else {
- if (m_sentence->alignedToT[pos].size() > 0)
+ if (m_sentence.alignedToT[pos].size() > 0)
foundAlignedWord = true;
}
}
@@ -900,36 +876,36 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
- sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
}
else {
sourceLabel = m_options.sourceSyntax ?
- m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
targetLabel = m_options.targetSyntax ?
- m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+ m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
}
// source
rule.source = "";
for(int si=startS; si<=endS; si++)
- rule.source += m_sentence->source[si] + " ";
+ rule.source += m_sentence.source[si] + " ";
rule.source += "[" + sourceLabel + "]";
// target
rule.target = "";
for(int ti=startT; ti<=endT; ti++)
- rule.target += m_sentence->target[ti] + " ";
+ rule.target += m_sentence.target[ti] + " ";
rule.target += "[" + targetLabel + "]";
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+ double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
rule.pcfgScore = std::exp(logPCFGScore);
}
// alignment
for(int ti=startT; ti<=endT; ti++) {
- for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
- int si = m_sentence->alignedToT[ti][i];
+ for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+ int si = m_sentence.alignedToT[ti][i];
std::string sourceSymbolIndex = IntToString(si-startS);
std::string targetSymbolIndex = IntToString(ti-startT);
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -1022,8 +998,8 @@ void ExtractTask::writeRulesToFile()
<< rule->count << "\n";
}
}
- m_extractCollector->Write( m_id, out.str() );
- m_extractCollectorInv->Write( m_id, outInv.str() );;
+ m_extractFile << out.str();
+ m_extractFileInv << outInv.str();
}
void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 89c45a2e6..b6ea97c6e 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -1,6 +1,7 @@
/*
* extract.cpp
- *
+ * Modified by: Rohit Gupta CDAC, Mumbai, India
+ * on July 15, 2012 to implement parallel processing
* Modified by: Nadi Tomeh - LIMSI/CNRS
* Machine Translation Marathon 2010, Dublin
*/
@@ -13,7 +14,7 @@
#include <stdlib.h>
#include <assert.h>
#include <cstring>
-
+#include <sstream>
#include <map>
#include <set>
#include <vector>
@@ -23,14 +24,16 @@
#include "tables-core.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
+#include "PhraseExtractionOptions.h"
using namespace std;
using namespace MosesTraining;
-#define LINE_MAX_LENGTH 500000
+namespace MosesTraining {
+
+
+const long int LINE_MAX_LENGTH = 500000 ;
-namespace MosesTraining
-{
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
@@ -46,26 +49,24 @@ typedef vector < HPhrase > HPhraseVector;
// The key of the map is the English index and the value is a set of the source ones
typedef map <int, set<int> > HSentenceVertices;
-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
-enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
-
-REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &,
const HSentenceVertices &, const HSentenceVertices &,
REO_POS);
-void insertVertex(HSentenceVertices &, int, int);
-void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+ void insertVertex(HSentenceVertices &, int, int);
+ void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
int, int, int, int);
+<<<<<<< HEAD
string getOrientString(REO_POS, REO_MODEL_TYPE);
bool ge(int, int);
@@ -99,7 +100,49 @@ int sentenceOffset = 0;
bool includeSentenceIdFlag = false; //include sentence id in extract file
bool onlyOutputSpanInfo = false;
bool gzOutput = false;
+=======
+ string getOrientString(REO_POS, REO_MODEL_TYPE);
+
+ bool ge(int, int);
+ bool le(int, int);
+ bool lt(int, int);
+
+ bool isAligned (SentenceAlignment &, int, int);
+
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
+
+}
+
+namespace MosesTraining{
+class ExtractTask // extraction job bound to one SentenceAlignment, the options and the four output streams
+{
+public:
+ ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation,Moses::OutputFileStream &extractFileSentenceId ): // NOTE(review): id is accepted but never stored or used here
+ m_sentence(sentence),
+ m_options(initoptions),
+ m_extractFile(extractFile),
+ m_extractFileInv(extractFileInv),
+ m_extractFileOrientation(extractFileOrientation),
+ m_extractFileSentenceId(extractFileSentenceId) {}
+void Run(); // entry point; definition is outside this hunk
+private:
+ vector< string > m_extractedPhrases; // presumably per-output buffers flushed by writePhrasesToFile() - confirm against the definitions
+ vector< string > m_extractedPhrasesInv;
+ vector< string > m_extractedPhrasesOri;
+ vector< string > m_extractedPhrasesSid;
+ void extractBase(SentenceAlignment &);
+ void extract(SentenceAlignment &);
+ void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+ void writePhrasesToFile();
+
+ SentenceAlignment &m_sentence; // reference members: the referenced objects must outlive this task
+ const PhraseExtractionOptions &m_options; // kept as const even though the constructor takes a non-const reference
+ Moses::OutputFileStream &m_extractFile;
+ Moses::OutputFileStream &m_extractFileInv;
+ Moses::OutputFileStream &m_extractFileOrientation;
+ Moses::OutputFileStream &m_extractFileSentenceId;
+};
}
int main(int argc, char* argv[])
@@ -107,24 +150,36 @@ int main(int argc, char* argv[])
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
+<<<<<<< HEAD
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --IncludeSentenceId | --SentenceOffset n ]\n";
+=======
+ if (argc < 6) {
+ cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
+ cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
exit(1);
}
- char* &fileNameE = argv[1];
- char* &fileNameF = argv[2];
- char* &fileNameA = argv[3];
- string fileNameExtract = string(argv[4]);
- maxPhraseLength = atoi(argv[5]);
+
+ Moses::OutputFileStream extractFile;
+ Moses::OutputFileStream extractFileInv;
+ Moses::OutputFileStream extractFileOrientation;
+ Moses::OutputFileStream extractFileSentenceId;
+ const char* const &fileNameE = argv[1];
+ const char* const &fileNameF = argv[2];
+ const char* const &fileNameA = argv[3];
+ const string fileNameExtract = string(argv[4]);
+ PhraseExtractionOptions options(atoi(argv[5]));
for(int i=6; i<argc; i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
- onlyOutputSpanInfo = true;
+ options.initOnlyOutputSpanInfo(true);
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
- orientationFlag = true;
+ options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--NoTTable") == 0) {
- translationFlag = false;
+ options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--SentenceId") == 0) {
+<<<<<<< HEAD
sentenceIdFlag = true;
} else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
includeSentenceIdFlag = true;
@@ -134,51 +189,54 @@ int main(int argc, char* argv[])
exit(1);
}
sentenceOffset = atoi(argv[++i]);
+=======
+ options.initSentenceIdFlag(true);
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
} else if (strcmp(argv[i], "--GZOutput") == 0) {
- gzOutput = true;
+ options.initGzOutput(true);
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
exit(1);
}
- char* modelParams = argv[++i];
- char* modelName = strtok(modelParams, "-");
- char* modelType = strtok(NULL, "-");
+ char* modelParams = argv[++i];
+ char* modelName = strtok(modelParams, "-");
+ char* modelType = strtok(NULL, "-");
// REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "wbe") == 0) {
- wordModel = true;
+ options.initWordModel(true);
if(strcmp(modelType, "msd") == 0)
- wordType = REO_MSD;
+ options.initWordType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
- wordType = REO_MSLR;
+ options.initWordType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- wordType = REO_MONO;
+ options.initWordType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
} else if(strcmp(modelName, "phrase") == 0) {
- phraseModel = true;
+ options.initPhraseModel(true);
if(strcmp(modelType, "msd") == 0)
- phraseType = REO_MSD;
+ options.initPhraseType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
- phraseType = REO_MSLR;
+ options.initPhraseType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- phraseType = REO_MONO;
+ options.initPhraseType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
} else if(strcmp(modelName, "hier") == 0) {
- hierModel = true;
+ options.initHierModel(true);
if(strcmp(modelType, "msd") == 0)
- hierType = REO_MSD;
+ options.initHierType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
- hierType = REO_MSLR;
+ options.initHierType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- hierType = REO_MONO;
+ options.initHierType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
@@ -188,7 +246,8 @@ int main(int argc, char* argv[])
exit(1);
}
- allModelsOutputFlag = true;
+ options.initAllModelsOutputFlag(true);
+
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -197,9 +256,9 @@ int main(int argc, char* argv[])
// default reordering model if no model selected
// allows for the old syntax to be used
- if(orientationFlag && !allModelsOutputFlag) {
- wordModel = true;
- wordType = REO_MSD;
+ if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
+ options.initWordModel(true);
+ options.initWordType(REO_MSD);
}
// open input files
@@ -212,18 +271,18 @@ int main(int argc, char* argv[])
istream *aFileP = &aFile;
// open output files
- if (translationFlag) {
- string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
- extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+ if (options.isTranslationFlag()) {
+ string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
+ extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
}
- if (orientationFlag) {
- string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+ if (options.isOrientationFlag()) {
+ string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
}
- if (sentenceIdFlag) {
- string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+ if (options.isSentenceIdFlag()) {
+ string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
}
@@ -239,31 +298,38 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
SentenceAlignment sentence;
- // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+ // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alignment line
- if (onlyOutputSpanInfo) {
+ if (options.isOnlyOutputSpanInfo()) {
cout << "LOG: SRC: " << foreignString << endl;
cout << "LOG: TGT: " << englishString << endl;
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
+ if (sentence.create( englishString, foreignString, alignmentString, i)) {
+ ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
+ task->Run();
+ delete task;
- if (sentence.create( englishString, foreignString, alignmentString, i)) {
- extract(sentence);
}
- if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+ if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
+
eFile.Close();
fFile.Close();
aFile.Close();
+
//az: only close if we actually opened it
- if (!onlyOutputSpanInfo) {
- if (translationFlag) {
+ if (!options.isOnlyOutputSpanInfo()) {
+ if (options.isTranslationFlag()) {
extractFile.Close();
extractFileInv.Close();
+
}
- if (orientationFlag) extractFileOrientation.Close();
- if (sentenceIdFlag) {
+ if (options.isOrientationFlag()){
+ extractFileOrientation.Close();
+ }
+ if (options.isSentenceIdFlag()) {
extractFileSentenceId.Close();
}
}
@@ -271,8 +337,17 @@ int main(int argc, char* argv[])
namespace MosesTraining
{
+void ExtractTask::Run() {
+ extract(m_sentence);
+ writePhrasesToFile();
+ m_extractedPhrases.clear();
+ m_extractedPhrasesInv.clear();
+ m_extractedPhrasesOri.clear();
+ m_extractedPhrasesSid.clear();
-void extract(SentenceAlignment &sentence)
+}
+
+void ExtractTask::extract(SentenceAlignment &sentence)
{
int countE = sentence.target.size();
int countF = sentence.source.size();
@@ -291,14 +366,14 @@ void extract(SentenceAlignment &sentence)
HSentenceVertices::const_iterator it;
- bool relaxLimit = hierModel;
- bool buildExtraStructure = phraseModel || hierModel;
+ bool relaxLimit = m_options.isHierModel();
+ bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
// check alignments for target phrase startE...endE
// loop over extracted phrases which are compatible with the word-alignments
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
- (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+ (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
endE++) {
int minF = 9999;
@@ -318,7 +393,7 @@ void extract(SentenceAlignment &sentence)
}
if (maxF >= 0 && // aligned to any source words at all
- (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+ (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
@@ -333,17 +408,17 @@ void extract(SentenceAlignment &sentence)
// start point of source phrase may retreat over unaligned
for(int startF=minF;
(startF>=0 &&
- (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+ (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
startF--)
// end point of source phrase may advance over unaligned
for(int endF=maxF;
(endF<countF &&
- (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+ (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
endF++) { // at this point we have extracted a phrase
if(buildExtraStructure) { // phrase || hier
- if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+ if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
HPhraseVertex(endF,endE)));
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@@ -353,16 +428,16 @@ void extract(SentenceAlignment &sentence)
startF, startE, endF, endE);
} else {
string orientationInfo = "";
- if(wordModel) {
+ if(m_options.isWordModel()) {
REO_POS wordPrevOrient, wordNextOrient;
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
- wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
- wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
- orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
- if(allModelsOutputFlag)
+ wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+ wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+ orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
+ if(m_options.isAllModelsOutputFlag())
" | | ";
}
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@@ -388,38 +463,38 @@ void extract(SentenceAlignment &sentence)
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
- if(wordModel) {
- wordPrevOrient = getOrientWordModel(sentence, wordType,
+ if(m_options.isWordModel()) {
+ wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF, 0, 1,
&ge, &lt);
- wordNextOrient = getOrientWordModel(sentence, wordType,
+ wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF, -1,
&lt, &ge);
}
- if (phraseModel) {
- phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+ if (m_options.isPhraseModel()) {
+ phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
- phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+ phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
} else {
phrasePrevOrient = phraseNextOrient = UNKNOWN;
}
- if(hierModel) {
- hierPrevOrient = getOrientHierModel(sentence, hierType,
+ if(m_options.isHierModel()) {
+ hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
- hierNextOrient = getOrientHierModel(sentence, hierType,
+ hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
}
- orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
- ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
- ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+ orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
+ ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
+ ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
}
@@ -627,96 +702,147 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
return "";
}
-void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
{
// source
- // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+ // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+ ostringstream outextractstr;
+ ostringstream outextractstrInv;
+ ostringstream outextractstrOrientation;
+ ostringstream outextractstrSentenceId;
- if (onlyOutputSpanInfo) {
+ if (m_options.isOnlyOutputSpanInfo()) {
cout << startF << " " << endF << " " << startE << " " << endE << endl;
return;
}
- for(int fi=startF; fi<=endF; fi++) {
- if (translationFlag) extractFile << sentence.source[fi] << " ";
- if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
- if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
+for(int fi=startF; fi<=endF; fi++) {
+ if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
}
- if (translationFlag) extractFile << "||| ";
- if (orientationFlag) extractFileOrientation << "||| ";
- if (sentenceIdFlag) extractFileSentenceId << "||| ";
+ if (m_options.isTranslationFlag()) outextractstr << "||| ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
// target
for(int ei=startE; ei<=endE; ei++) {
- if (translationFlag) extractFile << sentence.target[ei] << " ";
- if (translationFlag) extractFileInv << sentence.target[ei] << " ";
- if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
- if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
+ if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
+ if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
}
- if (translationFlag) extractFile << "|||";
- if (translationFlag) extractFileInv << "||| ";
- if (orientationFlag) extractFileOrientation << "||| ";
- if (sentenceIdFlag) extractFileSentenceId << "||| ";
+ if (m_options.isTranslationFlag()) outextractstr << "|||";
+ if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
// source (for inverse)
- if (translationFlag) {
+
+ if (m_options.isTranslationFlag()) {
for(int fi=startF; fi<=endF; fi++)
- extractFileInv << sentence.source[fi] << " ";
- extractFileInv << "|||";
+ outextractstrInv << sentence.source[fi] << " ";
+ outextractstrInv << "|||";
}
-
// alignment
- if (translationFlag) {
+ if (m_options.isTranslationFlag()) {
for(int ei=startE; ei<=endE; ei++) {
- for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+ for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
int fi = sentence.alignedToT[ei][i];
- extractFile << " " << fi-startF << "-" << ei-startE;
- extractFileInv << " " << ei-startE << "-" << fi-startF;
+ outextractstr << " " << fi-startF << "-" << ei-startE;
+ outextractstrInv << " " << ei-startE << "-" << fi-startF;
}
}
}
- if (orientationFlag)
- extractFileOrientation << orientationInfo;
+ if (m_options.isOrientationFlag())
+ outextractstrOrientation << orientationInfo;
+<<<<<<< HEAD
if (sentenceIdFlag)
extractFileSentenceId << sentence.sentenceID;
if (includeSentenceIdFlag)
extractFile << " ||| " << sentence.sentenceID;
+=======
+ if (m_options.isSentenceIdFlag()) {
+ outextractstrSentenceId << sentence.sentenceID;
+ }
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
+
+
+ if (m_options.isTranslationFlag()) outextractstr << "\n";
+ if (m_options.isTranslationFlag()) outextractstrInv << "\n";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
+ if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
+
- if (translationFlag) extractFile << "\n";
- if (translationFlag) extractFileInv << "\n";
- if (orientationFlag) extractFileOrientation << "\n";
- if (sentenceIdFlag) extractFileSentenceId << "\n";
+ m_extractedPhrases.push_back(outextractstr.str());
+ m_extractedPhrasesInv.push_back(outextractstrInv.str());
+ m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+ m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
+}
+
+
+void ExtractTask::writePhrasesToFile(){
+
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+ ostringstream outextractFileOrientation;
+ ostringstream outextractFileSentenceId;
+
+ for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
+ outextractFile<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
+ outextractFileInv<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
+ outextractFileOrientation<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
+ outextractFileSentenceId<<phrase->data();
+ }
+
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
+ m_extractFileOrientation << outextractFileOrientation.str();
+ m_extractFileSentenceId << outextractFileSentenceId.str();
}
// if proper conditioning, we need the number of times a source phrase occurred
-void extractBase( SentenceAlignment &sentence )
+
+void ExtractTask::extractBase( SentenceAlignment &sentence )
{
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+
int countF = sentence.source.size();
for(int startF=0; startF<countF; startF++) {
for(int endF=startF;
- (endF<countF && endF<startF+maxPhraseLength);
+ (endF<countF && endF<startF+m_options.maxPhraseLength);
endF++) {
for(int fi=startF; fi<=endF; fi++) {
- extractFile << sentence.source[fi] << " ";
- }
- extractFile << "|||" << endl;
+ outextractFile << sentence.source[fi] << " ";
+ }
+ outextractFile << "|||" << endl;
}
}
int countE = sentence.target.size();
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
- (endE<countE && endE<startE+maxPhraseLength);
+ (endE<countE && endE<startE+m_options.maxPhraseLength);
endE++) {
for(int ei=startE; ei<=endE; ei++) {
- extractFileInv << sentence.target[ei] << " ";
+ outextractFileInv << sentence.target[ei] << " ";
}
- extractFileInv << "|||" << endl;
+ outextractFileInv << "|||" << endl;
}
}
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
+
}
}
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index f02b6b3b0..9ec976f46 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -83,7 +83,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, o
double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
set<string> functionWordList;
-void loadFunctionWords( const char* fileNameFunctionWords );
+void loadFunctionWords( const string &fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
, map<size_t, map<size_t, float> > &sourceProb
@@ -100,12 +100,16 @@ int main(int argc, char* argv[])
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]\n";
exit(1);
}
- char* fileNameExtract = argv[1];
- char* fileNameLex = argv[2];
- char* fileNamePhraseTable = argv[3];
+ string fileNameExtract = argv[1];
+ string fileNameLex = argv[2];
+ string fileNamePhraseTable = argv[3];
string fileNameCountOfCounts;
+<<<<<<< HEAD
char* fileNameFunctionWords = NULL;
char* fileNameDomain = NULL;
+=======
+ string fileNameFunctionWords;
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -220,7 +224,7 @@ int main(int argc, char* argv[])
// output file: phrase translation table
ostream *phraseTableFile;
- if (strcmp(fileNamePhraseTable, "-") == 0) {
+ if (fileNamePhraseTable == "-") {
phraseTableFile = &cout;
}
else {
@@ -367,12 +371,21 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
PhraseAlignment* bestAlignment = NULL;
for(size_t i=0; i<phrasePair.size(); i++) {
- if (phrasePair[i]->count > bestAlignmentCount) {
- bestAlignmentCount = phrasePair[i]->count;
- bestAlignment = phrasePair[i];
+ size_t alignInd;
+ if (inverseFlag)
+ { // count backwards, so that alignments for ties will be the same for both normal & inverse scores
+ alignInd = phrasePair.size() - i - 1;
}
- }
-
+ else {
+ alignInd = i;
+ }
+
+ if (phrasePair[alignInd]->count > bestAlignmentCount) {
+ bestAlignmentCount = phrasePair[alignInd]->count;
+ bestAlignment = phrasePair[alignInd];
+ }
+ }
+
return bestAlignment;
}
@@ -700,11 +713,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT,
return unaligned;
}
-void loadFunctionWords( const char *fileName )
+void loadFunctionWords( const string &fileName )
{
cerr << "Loading function word list from " << fileName;
ifstream inFile;
- inFile.open(fileName);
+ inFile.open(fileName.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
@@ -748,11 +761,11 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT,
return lexScore;
}
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &fileName )
{
cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
- inFile.open(fileName);
+ inFile.open(fileName.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index ed9adc18c..f720a32d2 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -65,7 +65,7 @@ class LexicalTable
{
public:
std::map< WORD_ID, std::map< WORD_ID, double > > ltable;
- void load( char[] );
+ void load( const std::string &filePath );
double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
// cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
if (ltable.find( wordS ) == ltable.end()) return 1.0;
diff --git a/phrase-extract/statistics.cpp b/phrase-extract/statistics.cpp
index d39a05d3b..67373ec93 100644
--- a/phrase-extract/statistics.cpp
+++ b/phrase-extract/statistics.cpp
@@ -40,7 +40,7 @@ class LexicalTable
{
public:
map< WORD_ID, map< WORD_ID, double > > ltable;
- void load( char[] );
+ void load( const string &);
};
}
@@ -310,11 +310,11 @@ bool PhraseAlignment::equals( const PhraseAlignment& other )
return true;
}
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &filePath )
{
- cerr << "Loading lexical translation table from " << fileName;
+ cerr << "Loading lexical translation table from " << filePath;
ifstream inFile;
- inFile.open(fileName);
+ inFile.open(filePath.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
@@ -332,7 +332,7 @@ void LexicalTable::load( char *fileName )
vector<string> token = tokenize( line );
if (token.size() != 3) {
- cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+ cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
token.size() << " " << token[0] << " " << line << endl;
continue;
}