Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matthias Huck <mhuck@inf.ed.ac.uk> 2016-02-04 00:35:26 +0300
committer Matthias Huck <mhuck@inf.ed.ac.uk> 2016-02-04 00:35:26 +0300
commit 5de88ec1a4e2ded2d73e4e7c0f386f66cd3671c1 (patch)
tree d6a1bf35e99d49fa170978416d9232b07cd2d74b /phrase-extract
parent 16a49d0d8de40b491f39d2375ff022fa613f882e (diff)
single word heuristic for phrase extraction,
and minor modification of SentenceAlignmentWithSyntax constructor
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/PhraseExtractionOptions.h 8
-rw-r--r-- phrase-extract/RuleExtractionOptions.h 3
-rw-r--r-- phrase-extract/SentenceAlignmentWithSyntax.cpp 4
-rw-r--r-- phrase-extract/SentenceAlignmentWithSyntax.h 11
-rw-r--r-- phrase-extract/extract-main.cpp 20
-rw-r--r-- phrase-extract/extract-rules-main.cpp 3
6 files changed, 30 insertions, 19 deletions
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
index cf2e4b365..38e570b79 100644
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -51,6 +51,7 @@ private:
bool gzOutput;
std::string instanceWeightsFile; //weights for each sentence
bool flexScoreFlag;
+ bool singleWordHeuristicFlag;
public:
std::vector<std::string> placeholders;
@@ -73,6 +74,7 @@ public:
onlyOutputSpanInfo(false),
gzOutput(false),
flexScoreFlag(false),
+ singleWordHeuristicFlag(false),
debug(false) {
}
@@ -119,6 +121,9 @@ public:
void initFlexScoreFlag(const bool initflexScoreFlag) {
flexScoreFlag=initflexScoreFlag;
}
+ void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
+ singleWordHeuristicFlag = initSingleWordHeuristicFlag;
+ }
// functions for getting values
bool isAllModelsOutputFlag() const {
@@ -163,6 +168,9 @@ public:
bool isFlexScoreFlag() const {
return flexScoreFlag;
}
+ bool isSingleWordHeuristicFlag() const {
+ return singleWordHeuristicFlag;
+ }
};
}
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index 43f7b205b..7dfb90177 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
-#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_
-#define RULEEXTRACTIONOPTIONS_H_INCLUDED_
namespace MosesTraining
{
@@ -95,4 +93,3 @@ public:
}
-#endif
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 4fd2355ae..89361a45d 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -35,7 +35,7 @@ namespace MosesTraining
bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
- if (!m_options.targetSyntax) {
+ if (!m_targetSyntax) {
return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
}
@@ -56,7 +56,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
- if (!m_options.sourceSyntax) {
+ if (!m_sourceSyntax) {
return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
}
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 604b6d0e2..be7ee52d3 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
-#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
-#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
#include <map>
#include <set>
@@ -42,18 +40,20 @@ public:
std::set<std::string> & m_sourceLabelCollection;
std::map<std::string, int> & m_targetTopLabelCollection;
std::map<std::string, int> & m_sourceTopLabelCollection;
- const RuleExtractionOptions & m_options;
+ const bool m_targetSyntax, m_sourceSyntax;
SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
std::set<std::string> & srcLabelColl,
std::map<std::string,int> & tgtTopLabelColl,
std::map<std::string,int> & srcTopLabelColl,
- const RuleExtractionOptions & options)
+ bool targetSyntax,
+ bool sourceSyntax)
: m_targetLabelCollection(tgtLabelColl)
, m_sourceLabelCollection(srcLabelColl)
, m_targetTopLabelCollection(tgtTopLabelColl)
, m_sourceTopLabelCollection(srcTopLabelColl)
- , m_options(options) {
+ , m_targetSyntax(targetSyntax)
+ , m_sourceSyntax(sourceSyntax) {
}
virtual ~SentenceAlignmentWithSyntax() {}
@@ -67,4 +67,3 @@ public:
}
-#endif
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index c23fda489..ef989fac6 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -155,6 +155,8 @@ int main(int argc, char* argv[])
options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
options.initFlexScoreFlag(true);
+ } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
+ options.initSingleWordHeuristicFlag(true);
} else if (strcmp(argv[i],"--NoTTable") == 0) {
options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
@@ -413,18 +415,22 @@ void ExtractTask::extract(SentenceAlignment &sentence)
}
// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
- if (!out_of_bounds) {
+ if (!out_of_bounds ||
+ ( m_options.isSingleWordHeuristicFlag() && (endE==startE) && (minF==maxF) )) // extraction of single word phrases even if inconsistent wrt. word alignment
+ {
// start point of source phrase may retreat over unaligned
for(int startF=minF;
- (startF>=0 &&
- (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
- (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
+ ((startF>=0 &&
+ (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
+ (startF==minF || sentence.alignedCountS[startF]==0)) && // unaligned
+ (!out_of_bounds || (startF==minF))); // if out of bounds, but single word heuristic: don't retreat over unaligned
startF--)
// end point of source phrase may advance over unaligned
for(int endF=maxF;
- (endF<countF &&
- (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
- (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
+ ((endF<countF &&
+ (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
+ (endF==maxF || sentence.alignedCountS[endF]==0)) && // unaligned
+ (!out_of_bounds || (endF==maxF))); // if out of bounds, but single word heuristic: don't advance over unaligned
endF++) { // at this point we have extracted a phrase
if(buildExtraStructure) { // phrase || hier
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 125383a7e..711918656 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -347,7 +347,8 @@ int main(int argc, char* argv[])
SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
- targetTopLabelCollection, sourceTopLabelCollection, options);
+ targetTopLabelCollection, sourceTopLabelCollection,
+ options.targetSyntax, options.sourceSyntax);
//az: output src, tgt, and alingment line
if (options.onlyOutputSpanInfo) {
cout << "LOG: SRC: " << sourceString << endl;