diff options
author | Matthias Huck <mhuck@inf.ed.ac.uk> | 2016-02-04 00:35:26 +0300 |
---|---|---|
committer | Matthias Huck <mhuck@inf.ed.ac.uk> | 2016-02-04 00:35:26 +0300 |
commit | 5de88ec1a4e2ded2d73e4e7c0f386f66cd3671c1 (patch) | |
tree | d6a1bf35e99d49fa170978416d9232b07cd2d74b /phrase-extract | |
parent | 16a49d0d8de40b491f39d2375ff022fa613f882e (diff) |
single word heuristic for phrase extraction,
and minor modification of SentenceAlignmentWithSyntax constructor
Diffstat (limited to 'phrase-extract')
-rw-r--r-- | phrase-extract/PhraseExtractionOptions.h | 8 | ||||
-rw-r--r-- | phrase-extract/RuleExtractionOptions.h | 3 | ||||
-rw-r--r-- | phrase-extract/SentenceAlignmentWithSyntax.cpp | 4 | ||||
-rw-r--r-- | phrase-extract/SentenceAlignmentWithSyntax.h | 11 | ||||
-rw-r--r-- | phrase-extract/extract-main.cpp | 20 | ||||
-rw-r--r-- | phrase-extract/extract-rules-main.cpp | 3 |
6 files changed, 30 insertions, 19 deletions
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index cf2e4b365..38e570b79 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -51,6 +51,7 @@ private: bool gzOutput; std::string instanceWeightsFile; //weights for each sentence bool flexScoreFlag; + bool singleWordHeuristicFlag; public: std::vector<std::string> placeholders; @@ -73,6 +74,7 @@ public: onlyOutputSpanInfo(false), gzOutput(false), flexScoreFlag(false), + singleWordHeuristicFlag(false), debug(false) { } @@ -119,6 +121,9 @@ public: void initFlexScoreFlag(const bool initflexScoreFlag) { flexScoreFlag=initflexScoreFlag; } + void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) { + singleWordHeuristicFlag = initSingleWordHeuristicFlag; + } // functions for getting values bool isAllModelsOutputFlag() const { @@ -163,6 +168,9 @@ public: bool isFlexScoreFlag() const { return flexScoreFlag; } + bool isSingleWordHeuristicFlag() const { + return singleWordHeuristicFlag; + } }; } diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h index 43f7b205b..7dfb90177 100644 --- a/phrase-extract/RuleExtractionOptions.h +++ b/phrase-extract/RuleExtractionOptions.h @@ -18,8 +18,6 @@ ***********************************************************************/ #pragma once -#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_ -#define RULEEXTRACTIONOPTIONS_H_INCLUDED_ namespace MosesTraining { @@ -95,4 +93,3 @@ public: } -#endif diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp index 4fd2355ae..89361a45d 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.cpp +++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp @@ -35,7 +35,7 @@ namespace MosesTraining bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules) { - if (!m_options.targetSyntax) { + if (!m_targetSyntax) { return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules); } @@ -56,7 +56,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules) { - if (!m_options.sourceSyntax) { + if (!m_sourceSyntax) { return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules); } diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index 604b6d0e2..be7ee52d3 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -18,8 +18,6 @@ ***********************************************************************/ #pragma once -#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_ -#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_ #include <map> #include <set> @@ -42,18 +40,20 @@ public: std::set<std::string> & m_sourceLabelCollection; std::map<std::string, int> & m_targetTopLabelCollection; std::map<std::string, int> & m_sourceTopLabelCollection; - const RuleExtractionOptions & m_options; + const bool m_targetSyntax, m_sourceSyntax; SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl, std::set<std::string> & srcLabelColl, std::map<std::string,int> & tgtTopLabelColl, std::map<std::string,int> & srcTopLabelColl, - const RuleExtractionOptions & options) + bool targetSyntax, + bool sourceSyntax) : m_targetLabelCollection(tgtLabelColl) , m_sourceLabelCollection(srcLabelColl) , m_targetTopLabelCollection(tgtTopLabelColl) , m_sourceTopLabelCollection(srcTopLabelColl) - , m_options(options) { + , m_targetSyntax(targetSyntax) + , m_sourceSyntax(sourceSyntax) { } virtual ~SentenceAlignmentWithSyntax() {} @@ -67,4 +67,3 @@ public: } -#endif diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index c23fda489..ef989fac6 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -155,6 +155,8 @@ int main(int argc, char* argv[]) options.initOrientationFlag(true); } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.initFlexScoreFlag(true); + } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) { + options.initSingleWordHeuristicFlag(true); } else if (strcmp(argv[i],"--NoTTable") == 0) { options.initTranslationFlag(false); } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) { @@ -413,18 +415,22 @@ void ExtractTask::extract(SentenceAlignment &sentence) } // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; - if (!out_of_bounds) { + if (!out_of_bounds || + ( m_options.isSingleWordHeuristicFlag() && (endE==startE) && (minF==maxF) )) // extraction of single word phrases even if inconsistent wrt. word alignment + { // start point of source phrase may retreat over unaligned for(int startF=minF; - (startF>=0 && - (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit - (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned + ((startF>=0 && + (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit + (startF==minF || sentence.alignedCountS[startF]==0)) && // unaligned + (!out_of_bounds || (startF==minF))); // if out of bounds, but single word heuristic: don't retreat over unaligned startF--) // end point of source phrase may advance over unaligned for(int endF=maxF; - (endF<countF && - (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit - (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned + ((endF<countF && + (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit + (endF==maxF || sentence.alignedCountS[endF]==0)) && // unaligned + (!out_of_bounds || (endF==maxF))); // if out of bounds, but single word heuristic: don't advance over unaligned endF++) { // at this point we have extracted a phrase if(buildExtraStructure) { // phrase || hier if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 125383a7e..711918656 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -347,7 +347,8 @@ int main(int argc, char* argv[]) SentenceAlignmentWithSyntax sentence (targetLabelCollection, sourceLabelCollection, - targetTopLabelCollection, sourceTopLabelCollection, options); + targetTopLabelCollection, sourceTopLabelCollection, + options.targetSyntax, options.sourceSyntax); //az: output src, tgt, and alingment line if (options.onlyOutputSpanInfo) { cout << "LOG: SRC: " << sourceString << endl; |