From 5de88ec1a4e2ded2d73e4e7c0f386f66cd3671c1 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Wed, 3 Feb 2016 21:35:26 +0000 Subject: single word heuristic for phrase extraction, and minor modification of SentenceAlignmentWithSyntax constructor --- phrase-extract/PhraseExtractionOptions.h | 8 ++++++++ phrase-extract/RuleExtractionOptions.h | 3 --- phrase-extract/SentenceAlignmentWithSyntax.cpp | 4 ++-- phrase-extract/SentenceAlignmentWithSyntax.h | 11 +++++------ phrase-extract/extract-main.cpp | 20 +++++++++++++------- phrase-extract/extract-rules-main.cpp | 3 ++- 6 files changed, 30 insertions(+), 19 deletions(-) (limited to 'phrase-extract') diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index cf2e4b365..38e570b79 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -51,6 +51,7 @@ private: bool gzOutput; std::string instanceWeightsFile; //weights for each sentence bool flexScoreFlag; + bool singleWordHeuristicFlag; public: std::vector placeholders; @@ -73,6 +74,7 @@ public: onlyOutputSpanInfo(false), gzOutput(false), flexScoreFlag(false), + singleWordHeuristicFlag(false), debug(false) { } @@ -119,6 +121,9 @@ public: void initFlexScoreFlag(const bool initflexScoreFlag) { flexScoreFlag=initflexScoreFlag; } + void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) { + singleWordHeuristicFlag = initSingleWordHeuristicFlag; + } // functions for getting values bool isAllModelsOutputFlag() const { @@ -163,6 +168,9 @@ public: bool isFlexScoreFlag() const { return flexScoreFlag; } + bool isSingleWordHeuristicFlag() const { + return singleWordHeuristicFlag; + } }; } diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h index 43f7b205b..7dfb90177 100644 --- a/phrase-extract/RuleExtractionOptions.h +++ b/phrase-extract/RuleExtractionOptions.h @@ -18,8 +18,6 @@ ***********************************************************************/ #pragma once -#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_ -#define RULEEXTRACTIONOPTIONS_H_INCLUDED_ namespace MosesTraining { @@ -95,4 +93,3 @@ public: } -#endif diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp index 4fd2355ae..89361a45d 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.cpp +++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp @@ -35,7 +35,7 @@ namespace MosesTraining bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules) { - if (!m_options.targetSyntax) { + if (!m_targetSyntax) { return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules); } @@ -56,7 +56,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules) { - if (!m_options.sourceSyntax) { + if (!m_sourceSyntax) { return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules); } diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index 604b6d0e2..be7ee52d3 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -18,8 +18,6 @@ ***********************************************************************/ #pragma once -#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_ -#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_ #include #include @@ -42,18 +40,20 @@ public: std::set & m_sourceLabelCollection; std::map & m_targetTopLabelCollection; std::map & m_sourceTopLabelCollection; - const RuleExtractionOptions & m_options; + const bool m_targetSyntax, m_sourceSyntax; SentenceAlignmentWithSyntax(std::set & tgtLabelColl, std::set & srcLabelColl, std::map & tgtTopLabelColl, std::map & srcTopLabelColl, - const RuleExtractionOptions & options) + bool targetSyntax, + bool sourceSyntax) : m_targetLabelCollection(tgtLabelColl) , m_sourceLabelCollection(srcLabelColl) , m_targetTopLabelCollection(tgtTopLabelColl) , m_sourceTopLabelCollection(srcTopLabelColl) - , m_options(options) { + , m_targetSyntax(targetSyntax) + , m_sourceSyntax(sourceSyntax) { } virtual ~SentenceAlignmentWithSyntax() {} @@ -67,4 +67,3 @@ public: } -#endif diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index c23fda489..ef989fac6 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -155,6 +155,8 @@ int main(int argc, char* argv[]) options.initOrientationFlag(true); } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.initFlexScoreFlag(true); + } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) { + options.initSingleWordHeuristicFlag(true); } else if (strcmp(argv[i],"--NoTTable") == 0) { options.initTranslationFlag(false); } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) { @@ -413,18 +415,22 @@ void ExtractTask::extract(SentenceAlignment &sentence) } // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; - if (!out_of_bounds) { + if (!out_of_bounds || + ( m_options.isSingleWordHeuristicFlag() && (endE==startE) && (minF==maxF) )) // extraction of single word phrases even if inconsistent wrt. word alignment + { // start point of source phrase may retreat over unaligned for(int startF=minF; - (startF>=0 && - (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit - (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned + ((startF>=0 && + (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit + (startF==minF || sentence.alignedCountS[startF]==0)) && // unaligned + (!out_of_bounds || (startF==minF))); // if out of bounds, but single word heuristic: don't retreat over unaligned startF--) // end point of source phrase may advance over unaligned for(int endF=maxF; - (endF