Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Huck <mhuck@inf.ed.ac.uk>2015-07-24 21:42:15 +0300
committerMatthias Huck <mhuck@inf.ed.ac.uk>2015-07-24 21:42:15 +0300
commit9e31bced9afa395bc470de2c9958044e357574ad (patch)
treee8f53579ca99e17b83ed61d6fb8a67f18f8fc2a7 /phrase-extract
parent5fca4c67a1f5485ada79723782bbf4df930bfb2e (diff)
MinCount parameter in score-main
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/score-main.cpp71
1 files changed, 48 insertions, 23 deletions
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 444087b2d..a09bd0543 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -20,6 +20,7 @@
#include <sstream>
#include <assert.h>
#include <cstring>
+#include <list>
#include <map>
#include <set>
#include <vector>
@@ -70,6 +71,7 @@ bool nonTermContextTarget = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
+float minCount = 0;
float minCountHierarchical = 0;
bool phraseOrientationPriorsFlag = false;
@@ -107,7 +109,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
const std::string &fileNameLeftHandSideSourceLabelCounts,
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
+void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
@@ -131,14 +133,28 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
std::cerr <<
- "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] "
- "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] "
- "[--NoWordAlignment] [--UnalignedPenalty] "
+ "syntax: score extract lex phrase-table "
+ "[--Inverse] "
+ "[--Hierarchical] "
+ "[--LogProb] "
+ "[--NegLogProb] "
+ "[--NoLex] "
+ "[--GoodTuring] "
+ "[--KneserNey] "
+ "[--NoWordAlignment] "
+ "[--UnalignedPenalty] "
"[--UnalignedFunctionWordPenalty function-word-file] "
- "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] "
- "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] "
- "[--TargetPreferenceLabels] [--UnpairedExtractFormat] "
- "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
+ "[--MinCountHierarchical count] "
+ "[--PartsOfSpeech] "
+ "[--PCFG] "
+ "[--TreeFragments] "
+ "[--SourceLabels] "
+ "[--SourceLabelCountsLHS] "
+ "[--TargetPreferenceLabels] "
+ "[--UnpairedExtractFormat] "
+ "[--ConditionOnTargetLHS] "
+ "[--CrossedNonTerm]"
+ << std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}
@@ -235,9 +251,13 @@ int main(int argc, char* argv[])
logProbFlag = true;
negLogProb = -1;
std::cerr << "using negative log-probabilities" << std::endl;
+ } else if (strcmp(argv[i],"--MinCount") == 0) {
+ minCount = Moses::Scan<float>( argv[++i] );
+ std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl;
+ minCount -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
minCountHierarchical = Moses::Scan<float>( argv[++i] );
- std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
+ std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
crossedNonTerm = true;
@@ -325,8 +345,8 @@ int main(int argc, char* argv[])
// loop through all extracted phrase translations
std::string line, lastLine;
ExtractionPhrasePair *phrasePair = NULL;
- std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
- std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
+ std::list< ExtractionPhrasePair* > phrasePairsWithSameSource;
+ std::list< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
int tmpSentenceId;
PHRASE *tmpPhraseSource, *tmpPhraseTarget;
@@ -390,7 +410,7 @@ int main(int argc, char* argv[])
// once the first of them has been found to have to be set to false
if ( hierarchicalFlag ) {
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
+ for ( std::list< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
sourceMatch, targetMatch, alignmentMatch ) ) {
@@ -420,7 +440,7 @@ int main(int argc, char* argv[])
if ( !phrasePairsWithSameSource.empty() &&
!sourceMatch ) {
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@@ -455,7 +475,7 @@ int main(int argc, char* argv[])
std::cerr << std::endl;
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@@ -656,7 +676,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
}
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
+void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
if (phrasePairsWithSameSource.size() == 0) {
@@ -668,14 +688,14 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
//std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
// loop through phrase pairs
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
totalSource += (*iter)->GetCount();
}
// output the distinct phrase pairs, one at a time
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
@@ -704,16 +724,15 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
countOfCounts[ countInt ]++;
}
- // compute PCFG score
- float pcfgScore = 0;
- if (pcfgFlag && !inverseFlag) {
- pcfgScore = phrasePair.GetPcfgScore() / count;
- }
-
// output phrases
const PHRASE *phraseSource = phrasePair.GetSource();
const PHRASE *phraseTarget = phrasePair.GetTarget();
+ // do not output if count below threshold
+ if (count < minCount) {
+ return;
+ }
+
// do not output if hierarchical and count below threshold
if (hierarchicalFlag && count < minCountHierarchical) {
for(size_t j=0; j<phraseSource->size()-1; ++j) {
@@ -722,6 +741,12 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
+ // compute PCFG score
+ float pcfgScore = 0;
+ if (pcfgFlag && !inverseFlag) {
+ pcfgScore = phrasePair.GetPcfgScore() / count;
+ }
+
// source phrase (unless inverse)
if (!inverseFlag) {
printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);