Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>2011-09-14 22:16:05 +0400
committerhieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>2011-09-14 22:16:05 +0400
commit1e1eb4d29e391e38da387b119234935312f21036 (patch)
tree413b919db9715e21a8bdc6497cf945e6f0cb4019 /scripts
parent149208ecbaed011a4555c55fcd5b828b4a9a8cc0 (diff)
print out span widths of non-terms. Extra argument --OutputNTLengths
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4225 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r--scripts/training/phrase-extract/PhraseAlignment.h4
-rw-r--r--scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj9
-rw-r--r--scripts/training/phrase-extract/score.cpp104
3 files changed, 106 insertions, 11 deletions
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index 68b82563a..8b8f5115c 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -46,4 +46,8 @@ public:
const PHRASE &GetTarget() const {
return phraseT;
}
+ // Returns m_ntLengths by const reference (no copy is made; callers cannot modify it).
+ const std::map<size_t, std::pair<size_t, size_t> > &GetNTLengths() const
+ { return m_ntLengths; }
+
};
diff --git a/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj b/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj
index 75262181f..8a3f58b92 100644
--- a/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj
+++ b/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj
@@ -654,26 +654,27 @@
1DEB923608733DC60010E9CD /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
- ARCHS = "$(ARCHS_STANDARD_32_BIT)";
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
GCC_C_LANGUAGE_STANDARD = c99;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
ONLY_ACTIVE_ARCH = YES;
PREBINDING = NO;
- SDKROOT = macosx10.5;
+ SDKROOT = macosx10.6;
};
name = Debug;
};
1DEB923708733DC60010E9CD /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
- ARCHS = "$(ARCHS_STANDARD_32_BIT)";
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
GCC_C_LANGUAGE_STANDARD = c99;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
+ ONLY_ACTIVE_ARCH = YES;
PREBINDING = NO;
- SDKROOT = macosx10.5;
+ SDKROOT = macosx10.6;
};
name = Release;
};
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index 5ae714aa4..00f6dfcf6 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -65,8 +65,9 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment
set<string> functionWordList;
void loadFunctionWords( const char* fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
-void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePair, map<size_t, pair<float, float> > &ntLengthsProb);
-
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs // input: the phrase pairs to tally
+ , map<size_t, map<size_t, float> > &sourceProb // output: position -> (length -> probability)
+ , map<size_t, map<size_t, float> > &targetProb); // output: same type as sourceProb
LexicalTable lexTable;
bool inverseFlag = false;
bool hierarchicalFlag = false;
@@ -90,7 +91,7 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count]\n";
+ cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n";
exit(1);
}
char* fileNameExtract = argv[1];
@@ -329,11 +330,93 @@ PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
return bestAlignment;
}
-void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePair, map<size_t, pair<float, float> > &ntLengthsProb)
+void calcNTLengthProb(const map<size_t, map<size_t, size_t> > &lengths
+ , size_t total
+ , map<size_t, map<size_t, float> > &probs)
+{ // Turns the counts in lengths into probs[pos][length] = count / total.
+ map<size_t, map<size_t, size_t> >::const_iterator iterOuter;
+ for (iterOuter = lengths.begin(); iterOuter != lengths.end(); ++iterOuter)
+ {
+ size_t sourcePos = iterOuter->first;
+ const map<size_t, size_t> &inner = iterOuter->second;
+ // inner maps a length to the number of times it was counted at sourcePos
+ map<size_t, size_t>::const_iterator iterInner;
+ for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner)
+ {
+ size_t length = iterInner->first;
+ size_t count = iterInner->second;
+ float prob = (float) count / (float) total; // one shared denominator for every position; total == 0 is not guarded against
+ probs[sourcePos][length] = prob; // creates the entry if missing, otherwise overwrites it
+ }
+ }
+}
+ // Tallies the source/target lengths reported by each phrase pair's GetNTLengths(), then fills sourceProb/targetProb with count / total.
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
+ , map<size_t, map<size_t, float> > &sourceProb
+ , map<size_t, map<size_t, float> > &targetProb)
{
+ map<size_t, map<size_t, size_t> > sourceLengths, targetLengths;
+ // outer key = position in source phrase, inner key = length, value = number of occurrences
+ map<size_t, size_t> totals;
+ // key = position in source phrase, value = total number of occurrences at that position
+ // NOTE(review): the code below assumes every position ends up with the same total - to be confirmed
+
+ vector< PhraseAlignment* >::const_iterator iterOuter;
+ for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter)
+ {
+ const PhraseAlignment &phrasePair = **iterOuter;
+ const std::map<size_t, std::pair<size_t, size_t> > &ntLengths = phrasePair.GetNTLengths();
+ // each entry is read below as: position -> (source length, target length)
+ std::map<size_t, std::pair<size_t, size_t> >::const_iterator iterInner;
+ for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner)
+ {
+ size_t sourcePos = iterInner->first;
+ size_t sourceLength = iterInner->second.first;
+ size_t targetLength = iterInner->second.second;
+ // both tables are keyed by the source position, including the target-length one
+ sourceLengths[sourcePos][sourceLength]++;
+ targetLengths[sourcePos][targetLength]++;
+
+ totals[sourcePos]++;
+ }
+ }
+
+ if (totals.size() == 0)
+ { // nothing was tallied: return without touching sourceProb/targetProb
+ return;
+ }
+ // the first position's total is used as the denominator for all positions
+ size_t total = totals.begin()->second;
+ if (totals.size() > 1)
+ {
+ assert(total == (++totals.begin())->second ); // only the second position is compared; any further positions are not checked
+ }
+
+ calcNTLengthProb(sourceLengths, total, sourceProb);
+ calcNTLengthProb(targetLengths, total, targetProb);
}
+void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, float> > &probs, const string &prefix)
+{ // Writes one "<pos>|<prefix>|<length>=<prob> " token to phraseTableFile for every entry of probs.
+ map<size_t, map<size_t, float> >::const_iterator iterOuter;
+ for (iterOuter = probs.begin(); iterOuter != probs.end(); ++iterOuter)
+ {
+ size_t sourcePos = iterOuter->first;
+ const map<size_t, float> &inner = iterOuter->second;
+ // inner maps a length to its probability for this position
+ map<size_t, float>::const_iterator iterInner;
+ for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner)
+ {
+ size_t length = iterInner->first;
+ float prob = iterInner->second;
+ // every token ends with a single space; no newline is written by this function
+ phraseTableFile << sourcePos << "|" << prefix << "|" << length << "=" << prob << " ";
+ }
+ }
+
+}
+
void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
{
if (phrasePair.size() == 0) return;
@@ -443,11 +526,18 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount,
if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;
- // nt lengths
- map<size_t, pair<float, float> > ntLengthsProb;
+ // nt lengths
if (outputNTLengths)
{
- calcNTLengthProb(phrasePair, ntLengthsProb);
+ map<size_t, map<size_t, float> > sourceProb, targetProb;
+ // 1st sourcePos, 2nd = length, 3rd = prob
+
+ calcNTLengthProb(phrasePair, sourceProb, targetProb);
+
+ phraseTableFile << " ||| ";
+ outputNTLengthProbs(phraseTableFile, sourceProb, "S");
+ outputNTLengthProbs(phraseTableFile, targetProb, "T");
+
}
phraseTableFile << endl;