Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2011-09-14 11:15:36 +0400
committer: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2011-09-14 11:15:36 +0400
commit: d68274d2178196a4d1f9ffa3347336eec0e50f93 (patch)
tree: caa461db9cc4b15e30341b110574eff33209a78f /scripts
parent: b1ca5e1fc8903fcad33c9836b5b7ee741ad00834 (diff)
Print out span widths of non-terminals. Adds the extra argument --OutputNTLengths.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4223 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/training/phrase-extract/ExtractedRule.cpp 35
-rw-r--r-- scripts/training/phrase-extract/ExtractedRule.h 13
-rw-r--r-- scripts/training/phrase-extract/Hole.h 4
-rw-r--r-- scripts/training/phrase-extract/Makefile 2
-rw-r--r-- scripts/training/phrase-extract/RuleExtractionOptions.h 4
-rw-r--r-- scripts/training/phrase-extract/extract-rules.cpp 27
-rw-r--r-- scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj 4
7 files changed, 78 insertions, 11 deletions
diff --git a/scripts/training/phrase-extract/ExtractedRule.cpp b/scripts/training/phrase-extract/ExtractedRule.cpp
new file mode 100644
index 000000000..3b90b6287
--- /dev/null
+++ b/scripts/training/phrase-extract/ExtractedRule.cpp
@@ -0,0 +1,35 @@
+//
+// ExtractedRule.cpp
+// extract
+//
+// Created by Hieu Hoang on 13/09/2011.
+// Copyright 2011 __MyCompanyName__. All rights reserved.
+//
+
+#include "ExtractedRule.h"
+
+using namespace std;
+
+void ExtractedRule::OutputNTLengths(std::ostream &out) const
+{
+ std::map<size_t, std::pair<size_t, size_t> >::const_iterator iter;
+ for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter)
+ {
+ size_t sourcePos = iter->first;
+ const std::pair<size_t, size_t> &spanLengths = iter->second;
+ out << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " ";
+ }
+
+}
+
+std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
+{
+ out << obj.source << " ||| " << obj.target << " ||| "
+ << obj.alignment << " ||| "
+ << obj.alignmentInv << " ||| ";
+
+ obj.OutputNTLengths(out);
+
+ return out;
+}
+
diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h
index 71ce32d9a..e577cf813 100644
--- a/scripts/training/phrase-extract/ExtractedRule.h
+++ b/scripts/training/phrase-extract/ExtractedRule.h
@@ -22,10 +22,14 @@
#define EXTRACTEDRULE_H_INCLUDED_
#include <string>
+#include <iostream>
+#include <map>
// sentence-level collection of rules
class ExtractedRule
{
+ friend std::ostream& operator<<(std::ostream &, const ExtractedRule &);
+
public:
std::string source;
std::string target;
@@ -39,6 +43,8 @@ public:
int endS;
float count;
+ std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
+
ExtractedRule(int sT, int eT, int sS, int eS)
: source()
, target()
@@ -52,6 +58,13 @@ public:
, endS(eS)
, count(0)
{}
+
+ void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
+ {
+ m_ntLengths[sourcePos] = std::pair<size_t, size_t>(sourceLength, targetLength);
+ }
+
+ void OutputNTLengths(std::ostream &out) const;
};
#endif
diff --git a/scripts/training/phrase-extract/Hole.h b/scripts/training/phrase-extract/Hole.h
index 484c15073..31e928f6e 100644
--- a/scripts/training/phrase-extract/Hole.h
+++ b/scripts/training/phrase-extract/Hole.h
@@ -66,6 +66,10 @@ public:
return m_end[direction];
}
+ int GetSize(size_t direction) const {
+ return m_end[direction] - m_start[direction] + 1;
+ }
+
void SetPos(int pos, size_t direction) {
m_pos[direction] = pos;
}
diff --git a/scripts/training/phrase-extract/Makefile b/scripts/training/phrase-extract/Makefile
index ee0b71d60..6c6b6a5f9 100644
--- a/scripts/training/phrase-extract/Makefile
+++ b/scripts/training/phrase-extract/Makefile
@@ -10,7 +10,7 @@ clean:
extract: tables-core.o SentenceAlignment.o extract.o
$(CXX) $^ -o extract
-extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o
+extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o ExtractedRule.o
$(CXX) $^ -o extract-rules
extract-lex: extract-lex.o
diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h
index 8e17de30d..f132be84b 100644
--- a/scripts/training/phrase-extract/RuleExtractionOptions.h
+++ b/scripts/training/phrase-extract/RuleExtractionOptions.h
@@ -45,7 +45,7 @@ public:
bool targetSyntax;
bool duplicateRules;
bool fractionalCounting;
- bool mixed;
+ bool outputNTLengths;
RuleExtractionOptions()
: maxSpan(10)
@@ -73,7 +73,7 @@ public:
, targetSyntax(false)
, duplicateRules(true)
, fractionalCounting(true)
- , mixed(false)
+ , outputNTLengths(false)
{}
};
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 0b56f0c29..31dec4cbb 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -91,6 +91,7 @@ int main(int argc, char* argv[])
<< " [ --GlueGrammar FILE"
<< " | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
+ << " | --OutputNTLengths"
<< " | --MaxSpan[" << options.maxSpan << "]"
<< " | --MinHoleTarget[" << options.minHoleTarget << "]"
<< " | --MinHoleSource[" << options.minHoleSource << "]"
@@ -214,8 +215,8 @@ int main(int argc, char* argv[])
// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
options.fractionalCounting = false;
- } else if (strcmp(argv[i],"--Mixed") == 0) {
- options.mixed = true;
+ } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
+ options.outputNTLengths = true;
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -543,11 +544,16 @@ void printHieroAlignment(SentenceAlignmentWithSyntax &sentence
// print alignment of non terminals
HoleList::const_iterator iterHole;
for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
- std::string sourceSymbolIndex = IntToString(iterHole->GetPos(0));
- std::string targetSymbolIndex = IntToString(iterHole->GetPos(1));
+ const Hole &hole = *iterHole;
+
+ std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
+ std::string targetSymbolIndex = IntToString(hole.GetPos(1));
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
if (!options.onlyDirectFlag)
rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
+
+ rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
+
}
rule.alignment.erase(rule.alignment.size()-1);
@@ -875,14 +881,19 @@ void writeRulesToFile()
extractFile << rule->source << " ||| "
<< rule->target << " ||| "
<< rule->alignment << " ||| "
- << rule->count << "\n";
+ << rule->count;
+ if (options.outputNTLengths) {
+ extractFile << " ||| ";
+ rule->OutputNTLengths(extractFile);
+ }
+ extractFile << "\n";
- if (!options.onlyDirectFlag)
+ if (!options.onlyDirectFlag) {
extractFileInv << rule->target << " ||| "
<< rule->source << " ||| "
<< rule->alignmentInv << " ||| "
- << rule->count << "\n";
-
+ << rule->count << "\n";
+ }
}
}
diff --git a/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj b/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj
index 56d23c0fd..75262181f 100644
--- a/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj
+++ b/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj
@@ -44,6 +44,7 @@
1EB8A29A129C06B300041956 /* gzfilebuf.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A20C129C022000041956 /* gzfilebuf.h */; };
1EB8A29B129C06B300041956 /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; };
1EB8A29C129C06B300041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1ED896C3141FFDF500F43F46 /* ExtractedRule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@@ -98,6 +99,7 @@
1EB8A20C129C022000041956 /* gzfilebuf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gzfilebuf.h; sourceTree = "<group>"; };
1EB8A210129C024C00041956 /* InputFileStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = InputFileStream.h; sourceTree = "<group>"; };
1EB8A211129C024C00041956 /* InputFileStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = InputFileStream.cpp; sourceTree = "<group>"; };
+ 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExtractedRule.cpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -168,6 +170,7 @@
1C05B9DB1174CBF9003585B2 /* extract-rules.cpp */,
1CE8CE2C0FC6EA0200924FEA /* extract.cpp */,
1C05B9DD1174CC06003585B2 /* ExtractedRule.h */,
+ 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */,
1C05B9DE1174CC06003585B2 /* hierarchical.h */,
1C05B9DF1174CC06003585B2 /* Hole.h */,
1C05B9E01174CC06003585B2 /* HoleCollection.cpp */,
@@ -365,6 +368,7 @@
1C05BA1C1174CEC7003585B2 /* SentenceAlignmentWithSyntax.h in Sources */,
1C05BA161174CEA4003585B2 /* tables-core.cpp in Sources */,
1C05BA171174CEA4003585B2 /* tables-core.h in Sources */,
+ 1ED896C3141FFDF500F43F46 /* ExtractedRule.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};