diff options
author | hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-14 11:15:36 +0400 |
---|---|---|
committer | hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-14 11:15:36 +0400 |
commit | d68274d2178196a4d1f9ffa3347336eec0e50f93 (patch) | |
tree | caa461db9cc4b15e30341b110574eff33209a78f /scripts | |
parent | b1ca5e1fc8903fcad33c9836b5b7ee741ad00834 (diff) |
print out span widths of non-terms. Extra argument --OutputNTLengths
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4223 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/training/phrase-extract/ExtractedRule.cpp | 35 | ||||
-rw-r--r-- | scripts/training/phrase-extract/ExtractedRule.h | 13 | ||||
-rw-r--r-- | scripts/training/phrase-extract/Hole.h | 4 | ||||
-rw-r--r-- | scripts/training/phrase-extract/Makefile | 2 | ||||
-rw-r--r-- | scripts/training/phrase-extract/RuleExtractionOptions.h | 4 | ||||
-rw-r--r-- | scripts/training/phrase-extract/extract-rules.cpp | 27 | ||||
-rw-r--r-- | scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj | 4 |
7 files changed, 78 insertions, 11 deletions
diff --git a/scripts/training/phrase-extract/ExtractedRule.cpp b/scripts/training/phrase-extract/ExtractedRule.cpp new file mode 100644 index 000000000..3b90b6287 --- /dev/null +++ b/scripts/training/phrase-extract/ExtractedRule.cpp @@ -0,0 +1,35 @@ +// +// ExtractedRule.cpp +// extract +// +// Created by Hieu Hoang on 13/09/2011. +// Copyright 2011 __MyCompanyName__. All rights reserved. +// + +#include "ExtractedRule.h" + +using namespace std; + +void ExtractedRule::OutputNTLengths(std::ostream &out) const +{ + std::map<size_t, std::pair<size_t, size_t> >::const_iterator iter; + for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) + { + size_t sourcePos = iter->first; + const std::pair<size_t, size_t> &spanLengths = iter->second; + out << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " "; + } + +} + +std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj) +{ + out << obj.source << " ||| " << obj.target << " ||| " + << obj.alignment << " ||| " + << obj.alignmentInv << " ||| "; + + obj.OutputNTLengths(out); + + return out; +} + diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h index 71ce32d9a..e577cf813 100644 --- a/scripts/training/phrase-extract/ExtractedRule.h +++ b/scripts/training/phrase-extract/ExtractedRule.h @@ -22,10 +22,14 @@ #define EXTRACTEDRULE_H_INCLUDED_ #include <string> +#include <iostream> +#include <map> // sentence-level collection of rules class ExtractedRule { + friend std::ostream& operator<<(std::ostream &, const ExtractedRule &); + public: std::string source; std::string target; @@ -39,6 +43,8 @@ public: int endS; float count; + std::map<size_t, std::pair<size_t, size_t> > m_ntLengths; + ExtractedRule(int sT, int eT, int sS, int eS) : source() , target() @@ -52,6 +58,13 @@ public: , endS(eS) , count(0) {} + + void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) + { + m_ntLengths[sourcePos] = std::pair<size_t, size_t>(sourceLength, targetLength); + } + + void OutputNTLengths(std::ostream &out) const; }; #endif diff --git a/scripts/training/phrase-extract/Hole.h b/scripts/training/phrase-extract/Hole.h index 484c15073..31e928f6e 100644 --- a/scripts/training/phrase-extract/Hole.h +++ b/scripts/training/phrase-extract/Hole.h @@ -66,6 +66,10 @@ public: return m_end[direction]; } + int GetSize(size_t direction) const { + return m_end[direction] - m_start[direction] + 1; + } + void SetPos(int pos, size_t direction) { m_pos[direction] = pos; } diff --git a/scripts/training/phrase-extract/Makefile b/scripts/training/phrase-extract/Makefile index ee0b71d60..6c6b6a5f9 100644 --- a/scripts/training/phrase-extract/Makefile +++ b/scripts/training/phrase-extract/Makefile @@ -10,7 +10,7 @@ clean: extract: tables-core.o SentenceAlignment.o extract.o $(CXX) $^ -o extract -extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o +extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o ExtractedRule.o $(CXX) $^ -o extract-rules extract-lex: extract-lex.o diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h index 8e17de30d..f132be84b 100644 --- a/scripts/training/phrase-extract/RuleExtractionOptions.h +++ b/scripts/training/phrase-extract/RuleExtractionOptions.h @@ -45,7 +45,7 @@ public: bool targetSyntax; bool duplicateRules; bool fractionalCounting; - bool mixed; + bool outputNTLengths; RuleExtractionOptions() : maxSpan(10) @@ -73,7 +73,7 @@ public: , targetSyntax(false) , duplicateRules(true) , fractionalCounting(true) - , mixed(false) + , outputNTLengths(false) {} }; diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp index 0b56f0c29..31dec4cbb 100644 --- a/scripts/training/phrase-extract/extract-rules.cpp +++ b/scripts/training/phrase-extract/extract-rules.cpp @@ -91,6 +91,7 @@ int main(int argc, char* argv[]) << " [ --GlueGrammar FILE" << " | --UnknownWordLabel FILE" << " | --OnlyDirect" + << " | --OutputNTLengths" << " | --MaxSpan[" << options.maxSpan << "]" << " | --MinHoleTarget[" << options.minHoleTarget << "]" << " | --MinHoleSource[" << options.minHoleSource << "]" @@ -214,8 +215,8 @@ int main(int argc, char* argv[]) // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { options.fractionalCounting = false; - } else if (strcmp(argv[i],"--Mixed") == 0) { - options.mixed = true; + } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { + options.outputNTLengths = true; } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); @@ -543,11 +544,16 @@ void printHieroAlignment(SentenceAlignmentWithSyntax &sentence // print alignment of non terminals HoleList::const_iterator iterHole; for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) { - std::string sourceSymbolIndex = IntToString(iterHole->GetPos(0)); - std::string targetSymbolIndex = IntToString(iterHole->GetPos(1)); + const Hole &hole = *iterHole; + + std::string sourceSymbolIndex = IntToString(hole.GetPos(0)); + std::string targetSymbolIndex = IntToString(hole.GetPos(1)); rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (!options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; + + rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ; + } rule.alignment.erase(rule.alignment.size()-1); @@ -875,14 +881,19 @@ void writeRulesToFile() extractFile << rule->source << " ||| " << rule->target << " ||| " << rule->alignment << " ||| " - << rule->count << "\n"; + << rule->count; + if (options.outputNTLengths) { + extractFile << " ||| "; + rule->OutputNTLengths(extractFile); + } + extractFile << "\n"; - if (!options.onlyDirectFlag) + if (!options.onlyDirectFlag) { extractFileInv << rule->target << " ||| " << rule->source << " ||| " << rule->alignmentInv << " ||| " - << rule->count << "\n"; - + << rule->count << "\n"; + } } } diff --git a/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj b/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj index 56d23c0fd..75262181f 100644 --- a/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj +++ b/scripts/training/phrase-extract/extract.xcodeproj/project.pbxproj @@ -44,6 +44,7 @@ 1EB8A29A129C06B300041956 /* gzfilebuf.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A20C129C022000041956 /* gzfilebuf.h */; }; 1EB8A29B129C06B300041956 /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; }; 1EB8A29C129C06B300041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; }; + 1ED896C3141FFDF500F43F46 /* ExtractedRule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -98,6 +99,7 @@ 1EB8A20C129C022000041956 /* gzfilebuf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gzfilebuf.h; sourceTree = "<group>"; }; 1EB8A210129C024C00041956 /* InputFileStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = InputFileStream.h; sourceTree = "<group>"; }; 1EB8A211129C024C00041956 /* InputFileStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = InputFileStream.cpp; sourceTree = "<group>"; }; + 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExtractedRule.cpp; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -168,6 +170,7 @@ 1C05B9DB1174CBF9003585B2 /* extract-rules.cpp */, 1CE8CE2C0FC6EA0200924FEA /* extract.cpp */, 1C05B9DD1174CC06003585B2 /* ExtractedRule.h */, + 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */, 1C05B9DE1174CC06003585B2 /* hierarchical.h */, 1C05B9DF1174CC06003585B2 /* Hole.h */, 1C05B9E01174CC06003585B2 /* HoleCollection.cpp */, @@ -365,6 +368,7 @@ 1C05BA1C1174CEC7003585B2 /* SentenceAlignmentWithSyntax.h in Sources */, 1C05BA161174CEA4003585B2 /* tables-core.cpp in Sources */, 1C05BA171174CEA4003585B2 /* tables-core.h in Sources */, + 1ED896C3141FFDF500F43F46 /* ExtractedRule.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; |