diff options
author | maria <maria@buri.inf.ed.ac.uk> | 2012-09-24 21:27:49 +0400 |
---|---|---|
committer | maria <maria@buri.inf.ed.ac.uk> | 2012-09-24 21:27:49 +0400 |
commit | 026a7cc7577861598954331f7f4dbcdbe69a1693 (patch) | |
tree | 79ff810ba6a17b4e6150f6a5c269671b169d9540 | |
parent | dcfc3d446482ddeb1ed9424222db583bb7fe69ef (diff) |
Added ISI format for Gaussian span_length feature
-rw-r--r-- | moses/src/Parameter.cpp | 1 |
-rw-r--r-- | moses/src/RuleTable/LoaderStandard.cpp | 134 |
-rw-r--r-- | moses/src/SpanLengthEstimator.cpp | 7 |
-rw-r--r-- | moses/src/SpanLengthEstimator.h | 3 |
-rw-r--r-- | moses/src/SpanLengthFeature.cpp | 2 |
5 files changed, 93 insertions, 54 deletions
diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp index 58ef70ee2..a16fe3ae5 100644 --- a/moses/src/Parameter.cpp +++ b/moses/src/Parameter.cpp @@ -155,6 +155,7 @@ Parameter::Parameter() AddParam("weight-span-length", "SL", "Weight for span length feature. Set this to activate it, if it is empty, feature will not be used"); AddParam("gaussian-span-length-score", "Use Gaussian pdf to calculate span length probability instead of unsmoothed counts"); + AddParam("isi-format-for-span-length","Read parameters for Gaussian pdf in ISI format (count,sum(len),sum(len^2))"); AddParam("weight-crossing", "CR", "weight for non-term crossing feature. Set this to activate it, if it is empty, feature will not be used"); AddParam("crossing-file", "Data file for crossing feature. Line format: [span-length] [non-term] [is-crossing] [probability]"); diff --git a/moses/src/RuleTable/LoaderStandard.cpp b/moses/src/RuleTable/LoaderStandard.cpp index 6ce14e048..35462b502 100644 --- a/moses/src/RuleTable/LoaderStandard.cpp +++ b/moses/src/RuleTable/LoaderStandard.cpp @@ -224,7 +224,7 @@ bool RuleTableLoaderStandard::Load(FormatType format //get rule counts tokens[4] -> count(t) assume that extract was run with --NoFractionalCounting flag const std::string &ruleCount = tokens[4]; vector<string> countStrings; - unsigned ruleTotalCount; + unsigned ruleTotalCount = 1; TokenizeMultiCharSeparator(countStrings,ruleCount," "); if(countStrings.size()>=1) sscanf(countStrings[0].c_str(), "%u", &ruleTotalCount); @@ -234,61 +234,91 @@ bool RuleTableLoaderStandard::Load(FormatType format std::vector<SpanLengthEstimator*> spanSourceEstimators, spanTargetEstimators; if (tokens.size() >= 6) { bool useGaussian = (StaticData::Instance().GetParam("gaussian-span-length-score").size() > 0); - + bool useISIFormat = (StaticData::Instance().GetParam("isi-format-for-span-length").size() > 0); + const std::string &spanLength = tokens[5]; - //source and target side are separated by || - 
TokenizeMultiCharSeparator(spanStringsST,spanLength,"||"); + + //use ISI format for span_length information in rule table + // rule_count | sum_NT1(len) | sum_NT1(len^2) || rule_count | sum_NT2(len) | sum_NT2(len^2) ... + if (useISIFormat == true ){ + vector<string> spanStatisticsSource; + vector<string>::iterator itr_statistics; + unsigned count = 1; + float sum_len=0.0f, sum_square_len=0.0f; + + TokenizeMultiCharSeparator(spanStatisticsSource,spanLength,"||"); + for(itr_statistics = spanStatisticsSource.begin(); itr_statistics != spanStatisticsSource.end(); itr_statistics++){ + vector<string> gaussParam; + TokenizeMultiCharSeparator(gaussParam,*itr_statistics,"|"); + if(gaussParam.size()<3) continue; + sscanf(gaussParam[0].c_str(),"%u", &count); + sscanf(gaussParam[1].c_str(),"%f", &sum_len); + sscanf(gaussParam[2].c_str(),"%f", &sum_square_len); + + std::auto_ptr<SpanLengthEstimator> estimatorSource; + estimatorSource.reset(CreateGaussianSpanLengthEstimator()); + estimatorSource->AddSpanScore_ISI(count,sum_len,sum_square_len); + estimatorSource->FinishedAdds(count); + spanSourceEstimators.push_back(estimatorSource.release()); + } + + } + // use (len=score) format for span_length information in rule table + else{ + + //source and target side are separated by || + TokenizeMultiCharSeparator(spanStringsST,spanLength,"||"); - //we consider only source and target information - //CHECK(spanStringsST.size() =< 3); - - if(spanStringsST.size()>=2){ - - //Take scores from source - string spanLengthSource = spanStringsST[0]; - //Take scores form target - string spanLengthTarget = spanStringsST[1]; - - TokenizeMultiCharSeparator(spanStringSource,spanLengthSource,"|"); - TokenizeMultiCharSeparator(spanStringTarget,spanLengthTarget,"|"); - - //Check that number of non terminals is the same on both sides - CHECK(spanStringSource.size() == spanStringTarget.size()); - vector<string>::iterator itr_source; - vector<string>::iterator itr_target; - for(itr_source = 
spanStringSource.begin(), itr_target = spanStringTarget.begin(); - itr_source != spanStringSource.end(), itr_target != spanStringTarget.end(); - itr_source++, itr_target++) - { - vector<string> spanTermSource; - vector<string> spanTermTarget; - vector<string> :: iterator itr_source_term; - vector<string> :: iterator itr_target_term; - Tokenize(spanTermSource,*itr_source); - Tokenize(spanTermTarget,*itr_target); - std::auto_ptr<SpanLengthEstimator> estimatorSource, estimatorTarget; - estimatorSource.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator()); - estimatorTarget.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator()); - //get source scores - iterate(spanTermSource,itr_source_term) - { - unsigned size; - float proba; - sscanf(itr_source_term->c_str(), "%u=%f", &size, &proba); - estimatorSource->AddSpanScore(size, logf(proba)); - } - //get target scores - iterate(spanStringTarget,itr_target_term) + //we consider only source and target information + + if(spanStringsST.size()>=2){ + + //Take scores from source + string spanLengthSource = spanStringsST[0]; + //Take scores form target + string spanLengthTarget = spanStringsST[1]; + + TokenizeMultiCharSeparator(spanStringSource,spanLengthSource,"|"); + TokenizeMultiCharSeparator(spanStringTarget,spanLengthTarget,"|"); + + //Check that number of non terminals is the same on both sides + CHECK(spanStringSource.size() == spanStringTarget.size()); + vector<string>::iterator itr_source; + vector<string>::iterator itr_target; + for(itr_source = spanStringSource.begin(), itr_target = spanStringTarget.begin(); + itr_source != spanStringSource.end(), itr_target != spanStringTarget.end(); + itr_source++, itr_target++) { - unsigned size; - float proba; - sscanf(itr_target_term->c_str(), "%u=%f", &size, &proba); - estimatorTarget->AddSpanScore(size, logf(proba)); + vector<string> spanTermSource; + vector<string> spanTermTarget; + vector<string> :: 
iterator itr_source_term; + vector<string> :: iterator itr_target_term; + Tokenize(spanTermSource,*itr_source); + Tokenize(spanTermTarget,*itr_target); + std::auto_ptr<SpanLengthEstimator> estimatorSource, estimatorTarget; + estimatorSource.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator()); + estimatorTarget.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator()); + //get source scores + iterate(spanTermSource,itr_source_term) + { + unsigned size; + float proba; + sscanf(itr_source_term->c_str(), "%u=%f", &size, &proba); + estimatorSource->AddSpanScore(size, logf(proba)); + } + //get target scores + iterate(spanStringTarget,itr_target_term) + { + unsigned size; + float proba; + sscanf(itr_target_term->c_str(), "%u=%f", &size, &proba); + estimatorTarget->AddSpanScore(size, logf(proba)); + } + estimatorSource->FinishedAdds(ruleTotalCount); + estimatorTarget->FinishedAdds(ruleTotalCount); + spanSourceEstimators.push_back(estimatorSource.release()); + spanTargetEstimators.push_back(estimatorTarget.release()); } - estimatorSource->FinishedAdds(ruleTotalCount); - estimatorTarget->FinishedAdds(ruleTotalCount); - spanSourceEstimators.push_back(estimatorSource.release()); - spanTargetEstimators.push_back(estimatorTarget.release()); } } } diff --git a/moses/src/SpanLengthEstimator.cpp b/moses/src/SpanLengthEstimator.cpp index d568f77e5..be201b2c1 100644 --- a/moses/src/SpanLengthEstimator.cpp +++ b/moses/src/SpanLengthEstimator.cpp @@ -17,6 +17,7 @@ public: virtual void AddSpanScore(unsigned spanLength, float score) { m_scores.insert(make_pair(spanLength, score)); } + virtual float GetScoreBySpanLength(unsigned spanLength) const { // bool useGaussian = StaticData::Instance().GetParam("gaussian-span-length-score").size() > 0; if (m_scores.empty()) @@ -52,6 +53,12 @@ public: m_average += exp(score) * spanLength; m_averageSquare += exp(score) * spanLength * spanLength; } + + virtual void 
AddSpanScore_ISI(unsigned count, float sum_len, float sum_square_len){ + m_average = sum_len / count; + m_averageSquare = sum_square_len /count; + } + virtual float GetScoreBySpanLength(unsigned spanLength) const { float t = ((spanLength - m_average) / m_sigma); float ret = -m_logSqrt2Pi - m_logSigma - 0.5 * t * t; diff --git a/moses/src/SpanLengthEstimator.h b/moses/src/SpanLengthEstimator.h index 6461ff8c8..ca75a2ba4 100644 --- a/moses/src/SpanLengthEstimator.h +++ b/moses/src/SpanLengthEstimator.h @@ -10,7 +10,8 @@ namespace Moses class SpanLengthEstimator { public: - virtual void AddSpanScore(unsigned spanLength, float score) = 0; + virtual void AddSpanScore(unsigned spanLength, float score)=0; + virtual void AddSpanScore_ISI(unsigned count, float sum_len, float sum_square_len){}; virtual float GetScoreBySpanLength(unsigned spanLength) const = 0; virtual void FinishedAdds(unsigned ruleCount) {} virtual ~SpanLengthEstimator() {} diff --git a/moses/src/SpanLengthFeature.cpp b/moses/src/SpanLengthFeature.cpp index 0c36c6ba0..016e11a1a 100644 --- a/moses/src/SpanLengthFeature.cpp +++ b/moses/src/SpanLengthFeature.cpp @@ -130,4 +130,4 @@ FFState* SpanLengthFeature::EvaluateChart( } } -} // namespace moses
\ No newline at end of file +} // namespace moses |