Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: maria <maria@buri.inf.ed.ac.uk> 2012-09-24 21:27:49 +0400
committer: maria <maria@buri.inf.ed.ac.uk> 2012-09-24 21:27:49 +0400
commit: 026a7cc7577861598954331f7f4dbcdbe69a1693 (patch)
tree: 79ff810ba6a17b4e6150f6a5c269671b169d9540
parent: dcfc3d446482ddeb1ed9424222db583bb7fe69ef (diff)
added ISI format for gaussian span_length feature
-rw-r--r-- moses/src/Parameter.cpp | 1
-rw-r--r-- moses/src/RuleTable/LoaderStandard.cpp | 134
-rw-r--r-- moses/src/SpanLengthEstimator.cpp | 7
-rw-r--r-- moses/src/SpanLengthEstimator.h | 3
-rw-r--r-- moses/src/SpanLengthFeature.cpp | 2
5 files changed, 93 insertions, 54 deletions
diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp
index 58ef70ee2..a16fe3ae5 100644
--- a/moses/src/Parameter.cpp
+++ b/moses/src/Parameter.cpp
@@ -155,6 +155,7 @@ Parameter::Parameter()
AddParam("weight-span-length", "SL", "Weight for span length feature. Set this to activate it, if it is empty, feature will not be used");
AddParam("gaussian-span-length-score", "Use Gaussian pdf to calculate span length probability instead of unsmoothed counts");
+ AddParam("isi-format-for-span-length","Read parameters for Gaussian pdf in ISI format (count,sum(len),sum(len^2))");
AddParam("weight-crossing", "CR", "weight for non-term crossing feature. Set this to activate it, if it is empty, feature will not be used");
AddParam("crossing-file", "Data file for crossing feature. Line format: [span-length] [non-term] [is-crossing] [probability]");
diff --git a/moses/src/RuleTable/LoaderStandard.cpp b/moses/src/RuleTable/LoaderStandard.cpp
index 6ce14e048..35462b502 100644
--- a/moses/src/RuleTable/LoaderStandard.cpp
+++ b/moses/src/RuleTable/LoaderStandard.cpp
@@ -224,7 +224,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
//get rule counts tokens[4] -> count(t) assume that extract was run with --NoFractionalCounting flag
const std::string &ruleCount = tokens[4];
vector<string> countStrings;
- unsigned ruleTotalCount;
+ unsigned ruleTotalCount = 1;
TokenizeMultiCharSeparator(countStrings,ruleCount," ");
if(countStrings.size()>=1)
sscanf(countStrings[0].c_str(), "%u", &ruleTotalCount);
@@ -234,61 +234,91 @@ bool RuleTableLoaderStandard::Load(FormatType format
std::vector<SpanLengthEstimator*> spanSourceEstimators, spanTargetEstimators;
if (tokens.size() >= 6) {
bool useGaussian = (StaticData::Instance().GetParam("gaussian-span-length-score").size() > 0);
-
+ bool useISIFormat = (StaticData::Instance().GetParam("isi-format-for-span-length").size() > 0);
+
const std::string &spanLength = tokens[5];
- //source and target side are separated by ||
- TokenizeMultiCharSeparator(spanStringsST,spanLength,"||");
+
+ //use ISI format for span_length information in rule table
+ // rule_count | sum_NT1(len) | sum_NT1(len^2) || rule_count | sum_NT2(len) | sum_NT2(len^2) ...
+ if (useISIFormat == true ){
+ vector<string> spanStatisticsSource;
+ vector<string>::iterator itr_statistics;
+ unsigned count = 1;
+ float sum_len=0.0f, sum_square_len=0.0f;
+
+ TokenizeMultiCharSeparator(spanStatisticsSource,spanLength,"||");
+ for(itr_statistics = spanStatisticsSource.begin(); itr_statistics != spanStatisticsSource.end(); itr_statistics++){
+ vector<string> gaussParam;
+ TokenizeMultiCharSeparator(gaussParam,*itr_statistics,"|");
+ if(gaussParam.size()<3) continue;
+ sscanf(gaussParam[0].c_str(),"%u", &count);
+ sscanf(gaussParam[1].c_str(),"%f", &sum_len);
+ sscanf(gaussParam[2].c_str(),"%f", &sum_square_len);
+
+ std::auto_ptr<SpanLengthEstimator> estimatorSource;
+ estimatorSource.reset(CreateGaussianSpanLengthEstimator());
+ estimatorSource->AddSpanScore_ISI(count,sum_len,sum_square_len);
+ estimatorSource->FinishedAdds(count);
+ spanSourceEstimators.push_back(estimatorSource.release());
+ }
+
+ }
+ // use (len=score) format for span_length information in rule table
+ else{
+
+ //source and target side are separated by ||
+ TokenizeMultiCharSeparator(spanStringsST,spanLength,"||");
- //we consider only source and target information
- //CHECK(spanStringsST.size() =< 3);
-
- if(spanStringsST.size()>=2){
-
- //Take scores from source
- string spanLengthSource = spanStringsST[0];
- //Take scores form target
- string spanLengthTarget = spanStringsST[1];
-
- TokenizeMultiCharSeparator(spanStringSource,spanLengthSource,"|");
- TokenizeMultiCharSeparator(spanStringTarget,spanLengthTarget,"|");
-
- //Check that number of non terminals is the same on both sides
- CHECK(spanStringSource.size() == spanStringTarget.size());
- vector<string>::iterator itr_source;
- vector<string>::iterator itr_target;
- for(itr_source = spanStringSource.begin(), itr_target = spanStringTarget.begin();
- itr_source != spanStringSource.end(), itr_target != spanStringTarget.end();
- itr_source++, itr_target++)
- {
- vector<string> spanTermSource;
- vector<string> spanTermTarget;
- vector<string> :: iterator itr_source_term;
- vector<string> :: iterator itr_target_term;
- Tokenize(spanTermSource,*itr_source);
- Tokenize(spanTermTarget,*itr_target);
- std::auto_ptr<SpanLengthEstimator> estimatorSource, estimatorTarget;
- estimatorSource.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator());
- estimatorTarget.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator());
- //get source scores
- iterate(spanTermSource,itr_source_term)
- {
- unsigned size;
- float proba;
- sscanf(itr_source_term->c_str(), "%u=%f", &size, &proba);
- estimatorSource->AddSpanScore(size, logf(proba));
- }
- //get target scores
- iterate(spanStringTarget,itr_target_term)
+ //we consider only source and target information
+
+ if(spanStringsST.size()>=2){
+
+ //Take scores from source
+ string spanLengthSource = spanStringsST[0];
+ //Take scores form target
+ string spanLengthTarget = spanStringsST[1];
+
+ TokenizeMultiCharSeparator(spanStringSource,spanLengthSource,"|");
+ TokenizeMultiCharSeparator(spanStringTarget,spanLengthTarget,"|");
+
+ //Check that number of non terminals is the same on both sides
+ CHECK(spanStringSource.size() == spanStringTarget.size());
+ vector<string>::iterator itr_source;
+ vector<string>::iterator itr_target;
+ for(itr_source = spanStringSource.begin(), itr_target = spanStringTarget.begin();
+ itr_source != spanStringSource.end(), itr_target != spanStringTarget.end();
+ itr_source++, itr_target++)
{
- unsigned size;
- float proba;
- sscanf(itr_target_term->c_str(), "%u=%f", &size, &proba);
- estimatorTarget->AddSpanScore(size, logf(proba));
+ vector<string> spanTermSource;
+ vector<string> spanTermTarget;
+ vector<string> :: iterator itr_source_term;
+ vector<string> :: iterator itr_target_term;
+ Tokenize(spanTermSource,*itr_source);
+ Tokenize(spanTermTarget,*itr_target);
+ std::auto_ptr<SpanLengthEstimator> estimatorSource, estimatorTarget;
+ estimatorSource.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator());
+ estimatorTarget.reset(useGaussian ? CreateGaussianSpanLengthEstimator() : CreateAsIsSpanLengthEstimator());
+ //get source scores
+ iterate(spanTermSource,itr_source_term)
+ {
+ unsigned size;
+ float proba;
+ sscanf(itr_source_term->c_str(), "%u=%f", &size, &proba);
+ estimatorSource->AddSpanScore(size, logf(proba));
+ }
+ //get target scores
+ iterate(spanStringTarget,itr_target_term)
+ {
+ unsigned size;
+ float proba;
+ sscanf(itr_target_term->c_str(), "%u=%f", &size, &proba);
+ estimatorTarget->AddSpanScore(size, logf(proba));
+ }
+ estimatorSource->FinishedAdds(ruleTotalCount);
+ estimatorTarget->FinishedAdds(ruleTotalCount);
+ spanSourceEstimators.push_back(estimatorSource.release());
+ spanTargetEstimators.push_back(estimatorTarget.release());
}
- estimatorSource->FinishedAdds(ruleTotalCount);
- estimatorTarget->FinishedAdds(ruleTotalCount);
- spanSourceEstimators.push_back(estimatorSource.release());
- spanTargetEstimators.push_back(estimatorTarget.release());
}
}
}
diff --git a/moses/src/SpanLengthEstimator.cpp b/moses/src/SpanLengthEstimator.cpp
index d568f77e5..be201b2c1 100644
--- a/moses/src/SpanLengthEstimator.cpp
+++ b/moses/src/SpanLengthEstimator.cpp
@@ -17,6 +17,7 @@ public:
virtual void AddSpanScore(unsigned spanLength, float score) {
m_scores.insert(make_pair(spanLength, score));
}
+
virtual float GetScoreBySpanLength(unsigned spanLength) const {
// bool useGaussian = StaticData::Instance().GetParam("gaussian-span-length-score").size() > 0;
if (m_scores.empty())
@@ -52,6 +53,12 @@ public:
m_average += exp(score) * spanLength;
m_averageSquare += exp(score) * spanLength * spanLength;
}
+
+ virtual void AddSpanScore_ISI(unsigned count, float sum_len, float sum_square_len){
+ m_average = sum_len / count;
+ m_averageSquare = sum_square_len /count;
+ }
+
virtual float GetScoreBySpanLength(unsigned spanLength) const {
float t = ((spanLength - m_average) / m_sigma);
float ret = -m_logSqrt2Pi - m_logSigma - 0.5 * t * t;
diff --git a/moses/src/SpanLengthEstimator.h b/moses/src/SpanLengthEstimator.h
index 6461ff8c8..ca75a2ba4 100644
--- a/moses/src/SpanLengthEstimator.h
+++ b/moses/src/SpanLengthEstimator.h
@@ -10,7 +10,8 @@ namespace Moses
class SpanLengthEstimator
{
public:
- virtual void AddSpanScore(unsigned spanLength, float score) = 0;
+ virtual void AddSpanScore(unsigned spanLength, float score)=0;
+ virtual void AddSpanScore_ISI(unsigned count, float sum_len, float sum_square_len){};
virtual float GetScoreBySpanLength(unsigned spanLength) const = 0;
virtual void FinishedAdds(unsigned ruleCount) {}
virtual ~SpanLengthEstimator() {}
diff --git a/moses/src/SpanLengthFeature.cpp b/moses/src/SpanLengthFeature.cpp
index 0c36c6ba0..016e11a1a 100644
--- a/moses/src/SpanLengthFeature.cpp
+++ b/moses/src/SpanLengthFeature.cpp
@@ -130,4 +130,4 @@ FFState* SpanLengthFeature::EvaluateChart(
}
}
-} // namespace moses
\ No newline at end of file
+} // namespace moses