Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
path: root/moses/PP
diff options
context:
space:
mode:
author Hieu Hoang <hieu@hoang.co.uk> 2014-06-16 13:57:02 +0400
committer Hieu Hoang <hieu@hoang.co.uk> 2014-06-16 13:57:02 +0400
commit f2d3f57d07186c2bcbb6adda6510bb2c23f733e0 (patch)
tree ed919df6b1d7bd5cb52f9694a3c7032bb5f23cd8 /moses/PP
parent 72e88c6d328fa5ebf43f6aab58c4793816d58f66 (diff)
parent e2dc8891509943fd6ef864ecc7b45649a0161cf1 (diff)
Merge branch 'hieu' of ../mosesdecoder.hieu
Diffstat (limited to 'moses/PP')
-rw-r--r-- moses/PP/Factory.cpp 5
-rw-r--r-- moses/PP/SpanLengthPhraseProperty.cpp 127
-rw-r--r-- moses/PP/SpanLengthPhraseProperty.h 35
3 files changed, 165 insertions, 2 deletions
diff --git a/moses/PP/Factory.cpp b/moses/PP/Factory.cpp
index 45bdafe36..497eabaff 100644
--- a/moses/PP/Factory.cpp
+++ b/moses/PP/Factory.cpp
@@ -7,6 +7,7 @@
#include "moses/PP/CountsPhraseProperty.h"
#include "moses/PP/SourceLabelsPhraseProperty.h"
#include "moses/PP/TreeStructurePhraseProperty.h"
+#include "moses/PP/SpanLengthPhraseProperty.h"
namespace Moses
{
@@ -54,8 +55,8 @@ PhrasePropertyFactory::PhrasePropertyFactory()
MOSES_PNAME2("Counts", CountsPhraseProperty);
MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
- MOSES_PNAME2("Tree", TreeStructurePhraseProperty);
-
+ MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
+ MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
}
PhrasePropertyFactory::~PhrasePropertyFactory()
diff --git a/moses/PP/SpanLengthPhraseProperty.cpp b/moses/PP/SpanLengthPhraseProperty.cpp
new file mode 100644
index 000000000..d45c7b919
--- /dev/null
+++ b/moses/PP/SpanLengthPhraseProperty.cpp
@@ -0,0 +1,127 @@
+#include "SpanLengthPhraseProperty.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+
+using namespace std;
+
+namespace Moses
+{
+// Default constructor. Nothing is set up explicitly here: the member vectors
+// are default-constructed and therefore start out empty.
+SpanLengthPhraseProperty::SpanLengthPhraseProperty()
+{}
+
+// Parses the raw property string and accumulates span-length counts.
+//
+// `value` is split into fields with Tokenize's default delimiters. Each field
+// is then split on ',' and must contain either
+//   - 3 parts: ntIndex,sourceSpan,targetSpan  (remembered as "pending"), or
+//   - 1 part : a (fractional) count, which is credited to every pending
+//              triple seen since the previous count; the pending set is then
+//              emptied.
+// Any other number of parts raises via UTIL_THROW_IF2.
+// Once all fields are consumed, the per-non-terminal totals of both the
+// source and the target tables are recomputed.
+//
+// NOTE(review): triples that follow the last count (i.e. are never followed
+// by a count field) are silently discarded -- confirm this is intended.
+void SpanLengthPhraseProperty::ProcessValue(const std::string &value)
+{
+  vector<string> fields;
+  Tokenize(fields, value);
+
+  // triples collected since the last count field
+  set< vector<string> > pending;
+
+  for (size_t i = 0; i < fields.size(); ++i) {
+    const string &field = fields[i];
+
+    // is it a ntIndex,sourceSpan,targetSpan or count ?
+    // (distinct name: the original re-used `toks` here, shadowing the outer
+    // vector that the loop is iterating over)
+    vector<string> parts;
+    Tokenize<string>(parts, field, ",");
+    UTIL_THROW_IF2(parts.size() != 1 && parts.size() != 3, "Incorrect format for SpanLength: " << field);
+
+    if (parts.size() == 1) {
+      const float count = Scan<float>(parts[0]);
+      Populate(pending, count);
+
+      pending.clear();
+    }
+    else {
+      pending.insert(parts);
+    }
+  }
+
+  // totals
+  CalcTotals(m_source);
+  CalcTotals(m_target);
+}
+
+// Credits `count` to every triple in `indices`.
+// Each element is a vector of strings that is converted to size_t values with
+// Scan<size_t>; the converted vector must have exactly 3 entries, otherwise
+// UTIL_THROW_IF2 raises.
+void SpanLengthPhraseProperty::Populate(const set< vector<string> > &indices, float count)
+{
+  typedef set< vector<string> >::const_iterator TripleIter;
+
+  for (TripleIter it = indices.begin(); it != indices.end(); ++it) {
+    vector<size_t> toks = Scan<size_t>(*it);
+    UTIL_THROW_IF2(toks.size() != 3, "Incorrect format for SpanLength. Size is " << toks.size());
+
+    Populate(toks, count);
+  }
+}
+
+// Credits `count` to one (ntIndex, sourceLength, targetLength) triple.
+// toks[0] selects the non-terminal slot, toks[1] the source span length and
+// toks[2] the target span length. Both tables are grown together when the
+// slot does not exist yet.
+void SpanLengthPhraseProperty::Populate(const std::vector<size_t> &toks, float count)
+{
+  const size_t ntInd = toks[0];
+
+  if (m_source.size() <= ntInd) {
+    const size_t newSize = ntInd + 1;
+    m_source.resize(newSize);
+    m_target.resize(newSize);
+  }
+
+  Populate(m_source[ntInd].first, toks[1], count);
+  Populate(m_target[ntInd].first, toks[2], count);
+}
+
+// Adds `count` to the entry for `span` in `map`, creating the entry with
+// value `count` if it is not present yet.
+void SpanLengthPhraseProperty::Populate(Map &map, size_t span, float count)
+{
+  // insert() leaves an existing entry untouched and reports that via .second
+  std::pair<Map::iterator, bool> res = map.insert(Map::value_type(span, count));
+  if (!res.second) {
+    res.first->second += count;
+  }
+}
+
+// For every element of `vec`, sums all counts stored in its map (.first) and
+// stores the sum in .second, overwriting any previous total.
+void SpanLengthPhraseProperty::CalcTotals(Vec &vec)
+{
+  for (Vec::iterator entry = vec.begin(); entry != vec.end(); ++entry) {
+    const Map &counts = entry->first;
+
+    float sum = 0;
+    for (Map::const_iterator it = counts.begin(); it != counts.end(); ++it) {
+      sum += it->second;
+    }
+
+    entry->second = sum;
+  }
+}
+
+// Returns the smoothed relative frequency of source span length `sourceWidth`
+// for non-terminal `ntInd`:
+//
+//   (count(sourceWidth) + smoothing) / (total + smoothing * #distinct lengths)
+//
+// where count is 0 for a length that was never recorded, `total` is the value
+// computed by CalcTotals and #distinct lengths is the number of entries in the
+// map for that non-terminal.
+//
+// Returns 1.0f when there is no data for `ntInd` -- either because its map is
+// empty or because `ntInd` is beyond the end of the table (the latter used to
+// index m_source out of range, which is undefined behaviour).
+//
+// NOTE(review): if smoothing is 0 and all recorded counts are 0 the division
+// below is 0/0 -- confirm callers never do that.
+float SpanLengthPhraseProperty::GetProb(size_t ntInd, size_t sourceWidth, float smoothing) const
+{
+  // no slot for this non-terminal at all: treat like "no data"
+  if (ntInd >= m_source.size()) {
+    return 1.0f;
+  }
+
+  const std::pair<Map, float> &data = m_source[ntInd];
+  const Map &map = data.first;
+
+  if (map.empty()) {
+    // should this ever be reached? there shouldn't be any span length property so FF shouldn't call this
+    return 1.0f;
+  }
+
+  Map::const_iterator iter = map.find(sourceWidth);
+  float count = (iter == map.end()) ? 0.0f : iter->second;
+  count += smoothing;
+
+  const float total = data.second + smoothing * static_cast<float>(map.size());
+  return count / total;
+}
+
+}
diff --git a/moses/PP/SpanLengthPhraseProperty.h b/moses/PP/SpanLengthPhraseProperty.h
new file mode 100644
index 000000000..982c3ca0d
--- /dev/null
+++ b/moses/PP/SpanLengthPhraseProperty.h
@@ -0,0 +1,35 @@
+
+#pragma once
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include "moses/PP/PhraseProperty.h"
+
+namespace Moses
+{
+
+// Phrase property that stores, per non-terminal index, fractional counts of
+// span lengths for the source side and for the target side.
+class SpanLengthPhraseProperty : public PhraseProperty
+{
+public:
+  SpanLengthPhraseProperty();
+
+  // Parses the property string and fills the source/target tables.
+  void ProcessValue(const std::string &value);
+
+  // Smoothed relative frequency of source span length `sourceWidth` for
+  // non-terminal `ntInd`; `smoothing` is added to the count and, once per
+  // recorded length, to the total.
+  float GetProb(size_t ntInd, size_t sourceWidth, float smoothing) const;
+
+protected:
+  // span length -> fractional count
+  typedef std::map<size_t, float> Map;
+  // indexed by non-terminal index; .first = counts, .second = their total
+  typedef std::vector<std::pair<Map, float> > Vec;
+
+  Vec m_source;
+  Vec m_target;
+
+  // Credit `count` to every (ntIndex, sourceSpan, targetSpan) string triple.
+  void Populate(const std::set< std::vector<std::string> > &indices, float count);
+  // Credit `count` to a single already-converted triple.
+  void Populate(const std::vector<size_t> &toks, float count);
+  // Add `count` to map[span], creating the entry if needed.
+  void Populate(Map &map, size_t span, float count);
+
+  // Recompute the .second total of every element of `vec`.
+  void CalcTotals(Vec &vec);
+};
+
+} // namespace Moses
+