add StoreVocab

author: Hieu Hoang <hieuhoang@gmail.com> 2016-10-05 18:43:04 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2016-10-05 18:43:04 +0300
commit: 7d7ae1b72ca6487cd50dba6d20d0ba4a4b08b782 (patch)
tree: 49473051b7181a047920836c6c8abfbf59453956
parent: 0e4e64b26dd3b82a0dfbfe2445f89e1dcbbdf61a (diff)
4 files changed, 394 insertions, 0 deletions
diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.cpp b/moses/TranslationModel/ProbingPT/StoreTarget.cpp
new file mode 100644
index 000000000..8072f408b
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/StoreTarget.cpp
@@ -0,0 +1,266 @@
+/*
+ * StoreTarget.cpp
+ *
+ *  Created on: 19 Jan 2016
+ *      Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "StoreTarget.h"
+#include "line_splitter.hh"
+#include "probing_hash_utils.hh"
+#include "moses/OutputFileStream.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+namespace Moses
+{
+
+// Remember the output directory, point the vocab at <basepath>/TargetVocab.dat and
+// create (truncating) <basepath>/TargetColl.dat; throws a C string if that fails.
+StoreTarget::StoreTarget(const std::string &basepath)
+  : m_basePath(basepath), m_vocab(basepath + "/TargetVocab.dat")
+{
+  const std::ios::openmode mode =
+      std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc;
+  const std::string collPath = basepath + "/TargetColl.dat";
+  m_fileTargetColl.open(collPath.c_str(), mode);
+
+  if (!m_fileTargetColl.is_open()) throw "can't create file ";
+}
+
+// Closes TargetColl.dat and saves the vocab; expects Save() to have emptied m_coll.
+StoreTarget::~StoreTarget()
+{
+  assert(m_coll.empty());
+  m_fileTargetColl.close();
+  // write the vocabulary (its path was fixed in the constructor)
+  m_vocab.Save();
+}
+
+// Flush all queued rules as one collection: a uint64_t count, then each rule's
+// record. Empties m_coll and returns the file offset where the collection starts.
+uint64_t StoreTarget::Save()
+{
+  // offset at which this collection begins
+  const uint64_t startPos = m_fileTargetColl.tellp();
+
+  // header: number of target phrases that follow
+  const uint64_t numTP = m_coll.size();
+  m_fileTargetColl.write(reinterpret_cast<const char*>(&numTP), sizeof(numTP));
+
+  BOOST_FOREACH(const target_text *rule, m_coll) {
+    Save(*rule);
+  }
+
+  RemoveAllInColl(m_coll); // NOTE(review): presumably deletes each rule - confirm
+  m_coll.clear();
+  return startPos;
+}
+
+// Serialise one rule: a TargetPhraseInfo header, then its scores as floats,
+// then its target-side vocab ids as uint32_t. Properties are not written yet.
+void StoreTarget::Save(const target_text &rule)
+{
+  // per-phrase header; alignments are stored as ids from GetAlignId()
+  TargetPhraseInfo header;
+  header.alignTerm = GetAlignId(rule.word_align_term);
+  header.alignNonTerm = GetAlignId(rule.word_align_non_term);
+  header.numWords = rule.target_phrase.size();
+  header.propLength = rule.property.size();
+  m_fileTargetColl.write(reinterpret_cast<const char*>(&header),
+      sizeof(header));
+
+  // scores
+  BOOST_FOREACH(float score, rule.prob) {
+    m_fileTargetColl.write(reinterpret_cast<const char*>(&score),
+        sizeof(score));
+  }
+
+  // target phrase
+  BOOST_FOREACH(uint32_t vocabId, rule.target_phrase) {
+    m_fileTargetColl.write(reinterpret_cast<const char*>(&vocabId),
+        sizeof(vocabId));
+  }
+
+  // prop TODO
+}
+
+// Write the alignment table to <basePath>/Alignments.dat, one entry per line:
+// the id, a tab, then every position followed by a single space.
+void StoreTarget::SaveAlignment()
+{
+  std::string outPath = m_basePath + "/Alignments.dat";
+  OutputFileStream file(outPath);
+  Alignments::const_iterator entry;
+  for (entry = m_aligns.begin(); entry != m_aligns.end(); ++entry) {
+    file << entry->second << "\t";
+    const std::vector<size_t> &positions = entry->first;
+    for (size_t i = 0; i < positions.size(); ++i) {
+      file << positions[i] << " ";
+    }
+    file << endl;
+  }
+}
+
+// Parse one phrase-table line into a new target_text and queue it in m_coll
+// until the next Save().
+//  line     - supplies the target phrase, scores, alignment and properties
+//  log_prob - if true, scores are stored as floored logs instead of as read
+//  scfg     - if true, target words of the form [...] count as non-terminals
+void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
+{
+  target_text *rule = new target_text;
+
+  // target_phrase: words are split on ' ', each word into factors on '|',
+  // and every factor is stored as one vocab id
+  vector<bool> nonTerms; // one flag per target word, filled only if scfg
+  util::TokenIter<util::SingleCharacter> it;
+  it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
+      util::SingleCharacter(' '));
+  while (it) {
+    StringPiece word = *it;
+
+    if (scfg) {
+      // not really sure how to handle factored SCFG and NT
+      // (an empty token must not be indexed, so check for it first)
+      bool nonTerm = !word.empty() && word[0] == '['
+          && word[word.size() - 1] == ']';
+      nonTerms.push_back(nonTerm);
+    }
+
+    util::TokenIter<util::SingleCharacter> itFactor;
+    itFactor = util::TokenIter<util::SingleCharacter>(word,
+        util::SingleCharacter('|'));
+    while (itFactor) {
+      StringPiece factor = *itFactor;
+
+      string factorStr = factor.as_string();
+      uint32_t vocabId = m_vocab.GetVocabId(factorStr);
+
+      rule->target_phrase.push_back(vocabId);
+
+      ++itFactor;
+    }
+
+    ++it;
+  }
+
+  // probs
+  it = util::TokenIter<util::SingleCharacter>(line.prob,
+      util::SingleCharacter(' '));
+  while (it) {
+    string tok = it->as_string();
+    float prob = Scan<float>(tok);
+
+    if (log_prob) {
+      prob = FloorScore(log(prob));
+      if (prob == 0.0f) prob = 0.0000000001;
+    }
+
+    rule->prob.push_back(prob);
+    ++it;
+  }
+
+  /*
+  cerr << "nonTerms=";
+  for (size_t i = 0; i < nonTerms.size(); ++i) {
+    cerr << nonTerms[i] << " ";
+  }
+  cerr << endl;
+  */
+
+  // alignment: "source-target" pairs, stored flattened as source,target,...
+  // in one of two lists depending on whether the target word is a non-terminal
+  it = util::TokenIter<util::SingleCharacter>(line.word_align,
+      util::SingleCharacter(' '));
+  while (it) {
+    string tokPair = Trim(it->as_string());
+    if (tokPair.empty()) {
+      break;
+    }
+
+    vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
+    assert(alignPair.size() == 2);
+
+    size_t sourcePos = alignPair[0];
+    size_t targetPos = alignPair[1];
+
+    // nonTerms is empty unless scfg; a target position beyond its end is
+    // treated as a terminal instead of reading past the end of the vector
+    bool nonTerm = targetPos < nonTerms.size() && nonTerms[targetPos];
+
+    if (nonTerm) {
+      rule->word_align_non_term.push_back(sourcePos);
+      rule->word_align_non_term.push_back(targetPos);
+    }
+    else {
+      rule->word_align_term.push_back(sourcePos);
+      rule->word_align_term.push_back(targetPos);
+    }
+
+    ++it;
+  }
+
+  // extra scores
+  string prop = line.property.as_string();
+  AppendLexRO(prop, rule->prob, log_prob);
+
+  //cerr << "line.property=" << line.property << endl;
+  //cerr << "prop=" << prop << endl;
+
+  // properties
+  /*
+   for (size_t i = 0; i < prop.size(); ++i) {
+   rule->property.push_back(prop[i]);
+   }
+   */
+  m_coll.push_back(rule);
+}
+
+// Return the id for an alignment vector, handing out the next free id
+// (the current table size) the first time a vector is seen.
+uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
+{
+  Alignments::iterator found = m_aligns.find(align);
+  if (found != m_aligns.end()) {
+    return found->second;
+  }
+
+  const uint32_t newId = m_aligns.size();
+  m_aligns.insert(Alignments::value_type(align, newId));
+  return newId;
+}
+
+// Pull the "{{LexRO ...}}" property out of prop: its space-separated scores
+// are appended to retvector (as floored logs if log_prob) and the property is
+// cut out of prop. Does nothing when prop has no LexRO property.
+void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
+    bool log_prob) const
+{
+  const std::string opening("{{LexRO ");
+  const size_t startPos = prop.find(opening);
+  if (startPos == string::npos) return;
+
+  // Without a closing "}}" endPos is npos; treat the property as running to
+  // the end of prop rather than letting endPos + 2 wrap around.
+  const size_t scoresPos = startPos + opening.size();
+  const size_t endPos = prop.find("}}", scoresPos);
+  const bool closed = endPos != string::npos;
+  const size_t scoresLen = (closed ? endPos : prop.size()) - scoresPos;
+
+  // append lex probs to pt probs
+  vector<float> scores = Tokenize<float>(prop.substr(scoresPos, scoresLen));
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (log_prob) {
+      scores[i] = FloorScore(log(scores[i]));
+      if (scores[i] == 0.0f) scores[i] = 0.0000000001;
+    }
+    retvector.push_back(scores[i]);
+  }
+
+  // exclude LexRO property from property column
+  prop.erase(startPos, closed ? endPos + 2 - startPos : string::npos);
+}
+
+} /* namespace Moses */
diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.h b/moses/TranslationModel/ProbingPT/StoreTarget.h
new file mode 100644
index 000000000..5c7d9e1b7
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/StoreTarget.h
@@ -0,0 +1,51 @@
+/*
+ * StoreTarget.h
+ *
+ *  Created on: 19 Jan 2016
+ *      Author: hieu
+ */
+#pragma once
+#include <string>
+#include <fstream>
+#include <vector>
+#include <inttypes.h>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "StoreVocab.h"
+
+namespace Moses
+{
+
+class line_text;
+class target_text;
+
+// Queues target phrases via Append() and writes them to TargetColl.dat under a
+// base directory, alongside the target vocab and the alignment table.
+class StoreTarget
+{
+public:
+  StoreTarget(const std::string &basepath); // opens <basepath>/TargetColl.dat
+  virtual ~StoreTarget();
+  // writes the rules queued by Append(); returns the file offset they start at
+  uint64_t Save();
+  void SaveAlignment();
+  // parses one phrase-table line and queues the result until the next Save()
+  void Append(const line_text &line, bool log_prob, bool scfg);
+protected:
+  std::string m_basePath;
+  std::fstream m_fileTargetColl;
+  StoreVocab<uint32_t> m_vocab;
+  // alignment vector -> id, assigned in order of first use by GetAlignId()
+  typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
+  Alignments m_aligns;
+  // rules allocated by Append() and not yet written by Save()
+  std::vector<target_text*> m_coll;
+  uint32_t GetAlignId(const std::vector<size_t> &align);
+  void Save(const target_text &rule);
+  // moves the scores of a "{{LexRO ...}}" property out of prop into retvector
+  void AppendLexRO(std::string &prop, std::vector<float> &retvector,
+      bool log_prob) const;
+};
+
+} /* namespace Moses */
+
diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.cpp b/moses/TranslationModel/ProbingPT/StoreVocab.cpp
new file mode 100644
index 000000000..6515bac63
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/StoreVocab.cpp
@@ -0,0 +1,13 @@
+/*
+ * StoreVocab.cpp
+ *
+ *  Created on: 15 Jun 2016
+ *      Author: hieu
+ */
+#include <fstream>
+#include "StoreVocab.h"
+
+namespace Moses
+{
+
+} /* namespace Moses */
diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.h b/moses/TranslationModel/ProbingPT/StoreVocab.h
new file mode 100644
index 000000000..05d279f4c
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/StoreVocab.h
@@ -0,0 +1,64 @@
+/*
+ * StoreVocab.h
+ *
+ *  Created on: 15 Jun 2016
+ *      Author: hieu
+ */
+#pragma once
+#include <string>
+#include <boost/unordered_map.hpp>
+#include "moses/OutputFileStream.h"
+#include "moses/Util.h"
+
+namespace Moses
+{
+
+// String -> id vocabulary that is built in memory and written out as text.
+// VOCABID is the integer type used for the ids.
+template<typename VOCABID>
+class StoreVocab
+{
+protected:
+  typedef boost::unordered_map<std::string, VOCABID> Coll;
+  std::string m_path; // file written by Save()
+  Coll m_vocab;
+
+public:
+  StoreVocab(const std::string &path) : m_path(path) {}
+
+  virtual ~StoreVocab() {}
+
+  // Id of word; a word not seen before is stored with the id
+  // (number of stored words + 1).
+  VOCABID GetVocabId(const std::string &word)
+  {
+    const typename Coll::iterator known = m_vocab.find(word);
+    if (known != m_vocab.end()) {
+      return known->second;
+    }
+
+    const VOCABID freshId = m_vocab.size() + 1;
+    m_vocab[word] = freshId;
+    return freshId;
+  }
+
+  // Store word under an explicit id, replacing any id it already had.
+  void Insert(VOCABID id, const std::string &word)
+  {
+    m_vocab[word] = id;
+  }
+
+  // Write every entry to m_path as "<word>\t<id>", one entry per line.
+  void Save()
+  {
+    OutputFileStream strme(m_path);
+    for (typename Coll::const_iterator entry = m_vocab.begin();
+        entry != m_vocab.end(); ++entry) {
+      strme << entry->first << "\t" << entry->second << std::endl;
+    }
+    strme.Close();
+  }
+};
+
+} /* namespace Moses */
+
author	Hieu Hoang <hieuhoang@gmail.com>	2016-10-05 18:43:04 +0300
committer	Hieu Hoang <hieuhoang@gmail.com>	2016-10-05 18:43:04 +0300
commit	7d7ae1b72ca6487cd50dba6d20d0ba4a4b08b782 (patch)
tree	49473051b7181a047920836c6c8abfbf59453956
parent	0e4e64b26dd3b82a0dfbfe2445f89e1dcbbdf61a (diff)