Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Eva <eva@deimos.(none)> 2012-05-18 22:09:04 +0400
committer: Eva <eva@deimos.(none)> 2012-05-18 22:09:04 +0400
commit: 4e9babf040c35831a643981bb15e76f1579afeb5 (patch)
tree: cde9ba14fc83c67599d56e9892e332a1a4973e5d /OnDiskPt
parent: 9e69a12b0295d4b83035e23c5151cc62c59016fd (diff)
enable binarization of rule tables with terminal alignments and new rule count
Diffstat (limited to 'OnDiskPt')
-rwxr-xr-x OnDiskPt/Main.cpp 23
-rwxr-xr-x OnDiskPt/Main.h 4
-rwxr-xr-x OnDiskPt/PhraseNode.cpp 16
-rwxr-xr-x OnDiskPt/PhraseNode.h 5
-rwxr-xr-x OnDiskPt/TargetPhrase.cpp 61
-rwxr-xr-x OnDiskPt/TargetPhrase.h 12
-rwxr-xr-x OnDiskPt/TargetPhraseCollection.cpp 17
-rwxr-xr-x OnDiskPt/Vocab.cpp 3
8 files changed, 101 insertions, 40 deletions
diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
index 5b3ac6cb8..a078b96ff 100755
--- a/OnDiskPt/Main.cpp
+++ b/OnDiskPt/Main.cpp
@@ -76,10 +76,10 @@ int main (int argc, char * const argv[])
std::vector<float> misc(1);
SourcePhrase sourcePhrase;
TargetPhrase *targetPhrase = new TargetPhrase(numScores);
- Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
+ OnDiskPt::Phrase *spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
assert(misc.size() == onDiskWrapper.GetNumCounts());
- rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc);
+ rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
}
rootNode.Save(onDiskWrapper, 0, tableLimit);
@@ -104,7 +104,7 @@ bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::Sourc
return ret;
}
-void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
+OnDiskPt::Phrase *Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
size_t scoreInd = 0;
@@ -117,13 +117,17 @@ void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line
4 = count
*/
char *tok = strtok (line," ");
+ OnDiskPt::Phrase *out = new Phrase();
while (tok != NULL) {
if (0 == strcmp(tok, "|||")) {
++stage;
} else {
switch (stage) {
case 0: {
- Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
+ Word *w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
+ if (w != NULL)
+ out->AddWord(w);
+
break;
}
case 1: {
@@ -176,10 +180,10 @@ void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line
assert(scoreInd == numScores);
targetPhrase.SortAlign();
-
+ return out;
} // Tokenize()
-void Tokenize(OnDiskPt::Phrase &phrase
+OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
@@ -193,6 +197,7 @@ void Tokenize(OnDiskPt::Phrase &phrase
nonTerm = comStr == 0;
}
+ OnDiskPt::Word *out = NULL;
if (nonTerm) {
// non-term
size_t splitPos = token.find_first_of("[", 2);
@@ -208,7 +213,7 @@ void Tokenize(OnDiskPt::Phrase &phrase
if (addSourceNonTerm) {
Word *word = new Word();
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
- phrase.AddWord(word);
+ phrase.AddWord(word);
}
wordStr = token.substr(splitPos, tokSize - splitPos);
@@ -216,6 +221,7 @@ void Tokenize(OnDiskPt::Phrase &phrase
Word *word = new Word();
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
+ out = word;
}
}
@@ -224,7 +230,10 @@ void Tokenize(OnDiskPt::Phrase &phrase
Word *word = new Word();
word->CreateFromString(token, onDiskWrapper.GetVocab());
phrase.AddWord(word);
+ out = word;
}
+
+ return out;
}
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments)
diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h
index 41a24a239..2db4eb864 100755
--- a/OnDiskPt/Main.h
+++ b/OnDiskPt/Main.h
@@ -25,10 +25,10 @@
typedef std::pair<size_t, size_t> AlignPair;
typedef std::vector<AlignPair> AlignType;
-void Tokenize(OnDiskPt::Phrase &phrase
+OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
-void Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
+OnDiskPt::Phrase *Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores
, std::vector<float> &misc);
diff --git a/OnDiskPt/PhraseNode.cpp b/OnDiskPt/PhraseNode.cpp
index 98a55dbc1..0b5491963 100755
--- a/OnDiskPt/PhraseNode.cpp
+++ b/OnDiskPt/PhraseNode.cpp
@@ -56,9 +56,9 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
std::fstream &file = onDiskWrapper.GetFileSource();
file.seekg(filePos);
CHECK(filePos == file.tellg());
-
+
file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
-
+
size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
m_memLoad = (char*) malloc(memAlloc);
@@ -160,15 +160,15 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
- , const std::vector<float> &counts)
+ , const std::vector<float> &counts, OnDiskPt::Phrase *spShort)
{
- AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts);
+ AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
}
void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
- , size_t tableLimit, const std::vector<float> &counts)
-{
+ , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::Phrase *spShort)
+{
size_t phraseSize = sourcePhrase.GetSize();
if (pos < phraseSize) {
const Word &word = sourcePhrase.GetWord(pos);
@@ -185,10 +185,12 @@ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
m_currChild = &node;
}
- node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts);
+ // keep searching for target phrase node..
+ node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
} else {
// drilled down to the right node
m_counts = counts;
+ targetPhrase->SetSourcePhrase(spShort);
m_targetPhraseColl.AddTargetPhrase(targetPhrase);
}
}
diff --git a/OnDiskPt/PhraseNode.h b/OnDiskPt/PhraseNode.h
index 279ca278a..e4704d142 100755
--- a/OnDiskPt/PhraseNode.h
+++ b/OnDiskPt/PhraseNode.h
@@ -23,6 +23,7 @@
#include <map>
#include "Word.h"
#include "TargetPhraseCollection.h"
+#include "Phrase.h"
namespace OnDiskPt
{
@@ -50,7 +51,7 @@ protected:
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
- , size_t tableLimit, const std::vector<float> &counts);
+ , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::Phrase *spShort);
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem, size_t numFactors) const;
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
@@ -66,7 +67,7 @@ public:
void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
- , const std::vector<float> &counts);
+ , const std::vector<float> &counts, OnDiskPt::Phrase *spShort);
UINT64 GetFilePos() const {
return m_filePos;
diff --git a/OnDiskPt/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp
index aedb2a37e..182181bbf 100755
--- a/OnDiskPt/TargetPhrase.cpp
+++ b/OnDiskPt/TargetPhrase.cpp
@@ -98,9 +98,16 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
{
size_t phraseSize = GetSize();
size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
-
+
+ const Phrase* sp = GetSourcePhrase();
+ size_t spSize = sp->GetSize();
+ size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
+
size_t memNeeded = sizeof(UINT64) // num of words
- + targetWordSize * phraseSize; // actual words. lhs as last words
+ + targetWordSize * phraseSize // actual words. lhs as last words
+ + sizeof(UINT64) // num source words
+ + sourceWordSize * spSize; // actual source words
+
memUsed = 0;
UINT64 *mem = (UINT64*) malloc(memNeeded);
@@ -115,6 +122,17 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
memUsed += word.WriteToMemory((char*) currPtr);
}
+ // write size of source phrase and all source words
+ char *currPtr = (char*)mem + memUsed;
+ UINT64 *memTmp = (UINT64*) currPtr;
+ memTmp[0] = spSize;
+ memUsed += sizeof(UINT64);
+ for (size_t pos = 0; pos < spSize; ++pos) {
+ const Word &word = sp->GetWord(pos);
+ char *currPtr = (char*)mem + memUsed;
+ memUsed += word.WriteToMemory((char*) currPtr);
+ }
+
CHECK(memUsed == memNeeded);
return (char *) mem;
}
@@ -191,7 +209,6 @@ size_t TargetPhrase::WriteAlignToMemory(char *mem) const
memUsed += sizeof(alignPair.second);
}
- std::cerr << "align memory used: " << memUsed << std::endl;
return memUsed;
}
@@ -207,7 +224,7 @@ size_t TargetPhrase::WriteScoresToMemory(char *mem) const
}
-Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & /*inputFactors */
+Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
, const std::vector<Moses::FactorType> &outputFactors
, const Vocab &vocab
, const Moses::PhraseDictionary &phraseDict
@@ -232,17 +249,31 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
ret->SetScoreChart(phraseDict.GetFeature(), m_scores, weightT, lmList, wpProducer);
// alignments
+ int indicator[m_align.size()];
+ int index = 0;
std::set<std::pair<size_t, size_t> > alignmentInfo;
+ const Phrase* sp = GetSourcePhrase();
for (size_t ind = 0; ind < m_align.size(); ++ind) {
const std::pair<size_t, size_t> &entry = m_align[ind];
alignmentInfo.insert(entry);
+ size_t sourcePos = entry.first;
+ indicator[index++] = sp->GetWord(sourcePos).IsNonTerminal() ? 1: 0;
}
- ret->SetAlignmentInfo(alignmentInfo);
+ ret->SetAlignmentInfo(alignmentInfo, indicator);
Moses::Word *lhs = GetWord(GetSize() - 1).ConvertToMoses(Moses::Output, outputFactors, vocab);
ret->SetTargetLHS(*lhs);
delete lhs;
-
+
+ // set source phrase
+ Moses::Phrase *mosesSP = new Moses::Phrase(Moses::Input);
+ for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
+ Moses::Word *mosesWord = sp->GetWord(pos).ConvertToMoses(Moses::Input, inputFactors, vocab);
+ mosesSP->AddWord(*mosesWord);
+ delete mosesWord;
+ }
+ ret->SetSourcePhrase(*mosesSP);
+
return ret;
}
@@ -264,7 +295,7 @@ UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPC
return memUsed;
}
-UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors)
+UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors, size_t numSourceFactors)
{
UINT64 bytesRead = 0;
@@ -279,20 +310,31 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors)
bytesRead += word->ReadFromFile(fileTP, numFactors);
AddWord(word);
}
+
+ // read source words
+ UINT64 numSourceWords;
+ fileTP.read((char*) &numSourceWords, sizeof(UINT64));
+ bytesRead += sizeof(UINT64);
+
+ SourcePhrase *sp = new SourcePhrase();
+ for (size_t ind = 0; ind < numSourceWords; ++ind) {
+ Word *word = new Word();
+ bytesRead += word->ReadFromFile(fileTP, numSourceFactors);
+ sp->AddWord(word);
+ }
+ SetSourcePhrase(sp);
return bytesRead;
}
UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
{
- std::cerr << "read alignment.." << std::endl;
UINT64 bytesRead = 0;
UINT64 numAlign;
fileTPColl.read((char*) &numAlign, sizeof(UINT64));
bytesRead += sizeof(UINT64);
- std::cerr << "numAlign: " << numAlign << std::endl;
for (size_t ind = 0; ind < numAlign; ++ind) {
AlignPair alignPair;
fileTPColl.read((char*) &alignPair.first, sizeof(UINT64));
@@ -302,7 +344,6 @@ UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
bytesRead += sizeof(UINT64) * 2;
}
- std::cerr << "Align bytes read: " << bytesRead << std::endl;
return bytesRead;
}
diff --git a/OnDiskPt/TargetPhrase.h b/OnDiskPt/TargetPhrase.h
index 1ff6f46a2..85b8ceeef 100755
--- a/OnDiskPt/TargetPhrase.h
+++ b/OnDiskPt/TargetPhrase.h
@@ -24,6 +24,7 @@
#include <vector>
#include "Word.h"
#include "Phrase.h"
+#include "SourcePhrase.h"
namespace Moses
{
@@ -45,6 +46,7 @@ class TargetPhrase: public Phrase
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
protected:
AlignType m_align;
+ Phrase* m_sourcePhrase;
std::vector<float> m_scores;
UINT64 m_filePos;
@@ -60,6 +62,14 @@ public:
TargetPhrase(const TargetPhrase &copy);
virtual ~TargetPhrase();
+ void SetSourcePhrase(Phrase *p) {
+ Phrase *copy = new Phrase(*p);
+ m_sourcePhrase = copy;
+ }
+ const Phrase* GetSourcePhrase() const {
+ return m_sourcePhrase;
+ }
+
void SetLHS(Word *lhs);
void Create1AlignFromString(const std::string &align1Str);
@@ -90,7 +100,7 @@ public:
, const Moses::WordPenaltyProducer* wpProducer
, const Moses::LMList &lmList) const;
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
- UINT64 ReadFromFile(std::fstream &fileTP, size_t numFactors);
+ UINT64 ReadFromFile(std::fstream &fileTP, size_t numFactors, size_t numSourceFactors);
};
diff --git a/OnDiskPt/TargetPhraseCollection.cpp b/OnDiskPt/TargetPhraseCollection.cpp
index 295726ce1..c9fe92745 100755
--- a/OnDiskPt/TargetPhraseCollection.cpp
+++ b/OnDiskPt/TargetPhraseCollection.cpp
@@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
CollType::iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
// save phrase
- TargetPhrase &targetPhrase = **iter;
+ TargetPhrase &targetPhrase = **iter;
targetPhrase.Save(onDiskWrapper);
// save coll
@@ -154,10 +154,11 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
{
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
-
+
size_t numScores = onDiskWrapper.GetNumScores();
size_t numTargetFactors = onDiskWrapper.GetNumTargetFactors();
-
+ size_t numSourceFactors = onDiskWrapper.GetNumSourceFactors();
+
UINT64 numPhrases;
UINT64 currFilePos = filePos;
@@ -168,19 +169,15 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
numPhrases = std::min(numPhrases, (UINT64) tableLimit);
currFilePos += sizeof(UINT64);
-
+
for (size_t ind = 0; ind < numPhrases; ++ind) {
- TargetPhrase *tp = new TargetPhrase(numScores);
-
+ TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
- std::cerr << "other info done." << std::endl;
- tp->ReadFromFile(fileTP, numTargetFactors);
- std::cerr << "done reading from file." << std::endl;
+ tp->ReadFromFile(fileTP, numTargetFactors, numSourceFactors);
currFilePos += sizeOtherInfo;
m_coll.push_back(tp);
- std::cerr << "tp done." << std::endl;
}
}
diff --git a/OnDiskPt/Vocab.cpp b/OnDiskPt/Vocab.cpp
index 86072edc6..1e9e4186a 100755
--- a/OnDiskPt/Vocab.cpp
+++ b/OnDiskPt/Vocab.cpp
@@ -44,7 +44,8 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
// create lookup
// assume contiguous vocab id
m_lookup.resize(m_vocabColl.size() + 1);
-
+ m_nextId = m_lookup.size();
+
CollType::const_iterator iter;
for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
UINT32 vocabId = iter->second;