diff options
author | Eva <eva@deimos.(none)> | 2012-05-18 22:09:04 +0400 |
---|---|---|
committer | Eva <eva@deimos.(none)> | 2012-05-18 22:09:04 +0400 |
commit | 4e9babf040c35831a643981bb15e76f1579afeb5 (patch) | |
tree | cde9ba14fc83c67599d56e9892e332a1a4973e5d /OnDiskPt | |
parent | 9e69a12b0295d4b83035e23c5151cc62c59016fd (diff) |
enable binarization of rule tables with terminal alignments and new rule count
Diffstat (limited to 'OnDiskPt')
-rwxr-xr-x | OnDiskPt/Main.cpp | 23 | ||||
-rwxr-xr-x | OnDiskPt/Main.h | 4 | ||||
-rwxr-xr-x | OnDiskPt/PhraseNode.cpp | 16 | ||||
-rwxr-xr-x | OnDiskPt/PhraseNode.h | 5 | ||||
-rwxr-xr-x | OnDiskPt/TargetPhrase.cpp | 61 | ||||
-rwxr-xr-x | OnDiskPt/TargetPhrase.h | 12 | ||||
-rwxr-xr-x | OnDiskPt/TargetPhraseCollection.cpp | 17 | ||||
-rwxr-xr-x | OnDiskPt/Vocab.cpp | 3 |
8 files changed, 101 insertions, 40 deletions
diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp index 5b3ac6cb8..a078b96ff 100755 --- a/OnDiskPt/Main.cpp +++ b/OnDiskPt/Main.cpp @@ -76,10 +76,10 @@ int main (int argc, char * const argv[]) std::vector<float> misc(1); SourcePhrase sourcePhrase; TargetPhrase *targetPhrase = new TargetPhrase(numScores); - Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); + OnDiskPt::Phrase *spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); assert(misc.size() == onDiskWrapper.GetNumCounts()); - rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc); + rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort); } rootNode.Save(onDiskWrapper, 0, tableLimit); @@ -104,7 +104,7 @@ bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::Sourc return ret; } -void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) +OnDiskPt::Phrase *Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) { size_t scoreInd = 0; @@ -117,13 +117,17 @@ void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line 4 = count */ char *tok = strtok (line," "); + OnDiskPt::Phrase *out = new Phrase(); while (tok != NULL) { if (0 == strcmp(tok, "|||")) { ++stage; } else { switch (stage) { case 0: { - Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); + Word *w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); + if (w != NULL) + out->AddWord(w); + break; } case 1: { @@ -176,10 +180,10 @@ void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line assert(scoreInd == numScores); targetPhrase.SortAlign(); - + return out; } // Tokenize() -void Tokenize(OnDiskPt::Phrase &phrase +OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase , const std::string 
&token, bool addSourceNonTerm, bool addTargetNonTerm , OnDiskPt::OnDiskWrapper &onDiskWrapper) { @@ -193,6 +197,7 @@ void Tokenize(OnDiskPt::Phrase &phrase nonTerm = comStr == 0; } + OnDiskPt::Word *out = NULL; if (nonTerm) { // non-term size_t splitPos = token.find_first_of("[", 2); @@ -208,7 +213,7 @@ void Tokenize(OnDiskPt::Phrase &phrase if (addSourceNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); - phrase.AddWord(word); + phrase.AddWord(word); } wordStr = token.substr(splitPos, tokSize - splitPos); @@ -216,6 +221,7 @@ void Tokenize(OnDiskPt::Phrase &phrase Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); + out = word; } } @@ -224,7 +230,10 @@ void Tokenize(OnDiskPt::Phrase &phrase Word *word = new Word(); word->CreateFromString(token, onDiskWrapper.GetVocab()); phrase.AddWord(word); + out = word; } + + return out; } void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments) diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h index 41a24a239..2db4eb864 100755 --- a/OnDiskPt/Main.h +++ b/OnDiskPt/Main.h @@ -25,10 +25,10 @@ typedef std::pair<size_t, size_t> AlignPair; typedef std::vector<AlignPair> AlignType; -void Tokenize(OnDiskPt::Phrase &phrase +OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , OnDiskPt::OnDiskWrapper &onDiskWrapper); -void Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase +OnDiskPt::Phrase *Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper , int numScores , std::vector<float> &misc); diff --git a/OnDiskPt/PhraseNode.cpp b/OnDiskPt/PhraseNode.cpp index 98a55dbc1..0b5491963 100755 --- a/OnDiskPt/PhraseNode.cpp +++ b/OnDiskPt/PhraseNode.cpp @@ -56,9 +56,9 @@ 
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper) std::fstream &file = onDiskWrapper.GetFileSource(); file.seekg(filePos); CHECK(filePos == file.tellg()); - + file.read((char*) &m_numChildrenLoad, sizeof(UINT64)); - + size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize); m_memLoad = (char*) malloc(memAlloc); @@ -160,15 +160,15 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase , OnDiskWrapper &onDiskWrapper, size_t tableLimit - , const std::vector<float> &counts) + , const std::vector<float> &counts, OnDiskPt::Phrase *spShort) { - AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts); + AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort); } void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper - , size_t tableLimit, const std::vector<float> &counts) -{ + , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::Phrase *spShort) +{ size_t phraseSize = sourcePhrase.GetSize(); if (pos < phraseSize) { const Word &word = sourcePhrase.GetWord(pos); @@ -185,10 +185,12 @@ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase m_currChild = &node; } - node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts); + // keep searching for target phrase node.. 
+ node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort); } else { // drilled down to the right node m_counts = counts; + targetPhrase->SetSourcePhrase(spShort); m_targetPhraseColl.AddTargetPhrase(targetPhrase); } } diff --git a/OnDiskPt/PhraseNode.h b/OnDiskPt/PhraseNode.h index 279ca278a..e4704d142 100755 --- a/OnDiskPt/PhraseNode.h +++ b/OnDiskPt/PhraseNode.h @@ -23,6 +23,7 @@ #include <map> #include "Word.h" #include "TargetPhraseCollection.h" +#include "Phrase.h" namespace OnDiskPt { @@ -50,7 +51,7 @@ protected: void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper - , size_t tableLimit, const std::vector<float> &counts); + , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::Phrase *spShort); size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem, size_t numFactors) const; void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const; @@ -66,7 +67,7 @@ public: void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase , OnDiskWrapper &onDiskWrapper, size_t tableLimit - , const std::vector<float> &counts); + , const std::vector<float> &counts, OnDiskPt::Phrase *spShort); UINT64 GetFilePos() const { return m_filePos; diff --git a/OnDiskPt/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp index aedb2a37e..182181bbf 100755 --- a/OnDiskPt/TargetPhrase.cpp +++ b/OnDiskPt/TargetPhrase.cpp @@ -98,9 +98,16 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) { size_t phraseSize = GetSize(); size_t targetWordSize = onDiskWrapper.GetTargetWordSize(); - + + const Phrase* sp = GetSourcePhrase(); + size_t spSize = sp->GetSize(); + size_t sourceWordSize = onDiskWrapper.GetSourceWordSize(); + size_t memNeeded = sizeof(UINT64) // num of words - + targetWordSize * phraseSize; // actual words. 
lhs as last words + + targetWordSize * phraseSize // actual words. lhs as last words + + sizeof(UINT64) // num source words + + sourceWordSize * spSize; // actual source words + memUsed = 0; UINT64 *mem = (UINT64*) malloc(memNeeded); @@ -115,6 +122,17 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) memUsed += word.WriteToMemory((char*) currPtr); } + // write size of source phrase and all source words + char *currPtr = (char*)mem + memUsed; + UINT64 *memTmp = (UINT64*) currPtr; + memTmp[0] = spSize; + memUsed += sizeof(UINT64); + for (size_t pos = 0; pos < spSize; ++pos) { + const Word &word = sp->GetWord(pos); + char *currPtr = (char*)mem + memUsed; + memUsed += word.WriteToMemory((char*) currPtr); + } + CHECK(memUsed == memNeeded); return (char *) mem; } @@ -191,7 +209,6 @@ size_t TargetPhrase::WriteAlignToMemory(char *mem) const memUsed += sizeof(alignPair.second); } - std::cerr << "align memory used: " << memUsed << std::endl; return memUsed; } @@ -207,7 +224,7 @@ size_t TargetPhrase::WriteScoresToMemory(char *mem) const } -Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & /*inputFactors */ +Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors , const std::vector<Moses::FactorType> &outputFactors , const Vocab &vocab , const Moses::PhraseDictionary &phraseDict @@ -232,17 +249,31 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto ret->SetScoreChart(phraseDict.GetFeature(), m_scores, weightT, lmList, wpProducer); // alignments + int indicator[m_align.size()]; + int index = 0; std::set<std::pair<size_t, size_t> > alignmentInfo; + const Phrase* sp = GetSourcePhrase(); for (size_t ind = 0; ind < m_align.size(); ++ind) { const std::pair<size_t, size_t> &entry = m_align[ind]; alignmentInfo.insert(entry); + size_t sourcePos = entry.first; + indicator[index++] = sp->GetWord(sourcePos).IsNonTerminal() ? 
1: 0; } - ret->SetAlignmentInfo(alignmentInfo); + ret->SetAlignmentInfo(alignmentInfo, indicator); Moses::Word *lhs = GetWord(GetSize() - 1).ConvertToMoses(Moses::Output, outputFactors, vocab); ret->SetTargetLHS(*lhs); delete lhs; - + + // set source phrase + Moses::Phrase *mosesSP = new Moses::Phrase(Moses::Input); + for (size_t pos = 0; pos < sp->GetSize(); ++pos) { + Moses::Word *mosesWord = sp->GetWord(pos).ConvertToMoses(Moses::Input, inputFactors, vocab); + mosesSP->AddWord(*mosesWord); + delete mosesWord; + } + ret->SetSourcePhrase(*mosesSP); + return ret; } @@ -264,7 +295,7 @@ UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPC return memUsed; } -UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors) +UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors, size_t numSourceFactors) { UINT64 bytesRead = 0; @@ -279,20 +310,31 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors) bytesRead += word->ReadFromFile(fileTP, numFactors); AddWord(word); } + + // read source words + UINT64 numSourceWords; + fileTP.read((char*) &numSourceWords, sizeof(UINT64)); + bytesRead += sizeof(UINT64); + + SourcePhrase *sp = new SourcePhrase(); + for (size_t ind = 0; ind < numSourceWords; ++ind) { + Word *word = new Word(); + bytesRead += word->ReadFromFile(fileTP, numSourceFactors); + sp->AddWord(word); + } + SetSourcePhrase(sp); return bytesRead; } UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl) { - std::cerr << "read alignment.." 
<< std::endl; UINT64 bytesRead = 0; UINT64 numAlign; fileTPColl.read((char*) &numAlign, sizeof(UINT64)); bytesRead += sizeof(UINT64); - std::cerr << "numAlign: " << numAlign << std::endl; for (size_t ind = 0; ind < numAlign; ++ind) { AlignPair alignPair; fileTPColl.read((char*) &alignPair.first, sizeof(UINT64)); @@ -302,7 +344,6 @@ UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl) bytesRead += sizeof(UINT64) * 2; } - std::cerr << "Align bytes read: " << bytesRead << std::endl; return bytesRead; } diff --git a/OnDiskPt/TargetPhrase.h b/OnDiskPt/TargetPhrase.h index 1ff6f46a2..85b8ceeef 100755 --- a/OnDiskPt/TargetPhrase.h +++ b/OnDiskPt/TargetPhrase.h @@ -24,6 +24,7 @@ #include <vector> #include "Word.h" #include "Phrase.h" +#include "SourcePhrase.h" namespace Moses { @@ -45,6 +46,7 @@ class TargetPhrase: public Phrase friend std::ostream& operator<<(std::ostream&, const TargetPhrase&); protected: AlignType m_align; + Phrase* m_sourcePhrase; std::vector<float> m_scores; UINT64 m_filePos; @@ -60,6 +62,14 @@ public: TargetPhrase(const TargetPhrase &copy); virtual ~TargetPhrase(); + void SetSourcePhrase(Phrase *p) { + Phrase *copy = new Phrase(*p); + m_sourcePhrase = copy; + } + const Phrase* GetSourcePhrase() const { + return m_sourcePhrase; + } + void SetLHS(Word *lhs); void Create1AlignFromString(const std::string &align1Str); @@ -90,7 +100,7 @@ public: , const Moses::WordPenaltyProducer* wpProducer , const Moses::LMList &lmList) const; UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl); - UINT64 ReadFromFile(std::fstream &fileTP, size_t numFactors); + UINT64 ReadFromFile(std::fstream &fileTP, size_t numFactors, size_t numSourceFactors); }; diff --git a/OnDiskPt/TargetPhraseCollection.cpp b/OnDiskPt/TargetPhraseCollection.cpp index 295726ce1..c9fe92745 100755 --- a/OnDiskPt/TargetPhraseCollection.cpp +++ b/OnDiskPt/TargetPhraseCollection.cpp @@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper) 
CollType::iterator iter; for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) { // save phrase - TargetPhrase &targetPhrase = **iter; + TargetPhrase &targetPhrase = **iter; targetPhrase.Save(onDiskWrapper); // save coll @@ -154,10 +154,11 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD { fstream &fileTPColl = onDiskWrapper.GetFileTargetColl(); fstream &fileTP = onDiskWrapper.GetFileTargetInd(); - + size_t numScores = onDiskWrapper.GetNumScores(); size_t numTargetFactors = onDiskWrapper.GetNumTargetFactors(); - + size_t numSourceFactors = onDiskWrapper.GetNumSourceFactors(); + UINT64 numPhrases; UINT64 currFilePos = filePos; @@ -168,19 +169,15 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD numPhrases = std::min(numPhrases, (UINT64) tableLimit); currFilePos += sizeof(UINT64); - + for (size_t ind = 0; ind < numPhrases; ++ind) { - TargetPhrase *tp = new TargetPhrase(numScores); - + TargetPhrase *tp = new TargetPhrase(numScores); UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl); - std::cerr << "other info done." << std::endl; - tp->ReadFromFile(fileTP, numTargetFactors); - std::cerr << "done reading from file." << std::endl; + tp->ReadFromFile(fileTP, numTargetFactors, numSourceFactors); currFilePos += sizeOtherInfo; m_coll.push_back(tp); - std::cerr << "tp done." << std::endl; } } diff --git a/OnDiskPt/Vocab.cpp b/OnDiskPt/Vocab.cpp index 86072edc6..1e9e4186a 100755 --- a/OnDiskPt/Vocab.cpp +++ b/OnDiskPt/Vocab.cpp @@ -44,7 +44,8 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper) // create lookup // assume contiguous vocab id m_lookup.resize(m_vocabColl.size() + 1); - + m_nextId = m_lookup.size(); + CollType::const_iterator iter; for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) { UINT32 vocabId = iter->second; |