diff options
author | Kenneth Heafield <github@kheafield.com> | 2011-12-01 22:26:05 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2011-12-01 22:26:05 +0400 |
commit | a0d79781e555067a31823b593ff3f36d6ed5e54e (patch) | |
tree | 99a7ca15a77378c080300fcccef7e364d05998e9 /OnDiskPt | |
parent | ececab6e2723e0122c49401ee9afca85e5fc39fc (diff) |
Merge CreateOnDisk into OnDiskPt, pop out of src
Diffstat (limited to 'OnDiskPt')
-rw-r--r-- | OnDiskPt/Jamfile (renamed from OnDiskPt/src/Jamfile) | 3 | ||||
-rw-r--r-- | OnDiskPt/Main.cpp | 240 | ||||
-rw-r--r-- | OnDiskPt/Main.h | 39 | ||||
-rw-r--r-- | OnDiskPt/OnDiskWrapper.cpp (renamed from OnDiskPt/src/OnDiskWrapper.cpp) | 0 | ||||
-rw-r--r-- | OnDiskPt/OnDiskWrapper.h (renamed from OnDiskPt/src/OnDiskWrapper.h) | 2 | ||||
-rw-r--r-- | OnDiskPt/Phrase.cpp (renamed from OnDiskPt/src/Phrase.cpp) | 2 | ||||
-rw-r--r-- | OnDiskPt/Phrase.h (renamed from OnDiskPt/src/Phrase.h) | 0 | ||||
-rw-r--r-- | OnDiskPt/PhraseNode.cpp (renamed from OnDiskPt/src/PhraseNode.cpp) | 2 | ||||
-rw-r--r-- | OnDiskPt/PhraseNode.h (renamed from OnDiskPt/src/PhraseNode.h) | 0 | ||||
-rw-r--r-- | OnDiskPt/SourcePhrase.cpp (renamed from OnDiskPt/src/SourcePhrase.cpp) | 0 | ||||
-rw-r--r-- | OnDiskPt/SourcePhrase.h (renamed from OnDiskPt/src/SourcePhrase.h) | 0 | ||||
-rw-r--r-- | OnDiskPt/TargetPhrase.cpp (renamed from OnDiskPt/src/TargetPhrase.cpp) | 8 | ||||
-rw-r--r-- | OnDiskPt/TargetPhrase.h (renamed from OnDiskPt/src/TargetPhrase.h) | 0 | ||||
-rw-r--r-- | OnDiskPt/TargetPhraseCollection.cpp (renamed from OnDiskPt/src/TargetPhraseCollection.cpp) | 6 | ||||
-rw-r--r-- | OnDiskPt/TargetPhraseCollection.h (renamed from OnDiskPt/src/TargetPhraseCollection.h) | 0 | ||||
-rw-r--r-- | OnDiskPt/Vocab.cpp (renamed from OnDiskPt/src/Vocab.cpp) | 2 | ||||
-rw-r--r-- | OnDiskPt/Vocab.h (renamed from OnDiskPt/src/Vocab.h) | 2 | ||||
-rw-r--r-- | OnDiskPt/Word.cpp (renamed from OnDiskPt/src/Word.cpp) | 4 | ||||
-rw-r--r-- | OnDiskPt/Word.h (renamed from OnDiskPt/src/Word.h) | 0 |
19 files changed, 295 insertions, 15 deletions
diff --git a/OnDiskPt/src/Jamfile b/OnDiskPt/Jamfile index 80ca813d2..f9811c05b 100644 --- a/OnDiskPt/src/Jamfile +++ b/OnDiskPt/Jamfile @@ -1 +1,2 @@ -lib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp ../../moses/src//headers ; +lib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp ../moses/src//headers ; +exe CreateOnDisk : Main.cpp ../moses/src//moses OnDiskPt ; diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp new file mode 100644 index 000000000..d8899def5 --- /dev/null +++ b/OnDiskPt/Main.cpp @@ -0,0 +1,240 @@ +// $Id$ +/*********************************************************************** + Moses - factored phrase-based, hierarchical and syntactic language decoder + Copyright (C) 2009 Hieu Hoang + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include <algorithm> +#include <iostream> +#include <string> +#include <vector> +#include <iterator> +#include "../moses/src/InputFileStream.h" +#include "../moses/src/Util.h" +#include "../moses/src/UserMessage.h" +#include "../OnDiskPt/OnDiskWrapper.h" +#include "../OnDiskPt/SourcePhrase.h" +#include "../OnDiskPt/TargetPhrase.h" +#include "../OnDiskPt/TargetPhraseCollection.h" +#include "../OnDiskPt/Word.h" +#include "../OnDiskPt/Vocab.h" +#include "Main.h" + +using namespace std; +using namespace OnDiskPt; + +int main (int argc, char * const argv[]) +{ + // insert code here... + Moses::ResetUserTime(); + Moses::PrintUserTime("Starting"); + + assert(argc == 8); + + int numSourceFactors = Moses::Scan<int>(argv[1]) + , numTargetFactors = Moses::Scan<int>(argv[2]) + , numScores = Moses::Scan<int>(argv[3]) + , tableLimit = Moses::Scan<int>(argv[4]); + TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]); + assert(TargetPhraseCollection::s_sortScoreInd < numScores); + + const string filePath = argv[6] + ,destPath = argv[7]; + + + Moses::InputFileStream inStream(filePath); + + OnDiskWrapper onDiskWrapper; + bool retDb = onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores); + assert(retDb); + + PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode(); + size_t lineNum = 0; + char line[100000]; + + //while(getline(inStream, line)) + while(inStream.getline(line, 100000)) { + lineNum++; + if (lineNum%1000 == 0) cerr << "." << flush; + if (lineNum%10000 == 0) cerr << ":" << flush; + if (lineNum%100000 == 0) cerr << lineNum << flush; + //cerr << lineNum << " " << line << endl; + + std::vector<float> misc(1); + SourcePhrase sourcePhrase; + TargetPhrase *targetPhrase = new TargetPhrase(numScores); + Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); + assert(misc.size() == onDiskWrapper.GetNumCounts()); + + rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc); + } + + rootNode.Save(onDiskWrapper, 0, tableLimit); + onDiskWrapper.EndSave(); + + Moses::PrintUserTime("Finished"); + + //pause(); + return 0; + +} // main() + +bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase) +{ + if (prevSourcePhrase == NULL) + return false; + + assert(currSourcePhrase); + bool ret = (*currSourcePhrase > *prevSourcePhrase); + //cerr << *prevSourcePhrase << endl << *currSourcePhrase << " " << ret << endl << endl; + + return ret; +} + +void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) +{ + size_t scoreInd = 0; + + // MAIN LOOP + size_t stage = 0; + /* 0 = source phrase + 1 = target phrase + 2 = scores + 3 = align + 4 = count + */ + char *tok = strtok (line," "); + while (tok != NULL) { + if (0 == strcmp(tok, "|||")) { + ++stage; + } else { + switch (stage) { + case 0: { + Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); + break; + } + case 1: { + Tokenize(targetPhrase, tok, false, true, onDiskWrapper); + break; + } + case 2: { + float score = Moses::Scan<float>(tok); + targetPhrase.SetScore(score, scoreInd); + ++scoreInd; + break; + } + case 3: { + targetPhrase.Create1AlignFromString(tok); + break; + } + case 4: + ++stage; + break; + case 5: { + // count info. Only store the 2nd one + float val = Moses::Scan<float>(tok); + misc[0] = val; + ++stage; + break; + } + default: + assert(false); + break; + } + } + + tok = strtok (NULL, " "); + } // while (tok != NULL) + + assert(scoreInd == numScores); + targetPhrase.SortAlign(); + +} // Tokenize() + +void Tokenize(OnDiskPt::Phrase &phrase + , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm + , OnDiskPt::OnDiskWrapper &onDiskWrapper) +{ + + bool nonTerm = false; + size_t tokSize = token.size(); + int comStr =token.compare(0, 1, "["); + + if (comStr == 0) { + comStr = token.compare(tokSize - 1, 1, "]"); + nonTerm = comStr == 0; + } + + if (nonTerm) { + // non-term + size_t splitPos = token.find_first_of("[", 2); + string wordStr = token.substr(0, splitPos); + + if (splitPos == string::npos) { + // lhs - only 1 word + Word *word = new Word(); + word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); + phrase.AddWord(word); + } else { + // source & target non-terms + if (addSourceNonTerm) { + Word *word = new Word(); + word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); + phrase.AddWord(word); + } + + wordStr = token.substr(splitPos, tokSize - splitPos); + if (addTargetNonTerm) { + Word *word = new Word(); + word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); + phrase.AddWord(word); + } + + } + } else { + // term + Word *word = new Word(); + word->CreateFromString(token, onDiskWrapper.GetVocab()); + phrase.AddWord(word); + } +} + +void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments) +{ + for (int ind = alignments.size() - 1; ind >= 0; --ind) { + const ::AlignPair &alignPair = alignments[ind]; + size_t sourcePos = alignPair.first + ,targetPos = alignPair.second; + + const string &target = targetToks[targetPos]; + sourceToks.insert(sourceToks.begin() + sourcePos + 1, target); + + } +} + +class AlignOrderer +{ +public: + bool operator()(const ::AlignPair &a, const ::AlignPair &b) const { + return a.first < b.first; + } +}; + +void SortAlign(::AlignType &alignments) +{ + std::sort(alignments.begin(), alignments.end(), AlignOrderer()); +} diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h new file mode 100644 index 000000000..41a24a239 --- /dev/null +++ b/OnDiskPt/Main.h @@ -0,0 +1,39 @@ +#pragma once +// $Id$ +/*********************************************************************** + Moses - factored phrase-based, hierarchical and syntactic language decoder + Copyright (C) 2009 Hieu Hoang + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ +#include <string> +#include "../OnDiskPt/SourcePhrase.h" +#include "../OnDiskPt/TargetPhrase.h" + +typedef std::pair<size_t, size_t> AlignPair; +typedef std::vector<AlignPair> AlignType; + +void Tokenize(OnDiskPt::Phrase &phrase + , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm + , OnDiskPt::OnDiskWrapper &onDiskWrapper); +void Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase + , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper + , int numScores + , std::vector<float> &misc); + +void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments); +void SortAlign(AlignType &alignments); +bool Flush(const OnDiskPt::SourcePhrase *prevSource, const OnDiskPt::SourcePhrase *currSource); + diff --git a/OnDiskPt/src/OnDiskWrapper.cpp b/OnDiskPt/OnDiskWrapper.cpp index 79b0563a8..79b0563a8 100644 --- a/OnDiskPt/src/OnDiskWrapper.cpp +++ b/OnDiskPt/OnDiskWrapper.cpp diff --git a/OnDiskPt/src/OnDiskWrapper.h b/OnDiskPt/OnDiskWrapper.h index 2d3d6ed64..c49afdda1 100644 --- a/OnDiskPt/src/OnDiskWrapper.h +++ b/OnDiskPt/OnDiskWrapper.h @@ -22,7 +22,7 @@ #include <fstream> #include "Vocab.h" #include "PhraseNode.h" -#include "../../moses/src/Word.h" +#include "../moses/src/Word.h" namespace OnDiskPt { diff --git a/OnDiskPt/src/Phrase.cpp b/OnDiskPt/Phrase.cpp index ce6dc208c..b6ccd0721 100644 --- a/OnDiskPt/src/Phrase.cpp +++ b/OnDiskPt/Phrase.cpp @@ -19,7 +19,7 @@ ***********************************************************************/ #include <iostream> #include "util/check.hh" -#include "../../moses/src/Util.h" +#include "../moses/src/Util.h" #include "Phrase.h" using namespace std; diff --git a/OnDiskPt/src/Phrase.h b/OnDiskPt/Phrase.h index 093510e64..093510e64 100644 --- a/OnDiskPt/src/Phrase.h +++ b/OnDiskPt/Phrase.h diff --git a/OnDiskPt/src/PhraseNode.cpp b/OnDiskPt/PhraseNode.cpp index a479294e5..98a55dbc1 100644 --- a/OnDiskPt/src/PhraseNode.cpp +++ b/OnDiskPt/PhraseNode.cpp @@ -22,7 +22,7 @@ #include "OnDiskWrapper.h" #include "TargetPhraseCollection.h" #include "SourcePhrase.h" -#include "../../moses/src/Util.h" +#include "../moses/src/Util.h" using namespace std; diff --git a/OnDiskPt/src/PhraseNode.h b/OnDiskPt/PhraseNode.h index 279ca278a..279ca278a 100644 --- a/OnDiskPt/src/PhraseNode.h +++ b/OnDiskPt/PhraseNode.h diff --git a/OnDiskPt/src/SourcePhrase.cpp b/OnDiskPt/SourcePhrase.cpp index 595748c70..595748c70 100644 --- a/OnDiskPt/src/SourcePhrase.cpp +++ b/OnDiskPt/SourcePhrase.cpp diff --git a/OnDiskPt/src/SourcePhrase.h b/OnDiskPt/SourcePhrase.h index b4ae46705..b4ae46705 100644 --- a/OnDiskPt/src/SourcePhrase.h +++ b/OnDiskPt/SourcePhrase.h diff --git a/OnDiskPt/src/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp index 7480368e0..b740811d8 100644 --- a/OnDiskPt/src/TargetPhrase.cpp +++ b/OnDiskPt/TargetPhrase.cpp @@ -20,10 +20,10 @@ #include <algorithm> #include <iostream> -#include "../../moses/src/Util.h" -#include "../../moses/src/TargetPhrase.h" -#include "../../moses/src/PhraseDictionary.h" -#include "../../moses/src/DummyScoreProducers.h" +#include "../moses/src/Util.h" +#include "../moses/src/TargetPhrase.h" +#include "../moses/src/PhraseDictionary.h" +#include "../moses/src/DummyScoreProducers.h" #include "TargetPhrase.h" #include "OnDiskWrapper.h" diff --git a/OnDiskPt/src/TargetPhrase.h b/OnDiskPt/TargetPhrase.h index 56c7b6d3f..56c7b6d3f 100644 --- a/OnDiskPt/src/TargetPhrase.h +++ b/OnDiskPt/TargetPhrase.h diff --git a/OnDiskPt/src/TargetPhraseCollection.cpp b/OnDiskPt/TargetPhraseCollection.cpp index bb9c74364..b57ce4ee3 100644 --- a/OnDiskPt/src/TargetPhraseCollection.cpp +++ b/OnDiskPt/TargetPhraseCollection.cpp @@ -20,9 +20,9 @@ #include <algorithm> #include <iostream> -#include "../../moses/src/Util.h" -#include "../../moses/src/TargetPhraseCollection.h" -#include "../../moses/src/PhraseDictionary.h" +#include "../moses/src/Util.h" +#include "../moses/src/TargetPhraseCollection.h" +#include "../moses/src/PhraseDictionary.h" #include "TargetPhraseCollection.h" #include "Vocab.h" #include "OnDiskWrapper.h" diff --git a/OnDiskPt/src/TargetPhraseCollection.h b/OnDiskPt/TargetPhraseCollection.h index 6d95fb356..6d95fb356 100644 --- a/OnDiskPt/src/TargetPhraseCollection.h +++ b/OnDiskPt/TargetPhraseCollection.h diff --git a/OnDiskPt/src/Vocab.cpp b/OnDiskPt/Vocab.cpp index 9c0470b32..86072edc6 100644 --- a/OnDiskPt/src/Vocab.cpp +++ b/OnDiskPt/Vocab.cpp @@ -21,7 +21,7 @@ #include <fstream> #include "OnDiskWrapper.h" #include "Vocab.h" -#include "../../moses/src/FactorCollection.h" +#include "../moses/src/FactorCollection.h" using namespace std; diff --git a/OnDiskPt/src/Vocab.h b/OnDiskPt/Vocab.h index 9bb361251..360aedf4a 100644 --- a/OnDiskPt/src/Vocab.h +++ b/OnDiskPt/Vocab.h @@ -20,7 +20,7 @@ ***********************************************************************/ #include <string> #include <map> -#include "../../moses/src/TypeDef.h" +#include "../moses/src/TypeDef.h" namespace Moses { diff --git a/OnDiskPt/src/Word.cpp b/OnDiskPt/Word.cpp index 972f74584..a8d4c683a 100644 --- a/OnDiskPt/src/Word.cpp +++ b/OnDiskPt/Word.cpp @@ -18,8 +18,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -#include "../../moses/src/Util.h" -#include "../../moses/src/Word.h" +#include "../moses/src/Util.h" +#include "../moses/src/Word.h" #include "Word.h" using namespace std; diff --git a/OnDiskPt/src/Word.h b/OnDiskPt/Word.h index 0c0b55a09..0c0b55a09 100644 --- a/OnDiskPt/src/Word.h +++ b/OnDiskPt/Word.h |