Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kenneth Heafield <github@kheafield.com> 2011-12-01 22:26:05 +0400
committer: Kenneth Heafield <github@kheafield.com> 2011-12-01 22:26:05 +0400
commit: a0d79781e555067a31823b593ff3f36d6ed5e54e (patch)
tree: 99a7ca15a77378c080300fcccef7e364d05998e9 /OnDiskPt
parent: ececab6e2723e0122c49401ee9afca85e5fc39fc (diff)
Merge CreateOnDisk into OnDiskPt, pop out of src
Diffstat (limited to 'OnDiskPt')
-rw-r--r-- OnDiskPt/Jamfile (renamed from OnDiskPt/src/Jamfile) 3
-rw-r--r-- OnDiskPt/Main.cpp 240
-rw-r--r-- OnDiskPt/Main.h 39
-rw-r--r-- OnDiskPt/OnDiskWrapper.cpp (renamed from OnDiskPt/src/OnDiskWrapper.cpp) 0
-rw-r--r-- OnDiskPt/OnDiskWrapper.h (renamed from OnDiskPt/src/OnDiskWrapper.h) 2
-rw-r--r-- OnDiskPt/Phrase.cpp (renamed from OnDiskPt/src/Phrase.cpp) 2
-rw-r--r-- OnDiskPt/Phrase.h (renamed from OnDiskPt/src/Phrase.h) 0
-rw-r--r-- OnDiskPt/PhraseNode.cpp (renamed from OnDiskPt/src/PhraseNode.cpp) 2
-rw-r--r-- OnDiskPt/PhraseNode.h (renamed from OnDiskPt/src/PhraseNode.h) 0
-rw-r--r-- OnDiskPt/SourcePhrase.cpp (renamed from OnDiskPt/src/SourcePhrase.cpp) 0
-rw-r--r-- OnDiskPt/SourcePhrase.h (renamed from OnDiskPt/src/SourcePhrase.h) 0
-rw-r--r-- OnDiskPt/TargetPhrase.cpp (renamed from OnDiskPt/src/TargetPhrase.cpp) 8
-rw-r--r-- OnDiskPt/TargetPhrase.h (renamed from OnDiskPt/src/TargetPhrase.h) 0
-rw-r--r-- OnDiskPt/TargetPhraseCollection.cpp (renamed from OnDiskPt/src/TargetPhraseCollection.cpp) 6
-rw-r--r-- OnDiskPt/TargetPhraseCollection.h (renamed from OnDiskPt/src/TargetPhraseCollection.h) 0
-rw-r--r-- OnDiskPt/Vocab.cpp (renamed from OnDiskPt/src/Vocab.cpp) 2
-rw-r--r-- OnDiskPt/Vocab.h (renamed from OnDiskPt/src/Vocab.h) 2
-rw-r--r-- OnDiskPt/Word.cpp (renamed from OnDiskPt/src/Word.cpp) 4
-rw-r--r-- OnDiskPt/Word.h (renamed from OnDiskPt/src/Word.h) 0
19 files changed, 295 insertions, 15 deletions
diff --git a/OnDiskPt/src/Jamfile b/OnDiskPt/Jamfile
index 80ca813d2..f9811c05b 100644
--- a/OnDiskPt/src/Jamfile
+++ b/OnDiskPt/Jamfile
@@ -1 +1,2 @@
-lib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp ../../moses/src//headers ;
+lib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp ../moses/src//headers ;
+exe CreateOnDisk : Main.cpp ../moses/src//moses OnDiskPt ;
diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
new file mode 100644
index 000000000..d8899def5
--- /dev/null
+++ b/OnDiskPt/Main.cpp
@@ -0,0 +1,240 @@
+// $Id$
+/***********************************************************************
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
+ Copyright (C) 2009 Hieu Hoang
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <iterator>
+#include "../moses/src/InputFileStream.h"
+#include "../moses/src/Util.h"
+#include "../moses/src/UserMessage.h"
+#include "../OnDiskPt/OnDiskWrapper.h"
+#include "../OnDiskPt/SourcePhrase.h"
+#include "../OnDiskPt/TargetPhrase.h"
+#include "../OnDiskPt/TargetPhraseCollection.h"
+#include "../OnDiskPt/Word.h"
+#include "../OnDiskPt/Vocab.h"
+#include "Main.h"
+
+using namespace std;
+using namespace OnDiskPt;
+
+/**
+ * Entry point of CreateOnDisk: reads a text phrase table line by line and
+ * saves it through an OnDiskWrapper.
+ *
+ * Expects exactly seven arguments:
+ *   argv[1]  number of source factors
+ *   argv[2]  number of target factors
+ *   argv[3]  number of scores per line
+ *   argv[4]  table limit (handed to AddTargetPhrase() and Save())
+ *   argv[5]  index of the score used for sorting; must be < number of scores
+ *   argv[6]  path of the input phrase table
+ *   argv[7]  destination path handed to OnDiskWrapper::BeginSave()
+ *
+ * Returns 0 on success and 1 when the arguments are invalid or BeginSave()
+ * reports failure.
+ */
+int main (int argc, char * const argv[])
+{
+  Moses::ResetUserTime();
+  Moses::PrintUserTime("Starting");
+
+  // Checked explicitly instead of with assert(): asserts disappear under
+  // NDEBUG and argv would then be read out of bounds.
+  if (argc != 8) {
+    cerr << "Usage: " << (argc > 0 ? argv[0] : "CreateOnDisk")
+         << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputFile outputPath"
+         << endl;
+    return 1;
+  }
+
+  int numSourceFactors = Moses::Scan<int>(argv[1]);
+  int numTargetFactors = Moses::Scan<int>(argv[2]);
+  int numScores = Moses::Scan<int>(argv[3]);
+  int tableLimit = Moses::Scan<int>(argv[4]);
+
+  TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
+  if (!(TargetPhraseCollection::s_sortScoreInd < numScores)) {
+    cerr << "Sort score index " << argv[5]
+         << " must be smaller than the number of scores " << numScores << endl;
+    return 1;
+  }
+
+  const string filePath = argv[6];
+  const string destPath = argv[7];
+
+  Moses::InputFileStream inStream(filePath);
+
+  OnDiskWrapper onDiskWrapper;
+  if (!onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores)) {
+    cerr << "BeginSave failed for " << destPath << endl;
+    return 1;
+  }
+
+  PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
+  size_t lineNum = 0;
+
+  // std::getline() has no length limit. The previous fixed 100000-char buffer
+  // made istream::getline() set failbit on a longer line, which silently
+  // ended the loop and truncated the table.
+  string line;
+  vector<char> lineBuf; // writable, NUL-terminated copy; Tokenize() runs strtok() on it
+
+  while (std::getline(inStream, line)) {
+    ++lineNum;
+    if (lineNum%1000 == 0) cerr << "." << flush;
+    if (lineNum%10000 == 0) cerr << ":" << flush;
+    if (lineNum%100000 == 0) cerr << lineNum << flush;
+
+    lineBuf.assign(line.begin(), line.end());
+    lineBuf.push_back('\0');
+
+    std::vector<float> misc(1);
+    SourcePhrase sourcePhrase;
+    // NOTE(review): presumably AddTargetPhrase() takes ownership of this
+    // pointer; nothing in this file deletes it - confirm.
+    TargetPhrase *targetPhrase = new TargetPhrase(numScores);
+    Tokenize(sourcePhrase, *targetPhrase, &lineBuf[0], onDiskWrapper, numScores, misc);
+    assert(misc.size() == onDiskWrapper.GetNumCounts());
+
+    rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc);
+  }
+
+  rootNode.Save(onDiskWrapper, 0, tableLimit);
+  onDiskWrapper.EndSave();
+
+  Moses::PrintUserTime("Finished");
+
+  return 0;
+
+} // main()
+
+/**
+ * Compare the current source phrase with the previous one.
+ *
+ * Returns false when there is no previous phrase (prevSourcePhrase is NULL);
+ * otherwise returns whether *currSourcePhrase > *prevSourcePhrase.
+ * currSourcePhrase must not be NULL in that case (asserted).
+ */
+bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase)
+{
+  if (!prevSourcePhrase) {
+    return false;
+  }
+
+  assert(currSourcePhrase);
+  return *currSourcePhrase > *prevSourcePhrase;
+}
+
+/**
+ * Parse one phrase-table line.
+ *
+ * The line is split on spaces with strtok(), so the buffer behind `line` is
+ * modified. A "|||" token moves on to the next field:
+ *   field 0  tokens are added to sourcePhrase
+ *   field 1  tokens are added to targetPhrase
+ *   field 2  tokens are parsed as floats and stored as scores 0, 1, 2, ...
+ *   field 3  tokens are passed to targetPhrase.Create1AlignFromString()
+ *   field 4  the first token is skipped, the token after it is parsed as a
+ *            float and stored in misc[0]; any further token trips an assert
+ *
+ * After the last token the number of scores read must equal numScores
+ * (asserted) and targetPhrase.SortAlign() is called.
+ *
+ * NOTE(review): the score count is only checked after every SetScore() call
+ * has been made - confirm SetScore() validates its index.
+ */
+void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
+{
+  size_t field = 0;          // current field; also bumped by the count tokens below
+  size_t numScoresSeen = 0;  // index of the next score to store
+
+  for (char *piece = strtok(line, " "); piece != NULL; piece = strtok(NULL, " ")) {
+    if (strcmp(piece, "|||") == 0) {
+      ++field;
+      continue;
+    }
+
+    if (field == 0) {
+      Tokenize(sourcePhrase, piece, true, true, onDiskWrapper);
+    } else if (field == 1) {
+      Tokenize(targetPhrase, piece, false, true, onDiskWrapper);
+    } else if (field == 2) {
+      float score = Moses::Scan<float>(piece);
+      targetPhrase.SetScore(score, numScoresSeen);
+      ++numScoresSeen;
+    } else if (field == 3) {
+      targetPhrase.Create1AlignFromString(piece);
+    } else if (field == 4) {
+      // first count token: not stored, just step on to the second one
+      ++field;
+    } else if (field == 5) {
+      // second count token: the only one that is kept
+      float val = Moses::Scan<float>(piece);
+      misc[0] = val;
+      ++field;
+    } else {
+      assert(false);
+    }
+  }
+
+  assert(numScoresSeen == numScores);
+  targetPhrase.SortAlign();
+
+} // Tokenize()
+
+// Allocate a Word, fill it from `text` with the wrapper's vocabulary and
+// append it to `phrase`.
+static void AddWordFromString(OnDiskPt::Phrase &phrase, const std::string &text
+                              , OnDiskPt::OnDiskWrapper &onDiskWrapper)
+{
+  Word *word = new Word();
+  word->CreateFromString(text, onDiskWrapper.GetVocab());
+  phrase.AddWord(word);
+}
+
+/**
+ * Convert one token into Word(s) and append them to `phrase`.
+ *
+ * A token that starts with "[" and ends with "]" is a non-terminal:
+ *   - with no further "[" from index 2 onwards, the whole token becomes
+ *     one word;
+ *   - otherwise it is split at that "[": the first part is added only when
+ *     addSourceNonTerm is set, the second only when addTargetNonTerm is set.
+ * Any other token is a terminal and becomes one word.
+ */
+void Tokenize(OnDiskPt::Phrase &phrase
+              , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
+              , OnDiskPt::OnDiskWrapper &onDiskWrapper)
+{
+  const size_t tokSize = token.size();
+  // the second compare only runs when the token starts with "[", i.e. is non-empty
+  const bool isNonTerm = token.compare(0, 1, "[") == 0
+                         && token.compare(tokSize - 1, 1, "]") == 0;
+
+  if (!isNonTerm) {
+    // term
+    AddWordFromString(phrase, token, onDiskWrapper);
+    return;
+  }
+
+  const size_t splitPos = token.find_first_of("[", 2);
+  const string firstPart = token.substr(0, splitPos);
+
+  if (splitPos == string::npos) {
+    // lhs - only 1 word
+    AddWordFromString(phrase, firstPart, onDiskWrapper);
+    return;
+  }
+
+  // source & target non-terms
+  if (addSourceNonTerm) {
+    AddWordFromString(phrase, firstPart, onDiskWrapper);
+  }
+  if (addTargetNonTerm) {
+    AddWordFromString(phrase, token.substr(splitPos, tokSize - splitPos), onDiskWrapper);
+  }
+}
+
+/**
+ * Insert aligned target tokens into the source token list.
+ *
+ * The alignment list is walked from its last element to its first. For each
+ * pair (sourcePos, targetPos), targetToks[targetPos] is inserted into
+ * sourceToks directly after index sourcePos.
+ *
+ * NOTE(review): walking backwards only leaves the not-yet-processed positions
+ * untouched if `alignments` is ordered by source position (see SortAlign) -
+ * confirm callers sort first.
+ */
+void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments)
+{
+  // Reverse iterators replace the former `int` index, which narrowed
+  // alignments.size() from size_t.
+  for (::AlignType::const_reverse_iterator iter = alignments.rbegin(); iter != alignments.rend(); ++iter) {
+    const size_t sourcePos = iter->first;
+    const size_t targetPos = iter->second;
+
+    // Both positions must name existing tokens; otherwise the lookup and the
+    // insert below are undefined behaviour.
+    assert(targetPos < targetToks.size());
+    assert(sourcePos < sourceToks.size());
+
+    const string &target = targetToks[targetPos];
+    sourceToks.insert(sourceToks.begin() + sourcePos + 1, target);
+  }
+}
+
+// Comparison functor for alignment pairs: orders by the first element only.
+struct AlignOrderer {
+  bool operator()(const ::AlignPair &lhs, const ::AlignPair &rhs) const {
+    return lhs.first < rhs.first;
+  }
+};
+
+// Sort the alignment pairs by ascending first element. std::sort is not
+// stable, so the relative order of pairs with equal first elements is
+// unspecified.
+void SortAlign(::AlignType &alignments)
+{
+  AlignOrderer byFirst;
+  std::sort(alignments.begin(), alignments.end(), byFirst);
+}
diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h
new file mode 100644
index 000000000..41a24a239
--- /dev/null
+++ b/OnDiskPt/Main.h
@@ -0,0 +1,39 @@
+#pragma once
+// $Id$
+/***********************************************************************
+ Moses - factored phrase-based, hierarchical and syntactic language decoder
+ Copyright (C) 2009 Hieu Hoang
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+#include <string>
+#include "../OnDiskPt/SourcePhrase.h"
+#include "../OnDiskPt/TargetPhrase.h"
+
+// One alignment point. InsertTargetNonTerminals() reads `first` as a position
+// in the source tokens and `second` as a position in the target tokens.
+typedef std::pair<size_t, size_t> AlignPair;
+// A list of alignment points.
+typedef std::vector<AlignPair> AlignType;
+
+// Converts a single token into Word(s) appended to `phrase`. A token that
+// starts with "[" and ends with "]" is treated as a non-terminal; if it holds
+// a second "[", the part before it is added only when addSourceNonTerm is set
+// and the part from it onwards only when addTargetNonTerm is set.
+void Tokenize(OnDiskPt::Phrase &phrase
+ , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
+ , OnDiskPt::OnDiskWrapper &onDiskWrapper);
+// Parses one phrase-table line whose fields are separated by "|||" tokens:
+// source phrase, target phrase, scores, alignment, counts. `line` is split
+// with strtok() and therefore modified. Exactly numScores scores are expected
+// (asserted); one count value is written to misc[0].
+void Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
+ , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
+ , int numScores
+ , std::vector<float> &misc);
+
+// For each alignment point, taken from last to first, inserts
+// targetToks[second] into sourceToks directly after index `first`.
+void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
+// Sorts the alignment points by ascending `first`.
+void SortAlign(AlignType &alignments);
+// Returns false when prevSource is NULL; otherwise returns whether
+// *currSource > *prevSource.
+bool Flush(const OnDiskPt::SourcePhrase *prevSource, const OnDiskPt::SourcePhrase *currSource);
+
diff --git a/OnDiskPt/src/OnDiskWrapper.cpp b/OnDiskPt/OnDiskWrapper.cpp
index 79b0563a8..79b0563a8 100644
--- a/OnDiskPt/src/OnDiskWrapper.cpp
+++ b/OnDiskPt/OnDiskWrapper.cpp
diff --git a/OnDiskPt/src/OnDiskWrapper.h b/OnDiskPt/OnDiskWrapper.h
index 2d3d6ed64..c49afdda1 100644
--- a/OnDiskPt/src/OnDiskWrapper.h
+++ b/OnDiskPt/OnDiskWrapper.h
@@ -22,7 +22,7 @@
#include <fstream>
#include "Vocab.h"
#include "PhraseNode.h"
-#include "../../moses/src/Word.h"
+#include "../moses/src/Word.h"
namespace OnDiskPt
{
diff --git a/OnDiskPt/src/Phrase.cpp b/OnDiskPt/Phrase.cpp
index ce6dc208c..b6ccd0721 100644
--- a/OnDiskPt/src/Phrase.cpp
+++ b/OnDiskPt/Phrase.cpp
@@ -19,7 +19,7 @@
***********************************************************************/
#include <iostream>
#include "util/check.hh"
-#include "../../moses/src/Util.h"
+#include "../moses/src/Util.h"
#include "Phrase.h"
using namespace std;
diff --git a/OnDiskPt/src/Phrase.h b/OnDiskPt/Phrase.h
index 093510e64..093510e64 100644
--- a/OnDiskPt/src/Phrase.h
+++ b/OnDiskPt/Phrase.h
diff --git a/OnDiskPt/src/PhraseNode.cpp b/OnDiskPt/PhraseNode.cpp
index a479294e5..98a55dbc1 100644
--- a/OnDiskPt/src/PhraseNode.cpp
+++ b/OnDiskPt/PhraseNode.cpp
@@ -22,7 +22,7 @@
#include "OnDiskWrapper.h"
#include "TargetPhraseCollection.h"
#include "SourcePhrase.h"
-#include "../../moses/src/Util.h"
+#include "../moses/src/Util.h"
using namespace std;
diff --git a/OnDiskPt/src/PhraseNode.h b/OnDiskPt/PhraseNode.h
index 279ca278a..279ca278a 100644
--- a/OnDiskPt/src/PhraseNode.h
+++ b/OnDiskPt/PhraseNode.h
diff --git a/OnDiskPt/src/SourcePhrase.cpp b/OnDiskPt/SourcePhrase.cpp
index 595748c70..595748c70 100644
--- a/OnDiskPt/src/SourcePhrase.cpp
+++ b/OnDiskPt/SourcePhrase.cpp
diff --git a/OnDiskPt/src/SourcePhrase.h b/OnDiskPt/SourcePhrase.h
index b4ae46705..b4ae46705 100644
--- a/OnDiskPt/src/SourcePhrase.h
+++ b/OnDiskPt/SourcePhrase.h
diff --git a/OnDiskPt/src/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp
index 7480368e0..b740811d8 100644
--- a/OnDiskPt/src/TargetPhrase.cpp
+++ b/OnDiskPt/TargetPhrase.cpp
@@ -20,10 +20,10 @@
#include <algorithm>
#include <iostream>
-#include "../../moses/src/Util.h"
-#include "../../moses/src/TargetPhrase.h"
-#include "../../moses/src/PhraseDictionary.h"
-#include "../../moses/src/DummyScoreProducers.h"
+#include "../moses/src/Util.h"
+#include "../moses/src/TargetPhrase.h"
+#include "../moses/src/PhraseDictionary.h"
+#include "../moses/src/DummyScoreProducers.h"
#include "TargetPhrase.h"
#include "OnDiskWrapper.h"
diff --git a/OnDiskPt/src/TargetPhrase.h b/OnDiskPt/TargetPhrase.h
index 56c7b6d3f..56c7b6d3f 100644
--- a/OnDiskPt/src/TargetPhrase.h
+++ b/OnDiskPt/TargetPhrase.h
diff --git a/OnDiskPt/src/TargetPhraseCollection.cpp b/OnDiskPt/TargetPhraseCollection.cpp
index bb9c74364..b57ce4ee3 100644
--- a/OnDiskPt/src/TargetPhraseCollection.cpp
+++ b/OnDiskPt/TargetPhraseCollection.cpp
@@ -20,9 +20,9 @@
#include <algorithm>
#include <iostream>
-#include "../../moses/src/Util.h"
-#include "../../moses/src/TargetPhraseCollection.h"
-#include "../../moses/src/PhraseDictionary.h"
+#include "../moses/src/Util.h"
+#include "../moses/src/TargetPhraseCollection.h"
+#include "../moses/src/PhraseDictionary.h"
#include "TargetPhraseCollection.h"
#include "Vocab.h"
#include "OnDiskWrapper.h"
diff --git a/OnDiskPt/src/TargetPhraseCollection.h b/OnDiskPt/TargetPhraseCollection.h
index 6d95fb356..6d95fb356 100644
--- a/OnDiskPt/src/TargetPhraseCollection.h
+++ b/OnDiskPt/TargetPhraseCollection.h
diff --git a/OnDiskPt/src/Vocab.cpp b/OnDiskPt/Vocab.cpp
index 9c0470b32..86072edc6 100644
--- a/OnDiskPt/src/Vocab.cpp
+++ b/OnDiskPt/Vocab.cpp
@@ -21,7 +21,7 @@
#include <fstream>
#include "OnDiskWrapper.h"
#include "Vocab.h"
-#include "../../moses/src/FactorCollection.h"
+#include "../moses/src/FactorCollection.h"
using namespace std;
diff --git a/OnDiskPt/src/Vocab.h b/OnDiskPt/Vocab.h
index 9bb361251..360aedf4a 100644
--- a/OnDiskPt/src/Vocab.h
+++ b/OnDiskPt/Vocab.h
@@ -20,7 +20,7 @@
***********************************************************************/
#include <string>
#include <map>
-#include "../../moses/src/TypeDef.h"
+#include "../moses/src/TypeDef.h"
namespace Moses
{
diff --git a/OnDiskPt/src/Word.cpp b/OnDiskPt/Word.cpp
index 972f74584..a8d4c683a 100644
--- a/OnDiskPt/src/Word.cpp
+++ b/OnDiskPt/Word.cpp
@@ -18,8 +18,8 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#include "../../moses/src/Util.h"
-#include "../../moses/src/Word.h"
+#include "../moses/src/Util.h"
+#include "../moses/src/Word.h"
#include "Word.h"
using namespace std;
diff --git a/OnDiskPt/src/Word.h b/OnDiskPt/Word.h
index 0c0b55a09..0c0b55a09 100644
--- a/OnDiskPt/src/Word.h
+++ b/OnDiskPt/Word.h