Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2013-01-23 00:11:02 +0400
committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2013-01-23 00:11:02 +0400
commit: fcf75fae18c618877a6c0e7d8ebcfcaf833f5e27 (patch)
tree: 81df4b210b6d2b1ae0672e16ffeae3bfe8702356
parent: cfe7d00ea279717e730881b98795d510a9c30d26 (diff)
Added option to specify directory or prefix for temporary files created during phrase table compacting
-rw-r--r-- misc/processLexicalTableMin.cpp | 8
-rw-r--r-- misc/processPhraseTableMin.cpp | 9
-rw-r--r-- moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp | 46
-rw-r--r-- moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h | 6
-rw-r--r-- moses/TranslationModel/CompactPT/MmapAllocator.h | 2
-rw-r--r-- moses/TranslationModel/CompactPT/PackedArray.h | 2
-rw-r--r-- moses/TranslationModel/CompactPT/PhraseTableCreator.cpp | 38
-rw-r--r-- moses/TranslationModel/CompactPT/PhraseTableCreator.h | 6
-rw-r--r-- moses/TranslationModel/CompactPT/StringVector.h | 33
9 files changed, 112 insertions, 38 deletions
diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp
index 23fda1f93..32373154b 100644
--- a/misc/processLexicalTableMin.cpp
+++ b/misc/processLexicalTableMin.cpp
@@ -15,6 +15,7 @@ void printHelp(char **argv)
" options: \n"
"\t-in string -- input table file name\n"
"\t-out string -- prefix of binary table file\n"
+ "\t-T string -- path to temporary directory (uses /tmp by default)\n"
#ifdef WITH_THREADS
"\t-threads int|all -- number of threads used for conversion\n"
#endif
@@ -44,6 +45,7 @@ int main(int argc, char** argv)
std::string inFilePath;
std::string outFilePath("out");
+ std::string tempfilePath;
size_t orderBits = 10;
size_t fingerPrintBits = 16;
@@ -72,6 +74,10 @@ int main(int argc, char** argv)
++i;
outFilePath = argv[i];
}
+ else if("-T" == arg && i+1 < argc) {
+ ++i;
+ tempfilePath = argv[i];
+ }
else if("-landmark" == arg && i+1 < argc)
{
++i;
@@ -121,7 +127,7 @@ int main(int argc, char** argv)
outFilePath += ".minlexr";
LexicalReorderingTableCreator(
- inFilePath, outFilePath,
+ inFilePath, outFilePath, tempfilePath,
orderBits, fingerPrintBits,
multipleScoreTrees, quantize
#ifdef WITH_THREADS
diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp
index 3b313a484..ecbe81186 100644
--- a/misc/processPhraseTableMin.cpp
+++ b/misc/processPhraseTableMin.cpp
@@ -14,6 +14,7 @@ void printHelp(char **argv) {
" options: \n"
"\t-in string -- input table file name\n"
"\t-out string -- prefix of binary table file\n"
+ "\t-T string -- path to temporary directory (uses /tmp by default)\n"
"\t-nscores int -- number of score components in phrase table\n"
"\t-no-alignment-info -- do not include alignment info in the binary phrase table\n"
#ifdef WITH_THREADS
@@ -49,6 +50,7 @@ int main(int argc, char **argv) {
std::string inFilePath;
std::string outFilePath("out");
+ std::string tempfilePath;
PhraseTableCreator::Coding coding = PhraseTableCreator::PREnc;
size_t numScoreComponent = 5;
@@ -77,6 +79,10 @@ int main(int argc, char **argv) {
++i;
outFilePath = argv[i];
}
+ else if("-T" == arg && i+1 < argc) {
+ ++i;
+ tempfilePath = argv[i];
+ }
else if("-encoding" == arg && i+1 < argc) {
++i;
std::string val(argv[i]);
@@ -166,7 +172,8 @@ int main(int argc, char **argv) {
if(outFilePath.rfind(".minphr") != outFilePath.size() - 7)
outFilePath += ".minphr";
- PhraseTableCreator(inFilePath, outFilePath, numScoreComponent, sortScoreIndex,
+ PhraseTableCreator(inFilePath, outFilePath, tempfilePath,
+ numScoreComponent, sortScoreIndex,
coding, orderBits, fingerprintBits,
useAlignmentInfo, multipleScoreTrees,
quantize, maxRank, warnMe
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
index f5c93ace8..a3eee1694 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
@@ -23,21 +23,23 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ThrowingFwrite.h"
#include "moses/Util.h"
+#include "util/file.hh"
+
namespace Moses {
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
- std::string inPath, std::string outPath,
+ std::string inPath, std::string outPath, std::string tempfilePath,
size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
size_t quantize
#ifdef WITH_THREADS
, size_t threads
#endif
)
- : m_inPath(inPath), m_outPath(outPath), m_orderBits(orderBits),
- m_fingerPrintBits(fingerPrintBits), m_numScoreComponent(0),
- m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize),
- m_separator(" ||| "), m_hash(m_orderBits, m_fingerPrintBits),
- m_lastFlushedLine(-1)
+ : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
+ m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
+ m_quantize(quantize), m_separator(" ||| "),
+ m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
#ifdef WITH_THREADS
, m_threads(threads)
#endif
@@ -48,12 +50,31 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
m_hash.BeginSave(m_outFile);
+
+
+ if(tempfilePath.size()) {
+ MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
+ m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
+ }
+ else {
+ m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
+ }
+
EncodeScores();
std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
CalcHuffmanCodes();
std::cerr << "Pass 2/2: Compressing scores" << std::endl;
+
+
+ if(tempfilePath.size()) {
+ MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
+ m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
+ }
+ else {
+ m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
+ }
CompressScores();
std::cerr << "Saving to " << m_outPath << std::endl;
@@ -88,6 +109,9 @@ LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
+
+ delete m_encodedScores;
+ delete m_compressedScores;
}
@@ -134,12 +158,12 @@ void LexicalReorderingTableCreator::CompressScores()
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
- CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this);
+ CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
threads.create_thread(*ct);
}
threads.join_all();
#else
- CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this);
+ CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
(*ct)();
delete ct;
#endif
@@ -153,7 +177,7 @@ void LexicalReorderingTableCreator::Save()
for(size_t i = 0; i < m_scoreTrees.size(); i++)
m_scoreTrees[i]->Save(m_outFile);
- m_compressedScores.save(m_outFile);
+ m_compressedScores->save(m_outFile);
}
std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
@@ -218,7 +242,7 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
m_lastFlushedLine++;
m_lastRange.push_back(pi.GetSrc());
- m_encodedScores.push_back(pi.GetTrg());
+ m_encodedScores->push_back(pi.GetTrg());
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";
@@ -293,7 +317,7 @@ void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
m_queue.pop();
m_lastFlushedLine++;
- m_compressedScores.push_back(pi.GetTrg());
+ m_compressedScores->push_back(pi.GetTrg());
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
index 68b37dcb8..2e202ce9b 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
@@ -30,6 +30,7 @@ class LexicalReorderingTableCreator {
private:
std::string m_inPath;
std::string m_outPath;
+ std::string m_tempfilePath;
std::FILE* m_outFile;
@@ -51,8 +52,8 @@ class LexicalReorderingTableCreator {
std::vector<ScoreCounter*> m_scoreCounters;
std::vector<ScoreTree*> m_scoreTrees;
- StringVector<unsigned char, unsigned long, MmapAllocator> m_encodedScores;
- StringVector<unsigned char, unsigned long, MmapAllocator> m_compressedScores;
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
std::priority_queue<PackedItem> m_queue;
long m_lastFlushedLine;
@@ -84,6 +85,7 @@ class LexicalReorderingTableCreator {
public:
LexicalReorderingTableCreator(std::string inPath,
std::string outPath,
+ std::string tempfilePath,
size_t orderBits = 10,
size_t fingerPrintBits = 16,
bool multipleScoreTrees = true,
diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h
index c4655692b..049c0149d 100644
--- a/moses/TranslationModel/CompactPT/MmapAllocator.h
+++ b/moses/TranslationModel/CompactPT/MmapAllocator.h
@@ -66,7 +66,7 @@ namespace Moses
m_data_offset(0), m_fixed(false), m_count(new size_t(0))
{ }
- MmapAllocator(std::FILE* f_ptr, size_t data_offset = 0) throw()
+ MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0))
diff --git a/moses/TranslationModel/CompactPT/PackedArray.h b/moses/TranslationModel/CompactPT/PackedArray.h
index b74a98850..ad4596546 100644
--- a/moses/TranslationModel/CompactPT/PackedArray.h
+++ b/moses/TranslationModel/CompactPT/PackedArray.h
@@ -66,7 +66,7 @@ class PackedArray
std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
}
- ~PackedArray()
+ virtual ~PackedArray()
{
delete [] m_storage;
m_size = 0;
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
index c62305b99..7ff0292fe 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
@@ -25,6 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ConsistentPhrases.h"
#include "ThrowingFwrite.h"
+#include "util/file.hh"
+
namespace Moses
{
@@ -40,6 +42,7 @@ std::string PhraseTableCreator::m_separator = " ||| ";
PhraseTableCreator::PhraseTableCreator(std::string inPath,
std::string outPath,
+ std::string tempfilePath,
size_t numScoreComponent,
size_t sortScoreIndex,
Coding coding,
@@ -54,7 +57,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
, size_t threads
#endif
)
- : m_inPath(inPath), m_outPath(outPath),
+ : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
@@ -108,7 +111,15 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
// 1st pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
- m_srcHash.BeginSave(m_outFile);
+ m_srcHash.BeginSave(m_outFile);
+
+ if(tempfilePath.size()) {
+ MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
+ m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
+ }
+ else {
+ m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
+ }
EncodeTargetPhrases();
cur_pass++;
@@ -118,6 +129,14 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
// 2nd pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;
+
+ if(tempfilePath.size()) {
+ MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
+ m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
+ }
+ else {
+ m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
+ }
CompressTargetPhrases();
std::cerr << "Saving to " << m_outPath << std::endl;
@@ -135,6 +154,9 @@ PhraseTableCreator::~PhraseTableCreator()
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
+
+ delete m_encodedTargetPhrases;
+ delete m_compressedTargetPhrases;
}
void PhraseTableCreator::PrintInfo()
@@ -230,7 +252,7 @@ void PhraseTableCreator::Save()
m_alignTree->Save(m_outFile);
// Save compressed target phrase collections
- m_compressedTargetPhrases.save(m_outFile);
+ m_compressedTargetPhrases->save(m_outFile);
}
void PhraseTableCreator::LoadLexicalTable(std::string filePath)
@@ -355,12 +377,12 @@ void PhraseTableCreator::CompressTargetPhrases()
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
- CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this);
+ CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
threads.create_thread(*ct);
}
threads.join_all();
#else
- CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this);
+ CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
(*ct)();
delete ct;
#endif
@@ -940,7 +962,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
targetPhraseCollection << *it;
m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
- m_encodedTargetPhrases.push_back(targetPhraseCollection.str());
+ m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
m_lastFlushedSourceNum++;
if(m_lastFlushedSourceNum % 100000 == 0)
@@ -982,7 +1004,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
m_lastCollection.begin(); it != m_lastCollection.end(); it++)
targetPhraseCollection << *it;
- m_encodedTargetPhrases.push_back(targetPhraseCollection.str());
+ m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
m_lastCollection.clear();
}
@@ -1019,7 +1041,7 @@ void PhraseTableCreator::FlushCompressedQueue(bool force)
m_queue.pop();
m_lastFlushedLine++;
- m_compressedTargetPhrases.push_back(pi.GetTrg());
+ m_compressedTargetPhrases->push_back(pi.GetTrg());
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.h b/moses/TranslationModel/CompactPT/PhraseTableCreator.h
index cadd6a2bf..ded3a84eb 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.h
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.h
@@ -196,6 +196,7 @@ class PhraseTableCreator
private:
std::string m_inPath;
std::string m_outPath;
+ std::string m_tempfilePath;
std::FILE* m_outFile;
@@ -252,10 +253,10 @@ class PhraseTableCreator
std::vector<size_t> m_lexicalTableIndex;
std::vector<SrcTrg> m_lexicalTable;
- StringVector<unsigned char, unsigned long, MmapAllocator>
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
m_encodedTargetPhrases;
- StringVector<unsigned char, unsigned long, MmapAllocator>
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
m_compressedTargetPhrases;
boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
@@ -346,6 +347,7 @@ class PhraseTableCreator
PhraseTableCreator(std::string inPath,
std::string outPath,
+ std::string tempfilePath,
size_t numScoreComponent = 5,
size_t sortScoreIndex = 2,
Coding coding = PREnc,
diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h
index 76146176d..fcc545a19 100644
--- a/moses/TranslationModel/CompactPT/StringVector.h
+++ b/moses/TranslationModel/CompactPT/StringVector.h
@@ -79,11 +79,12 @@ template <typename ValueT = unsigned char, typename PosT = unsigned int,
class StringVector
{
protected:
- std::vector<ValueT, Allocator<ValueT> > m_charArray;
- MonotonicVector<PosT, unsigned int, 32, Allocator> m_positions;
bool m_sorted;
bool m_memoryMapped;
-
+
+ std::vector<ValueT, Allocator<ValueT> >* m_charArray;
+ MonotonicVector<PosT, unsigned int, 32> m_positions;
+
virtual const ValueT* value_ptr(PosT i) const;
public:
@@ -148,12 +149,18 @@ class StringVector
typedef StringIterator string_iterator;
StringVector();
+ StringVector(Allocator<ValueT> alloc);
+
+ virtual ~StringVector()
+ {
+ delete m_charArray;
+ }
void swap(StringVector<ValueT, PosT, Allocator> &c)
{
m_positions.commit();
m_positions.swap(c.m_positions);
- m_charArray.swap(c.m_charArray);
+ m_charArray->swap(*c.m_charArray);
bool temp = m_sorted;
m_sorted = c.m_sorted;
@@ -176,7 +183,7 @@ class StringVector
void clear()
{
- m_charArray.clear();
+ m_charArray->clear();
m_sorted = true;
m_positions = MonotonicVector<PosT, unsigned int, 32>();
}
@@ -201,7 +208,7 @@ class StringVector
size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
size += m_positions.load(in, m_memoryMapped);
- size += loadCharArray(m_charArray, in, m_memoryMapped);
+ size += loadCharArray(*m_charArray, in, m_memoryMapped);
return size;
}
@@ -272,7 +279,7 @@ class StringVector
size_t valSize = size2();
byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_charArray[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
+ byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
return byteSize;
}
@@ -374,7 +381,11 @@ OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector()
- : m_sorted(true), m_memoryMapped(false) { }
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
+
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> alloc)
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename StringT>
@@ -384,7 +395,7 @@ void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
m_sorted = false;
m_positions.push_back(size2());
- std::copy(s.begin(), s.end(), std::back_inserter(m_charArray));
+ std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@@ -435,7 +446,7 @@ PosT StringVector<ValueT, PosT, Allocator>::size() const
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::size2() const
{
- return m_charArray.size();
+ return m_charArray->size();
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@@ -468,7 +479,7 @@ PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
template<typename ValueT, typename PosT, template <typename> class Allocator>
const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
{
- return &m_charArray[m_positions[i]];
+ return &(*m_charArray)[m_positions[i]];
}
template<typename ValueT, typename PosT, template <typename> class Allocator>