diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2013-01-23 00:11:02 +0400 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2013-01-23 00:11:02 +0400 |
commit | fcf75fae18c618877a6c0e7d8ebcfcaf833f5e27 (patch) | |
tree | 81df4b210b6d2b1ae0672e16ffeae3bfe8702356 | |
parent | cfe7d00ea279717e730881b98795d510a9c30d26 (diff) |
Added option to specify directory or prefix for temporary files created during phrase table compacting
-rw-r--r-- | misc/processLexicalTableMin.cpp | 8 |
-rw-r--r-- | misc/processPhraseTableMin.cpp | 9 |
-rw-r--r-- | moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp | 46 |
-rw-r--r-- | moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h | 6 |
-rw-r--r-- | moses/TranslationModel/CompactPT/MmapAllocator.h | 2 |
-rw-r--r-- | moses/TranslationModel/CompactPT/PackedArray.h | 2 |
-rw-r--r-- | moses/TranslationModel/CompactPT/PhraseTableCreator.cpp | 38 |
-rw-r--r-- | moses/TranslationModel/CompactPT/PhraseTableCreator.h | 6 |
-rw-r--r-- | moses/TranslationModel/CompactPT/StringVector.h | 33 |
9 files changed, 112 insertions, 38 deletions
diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp index 23fda1f93..32373154b 100644 --- a/misc/processLexicalTableMin.cpp +++ b/misc/processLexicalTableMin.cpp @@ -15,6 +15,7 @@ void printHelp(char **argv) " options: \n" "\t-in string -- input table file name\n" "\t-out string -- prefix of binary table file\n" + "\t-T string -- path to temporary directory (uses /tmp by default)\n" #ifdef WITH_THREADS "\t-threads int|all -- number of threads used for conversion\n" #endif @@ -44,6 +45,7 @@ int main(int argc, char** argv) std::string inFilePath; std::string outFilePath("out"); + std::string tempfilePath; size_t orderBits = 10; size_t fingerPrintBits = 16; @@ -72,6 +74,10 @@ int main(int argc, char** argv) ++i; outFilePath = argv[i]; } + else if("-T" == arg && i+1 < argc) { + ++i; + tempfilePath = argv[i]; + } else if("-landmark" == arg && i+1 < argc) { ++i; @@ -121,7 +127,7 @@ int main(int argc, char** argv) outFilePath += ".minlexr"; LexicalReorderingTableCreator( - inFilePath, outFilePath, + inFilePath, outFilePath, tempfilePath, orderBits, fingerPrintBits, multipleScoreTrees, quantize #ifdef WITH_THREADS diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp index 3b313a484..ecbe81186 100644 --- a/misc/processPhraseTableMin.cpp +++ b/misc/processPhraseTableMin.cpp @@ -14,6 +14,7 @@ void printHelp(char **argv) { " options: \n" "\t-in string -- input table file name\n" "\t-out string -- prefix of binary table file\n" + "\t-T string -- path to temporary directory (uses /tmp by default)\n" "\t-nscores int -- number of score components in phrase table\n" "\t-no-alignment-info -- do not include alignment info in the binary phrase table\n" #ifdef WITH_THREADS @@ -49,6 +50,7 @@ int main(int argc, char **argv) { std::string inFilePath; std::string outFilePath("out"); + std::string tempfilePath; PhraseTableCreator::Coding coding = PhraseTableCreator::PREnc; size_t numScoreComponent = 5; @@ -77,6 +79,10 @@ int main(int argc, 
char **argv) { ++i; outFilePath = argv[i]; } + else if("-T" == arg && i+1 < argc) { + ++i; + tempfilePath = argv[i]; + } else if("-encoding" == arg && i+1 < argc) { ++i; std::string val(argv[i]); @@ -166,7 +172,8 @@ int main(int argc, char **argv) { if(outFilePath.rfind(".minphr") != outFilePath.size() - 7) outFilePath += ".minphr"; - PhraseTableCreator(inFilePath, outFilePath, numScoreComponent, sortScoreIndex, + PhraseTableCreator(inFilePath, outFilePath, tempfilePath, + numScoreComponent, sortScoreIndex, coding, orderBits, fingerprintBits, useAlignmentInfo, multipleScoreTrees, quantize, maxRank, warnMe diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index f5c93ace8..a3eee1694 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -23,21 +23,23 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ThrowingFwrite.h" #include "moses/Util.h" +#include "util/file.hh" + namespace Moses { LexicalReorderingTableCreator::LexicalReorderingTableCreator( - std::string inPath, std::string outPath, + std::string inPath, std::string outPath, std::string tempfilePath, size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees, size_t quantize #ifdef WITH_THREADS , size_t threads #endif ) - : m_inPath(inPath), m_outPath(outPath), m_orderBits(orderBits), - m_fingerPrintBits(fingerPrintBits), m_numScoreComponent(0), - m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize), - m_separator(" ||| "), m_hash(m_orderBits, m_fingerPrintBits), - m_lastFlushedLine(-1) + : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath), + m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees), + m_quantize(quantize), m_separator(" ||| "), + m_hash(m_orderBits, 
m_fingerPrintBits), m_lastFlushedLine(-1) #ifdef WITH_THREADS , m_threads(threads) #endif @@ -48,12 +50,31 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); + + + if(tempfilePath.size()) { + MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath)); + m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded); + } + else { + m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(); + } + EncodeScores(); std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; CalcHuffmanCodes(); std::cerr << "Pass 2/2: Compressing scores" << std::endl; + + + if(tempfilePath.size()) { + MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath)); + m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed); + } + else { + m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(); + } CompressScores(); std::cerr << "Saving to " << m_outPath << std::endl; @@ -88,6 +109,9 @@ LexicalReorderingTableCreator::~LexicalReorderingTableCreator() delete m_scoreTrees[i]; delete m_scoreCounters[i]; } + + delete m_encodedScores; + delete m_compressedScores; } @@ -134,12 +158,12 @@ void LexicalReorderingTableCreator::CompressScores() #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { - CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this); + CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this); threads.create_thread(*ct); } threads.join_all(); #else - CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this); + CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this); (*ct)(); delete ct; #endif @@ -153,7 +177,7 @@ void 
LexicalReorderingTableCreator::Save() for(size_t i = 0; i < m_scoreTrees.size(); i++) m_scoreTrees[i]->Save(m_outFile); - m_compressedScores.save(m_outFile); + m_compressedScores->save(m_outFile); } std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target) @@ -218,7 +242,7 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) { m_lastFlushedLine++; m_lastRange.push_back(pi.GetSrc()); - m_encodedScores.push_back(pi.GetTrg()); + m_encodedScores->push_back(pi.GetTrg()); if((pi.GetLine()+1) % 100000 == 0) std::cerr << "."; @@ -293,7 +317,7 @@ void LexicalReorderingTableCreator::FlushCompressedQueue(bool force) m_queue.pop(); m_lastFlushedLine++; - m_compressedScores.push_back(pi.GetTrg()); + m_compressedScores->push_back(pi.GetTrg()); if((pi.GetLine()+1) % 100000 == 0) std::cerr << "."; diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h index 68b37dcb8..2e202ce9b 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h @@ -30,6 +30,7 @@ class LexicalReorderingTableCreator { private: std::string m_inPath; std::string m_outPath; + std::string m_tempfilePath; std::FILE* m_outFile; @@ -51,8 +52,8 @@ class LexicalReorderingTableCreator { std::vector<ScoreCounter*> m_scoreCounters; std::vector<ScoreTree*> m_scoreTrees; - StringVector<unsigned char, unsigned long, MmapAllocator> m_encodedScores; - StringVector<unsigned char, unsigned long, MmapAllocator> m_compressedScores; + StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores; + StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores; std::priority_queue<PackedItem> m_queue; long m_lastFlushedLine; @@ -84,6 +85,7 @@ class LexicalReorderingTableCreator { public: LexicalReorderingTableCreator(std::string inPath, std::string outPath, + 
std::string tempfilePath, size_t orderBits = 10, size_t fingerPrintBits = 16, bool multipleScoreTrees = true, diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index c4655692b..049c0149d 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -66,7 +66,7 @@ namespace Moses m_data_offset(0), m_fixed(false), m_count(new size_t(0)) { } - MmapAllocator(std::FILE* f_ptr, size_t data_offset = 0) throw() + MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw() : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) diff --git a/moses/TranslationModel/CompactPT/PackedArray.h b/moses/TranslationModel/CompactPT/PackedArray.h index b74a98850..ad4596546 100644 --- a/moses/TranslationModel/CompactPT/PackedArray.h +++ b/moses/TranslationModel/CompactPT/PackedArray.h @@ -66,7 +66,7 @@ class PackedArray std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D)); } - ~PackedArray() + virtual ~PackedArray() { delete [] m_storage; m_size = 0; diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp index c62305b99..7ff0292fe 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -25,6 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ConsistentPhrases.h" #include "ThrowingFwrite.h" +#include "util/file.hh" + namespace Moses { @@ -40,6 +42,7 @@ std::string PhraseTableCreator::m_separator = " ||| "; PhraseTableCreator::PhraseTableCreator(std::string inPath, std::string outPath, + std::string tempfilePath, size_t numScoreComponent, size_t sortScoreIndex, Coding coding, @@ -54,7 +57,7 @@ 
PhraseTableCreator::PhraseTableCreator(std::string inPath, , size_t threads #endif ) - : m_inPath(inPath), m_outPath(outPath), + : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath), m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent), m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe), m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), @@ -108,7 +111,15 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, // 1st pass std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl; - m_srcHash.BeginSave(m_outFile); + m_srcHash.BeginSave(m_outFile); + + if(tempfilePath.size()) { + MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath)); + m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded); + } + else { + m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(); + } EncodeTargetPhrases(); cur_pass++; @@ -118,6 +129,14 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, // 2nd pass std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl; + + if(tempfilePath.size()) { + MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath)); + m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed); + } + else { + m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(); + } CompressTargetPhrases(); std::cerr << "Saving to " << m_outPath << std::endl; @@ -135,6 +154,9 @@ PhraseTableCreator::~PhraseTableCreator() delete m_scoreTrees[i]; delete m_scoreCounters[i]; } + + delete m_encodedTargetPhrases; + delete m_compressedTargetPhrases; } void PhraseTableCreator::PrintInfo() @@ -230,7 +252,7 @@ void PhraseTableCreator::Save() m_alignTree->Save(m_outFile); // Save compressed target 
phrase collections - m_compressedTargetPhrases.save(m_outFile); + m_compressedTargetPhrases->save(m_outFile); } void PhraseTableCreator::LoadLexicalTable(std::string filePath) @@ -355,12 +377,12 @@ void PhraseTableCreator::CompressTargetPhrases() #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { - CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this); + CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); threads.create_thread(*ct); } threads.join_all(); #else - CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this); + CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); (*ct)(); delete ct; #endif @@ -940,7 +962,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force) targetPhraseCollection << *it; m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); - m_encodedTargetPhrases.push_back(targetPhraseCollection.str()); + m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); m_lastFlushedSourceNum++; if(m_lastFlushedSourceNum % 100000 == 0) @@ -982,7 +1004,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force) m_lastCollection.begin(); it != m_lastCollection.end(); it++) targetPhraseCollection << *it; - m_encodedTargetPhrases.push_back(targetPhraseCollection.str()); + m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); m_lastCollection.clear(); } @@ -1019,7 +1041,7 @@ void PhraseTableCreator::FlushCompressedQueue(bool force) m_queue.pop(); m_lastFlushedLine++; - m_compressedTargetPhrases.push_back(pi.GetTrg()); + m_compressedTargetPhrases->push_back(pi.GetTrg()); if((pi.GetLine()+1) % 100000 == 0) std::cerr << "."; diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.h b/moses/TranslationModel/CompactPT/PhraseTableCreator.h index cadd6a2bf..ded3a84eb 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.h +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.h @@ -196,6 
+196,7 @@ class PhraseTableCreator private: std::string m_inPath; std::string m_outPath; + std::string m_tempfilePath; std::FILE* m_outFile; @@ -252,10 +253,10 @@ class PhraseTableCreator std::vector<size_t> m_lexicalTableIndex; std::vector<SrcTrg> m_lexicalTable; - StringVector<unsigned char, unsigned long, MmapAllocator> + StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedTargetPhrases; - StringVector<unsigned char, unsigned long, MmapAllocator> + StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedTargetPhrases; boost::unordered_map<std::string, unsigned> m_targetSymbolsMap; @@ -346,6 +347,7 @@ class PhraseTableCreator PhraseTableCreator(std::string inPath, std::string outPath, + std::string tempfilePath, size_t numScoreComponent = 5, size_t sortScoreIndex = 2, Coding coding = PREnc, diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index 76146176d..fcc545a19 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -79,11 +79,12 @@ template <typename ValueT = unsigned char, typename PosT = unsigned int, class StringVector { protected: - std::vector<ValueT, Allocator<ValueT> > m_charArray; - MonotonicVector<PosT, unsigned int, 32, Allocator> m_positions; bool m_sorted; bool m_memoryMapped; - + + std::vector<ValueT, Allocator<ValueT> >* m_charArray; + MonotonicVector<PosT, unsigned int, 32> m_positions; + virtual const ValueT* value_ptr(PosT i) const; public: @@ -148,12 +149,18 @@ class StringVector typedef StringIterator string_iterator; StringVector(); + StringVector(Allocator<ValueT> alloc); + + virtual ~StringVector() + { + delete m_charArray; + } void swap(StringVector<ValueT, PosT, Allocator> &c) { m_positions.commit(); m_positions.swap(c.m_positions); - m_charArray.swap(c.m_charArray); + m_charArray->swap(*c.m_charArray); bool temp = m_sorted; m_sorted = c.m_sorted; @@ -176,7 +183,7 @@ class 
StringVector void clear() { - m_charArray.clear(); + m_charArray->clear(); m_sorted = true; m_positions = MonotonicVector<PosT, unsigned int, 32>(); } @@ -201,7 +208,7 @@ class StringVector size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); size += m_positions.load(in, m_memoryMapped); - size += loadCharArray(m_charArray, in, m_memoryMapped); + size += loadCharArray(*m_charArray, in, m_memoryMapped); return size; } @@ -272,7 +279,7 @@ class StringVector size_t valSize = size2(); byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t); - byteSize += ThrowingFwrite(&m_charArray[0], sizeof(ValueT), valSize, out) * sizeof(ValueT); + byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT); return byteSize; } @@ -374,7 +381,11 @@ OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr) template<typename ValueT, typename PosT, template <typename> class Allocator> StringVector<ValueT, PosT, Allocator>::StringVector() - : m_sorted(true), m_memoryMapped(false) { } + : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { } + +template<typename ValueT, typename PosT, template <typename> class Allocator> +StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> alloc) + : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { } template<typename ValueT, typename PosT, template <typename> class Allocator> template <typename StringT> @@ -384,7 +395,7 @@ void StringVector<ValueT, PosT, Allocator>::push_back(StringT s) m_sorted = false; m_positions.push_back(size2()); - std::copy(s.begin(), s.end(), std::back_inserter(m_charArray)); + std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray)); } template<typename ValueT, typename PosT, template <typename> class Allocator> @@ -435,7 +446,7 @@ PosT StringVector<ValueT, PosT, Allocator>::size() const template<typename ValueT, 
typename PosT, template <typename> class Allocator> PosT StringVector<ValueT, PosT, Allocator>::size2() const { - return m_charArray.size(); + return m_charArray->size(); } template<typename ValueT, typename PosT, template <typename> class Allocator> @@ -468,7 +479,7 @@ PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const template<typename ValueT, typename PosT, template <typename> class Allocator> const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const { - return &m_charArray[m_positions[i]]; + return &(*m_charArray)[m_positions[i]]; } template<typename ValueT, typename PosT, template <typename> class Allocator> |