Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2017-02-01 01:21:59 +0300
committer Hieu Hoang <hieuhoang@gmail.com> 2017-02-01 01:21:59 +0300
commit a8a5b43f2dc32bd1b45006fd43989dc71e74ba0e (patch)
tree e84a78fa005e29ec78076d6e525371240871122c /moses2/TranslationModel
parent 7206d592751ee9afeb1fa4753b7e19272e2585bc (diff)
move moses2 to root
Diffstat (limited to 'moses2/TranslationModel')
-rw-r--r-- moses2/TranslationModel/CompactPT/BlockHashIndex.cpp | 418
-rw-r--r-- moses2/TranslationModel/CompactPT/BlockHashIndex.h | 200
-rw-r--r-- moses2/TranslationModel/CompactPT/CanonicalHuffman.h | 345
-rw-r--r-- moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp | 95
-rw-r--r-- moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h | 108
-rw-r--r-- moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp | 173
-rw-r--r-- moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h | 143
-rw-r--r-- moses2/TranslationModel/CompactPT/ListCoders.h | 394
-rw-r--r-- moses2/TranslationModel/CompactPT/MmapAllocator.h | 217
-rw-r--r-- moses2/TranslationModel/CompactPT/MonotonicVector.h | 247
-rw-r--r-- moses2/TranslationModel/CompactPT/MurmurHash3.cpp | 424
-rw-r--r-- moses2/TranslationModel/CompactPT/MurmurHash3.h | 37
-rw-r--r-- moses2/TranslationModel/CompactPT/PackedArray.h | 207
-rw-r--r-- moses2/TranslationModel/CompactPT/PhraseDecoder.cpp | 466
-rw-r--r-- moses2/TranslationModel/CompactPT/PhraseDecoder.h | 142
-rw-r--r-- moses2/TranslationModel/CompactPT/PhraseTableCompact.cpp | 222
-rw-r--r-- moses2/TranslationModel/CompactPT/PhraseTableCompact.h | 68
-rw-r--r-- moses2/TranslationModel/CompactPT/StringVector.h | 662
-rw-r--r-- moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp | 39
-rw-r--r-- moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h | 176
-rw-r--r-- moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp | 30
-rw-r--r-- moses2/TranslationModel/CompactPT/ThrowingFwrite.h | 31
-rw-r--r-- moses2/TranslationModel/Memory/Node.h | 138
-rw-r--r-- moses2/TranslationModel/Memory/PhraseTableMemory.cpp | 268
-rw-r--r-- moses2/TranslationModel/Memory/PhraseTableMemory.h | 85
-rw-r--r-- moses2/TranslationModel/PhraseTable.cpp | 183
-rw-r--r-- moses2/TranslationModel/PhraseTable.h | 128
-rw-r--r-- moses2/TranslationModel/ProbingPT/ProbingPT.cpp | 756
-rw-r--r-- moses2/TranslationModel/ProbingPT/ProbingPT.h | 159
-rw-r--r-- moses2/TranslationModel/ProbingPT/StoreTarget.cpp | 266
-rw-r--r-- moses2/TranslationModel/ProbingPT/StoreTarget.h | 51
-rw-r--r-- moses2/TranslationModel/ProbingPT/StoreVocab.cpp | 13
-rw-r--r-- moses2/TranslationModel/ProbingPT/StoreVocab.h | 64
-rw-r--r-- moses2/TranslationModel/ProbingPT/hash.cpp | 44
-rw-r--r-- moses2/TranslationModel/ProbingPT/hash.hh | 17
-rw-r--r-- moses2/TranslationModel/ProbingPT/line_splitter.cpp | 103
-rw-r--r-- moses2/TranslationModel/ProbingPT/line_splitter.hh | 59
-rw-r--r-- moses2/TranslationModel/ProbingPT/probing_hash_utils.cpp | 40
-rw-r--r-- moses2/TranslationModel/ProbingPT/probing_hash_utils.hh | 55
-rw-r--r-- moses2/TranslationModel/ProbingPT/querying.cpp | 180
-rw-r--r-- moses2/TranslationModel/ProbingPT/querying.hh | 77
-rw-r--r-- moses2/TranslationModel/ProbingPT/storing.cpp | 303
-rw-r--r-- moses2/TranslationModel/ProbingPT/storing.hh | 95
-rw-r--r-- moses2/TranslationModel/ProbingPT/vocabid.cpp | 59
-rw-r--r-- moses2/TranslationModel/ProbingPT/vocabid.hh | 29
-rw-r--r-- moses2/TranslationModel/Transliteration.cpp | 229
-rw-r--r-- moses2/TranslationModel/Transliteration.h | 91
-rw-r--r-- moses2/TranslationModel/UnknownWordPenalty.cpp | 285
-rw-r--r-- moses2/TranslationModel/UnknownWordPenalty.h | 89
49 files changed, 8710 insertions, 0 deletions
diff --git a/moses2/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses2/TranslationModel/CompactPT/BlockHashIndex.cpp
new file mode 100644
index 000000000..338a8e221
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/BlockHashIndex.cpp
@@ -0,0 +1,418 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "ThrowingFwrite.h"
+#include "BlockHashIndex.h"
+#include "CmphStringVectorAdapter.h"
+#include "util/exception.hh"
+#include "util/string_stream.hh"
+
+#ifdef HAVE_CMPH
+#include "cmph.h"
+#endif
+
+namespace Moses2
+{
+#ifdef WITH_THREADS
+// Threaded constructor: records the two bit widths, resets all bookkeeping
+// members and constructs the thread pool with `threadsNum`.
+// When the build lacks CMPH support it prints a message and exits the process.
+BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
+                               size_t threadsNum)
+  : m_orderBits(orderBits),
+    m_fingerPrintBits(fingerPrintBits),
+    m_fileHandle(0),
+    m_fileHandleStart(0),
+    m_landmarks(true),
+    m_size(0),
+    m_lastSaved(-1),
+    m_lastDropped(-1),
+    m_numLoadedRanges(0),
+    m_threadPool(threadsNum)
+{
+#ifndef HAVE_CMPH
+  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+  exit(1);
+#endif
+}
+#else
+// Single-threaded constructor: records the two bit widths and resets all
+// bookkeeping members.
+// When the build lacks CMPH support it prints a message and exits the process.
+// NOTE(review): unlike the WITH_THREADS constructor this one default-constructs
+// m_landmarks instead of passing `true` - confirm that this is intended.
+BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
+  : m_orderBits(orderBits),
+    m_fingerPrintBits(fingerPrintBits),
+    m_fileHandle(0),
+    m_fileHandleStart(0),
+    m_size(0),
+    m_lastSaved(-1),
+    m_lastDropped(-1),
+    m_numLoadedRanges(0)
+{
+#ifndef HAVE_CMPH
+  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+  exit(1);
+#endif
+}
+#endif
+
+// Destroys every CMPH hash and packed array that is still held by the index.
+// Nothing is released when the build lacks CMPH support.
+BlockHashIndex::~BlockHashIndex()
+{
+#ifdef HAVE_CMPH
+  for (size_t r = 0; r < m_hashes.size(); ++r) {
+    if (m_hashes[r]) cmph_destroy((cmph_t*) m_hashes[r]);
+  }
+
+  // delete on a null pointer is a no-op, so no explicit check is needed
+  for (size_t r = 0; r < m_arrays.size(); ++r) {
+    delete m_arrays[r];
+  }
+#endif
+}
+
+// Looks up `key` in the whole index.
+//
+// The landmarks hold the first key of every range, so the only range that can
+// contain `key` is the one preceding the first landmark greater than `key`.
+// Returns the global position of the key, or GetSize() when the key is not
+// found.
+size_t BlockHashIndex::GetHash(const char* key)
+{
+  const std::string keyStr(key);
+  const size_t following = std::distance(m_landmarks.begin(),
+                           std::upper_bound(m_landmarks.begin(), m_landmarks.end(), keyStr));
+
+  // Key sorts before the first landmark (or there are no ranges at all).
+  // Tested before subtracting, so no unsigned wrap-around sentinel is needed;
+  // the former `0ul - 1` comparison was only correct where unsigned long and
+  // size_t have the same width.
+  if (following == 0) return GetSize();
+
+  const size_t range = following - 1;
+  const size_t pos = GetHash(range, key);
+  if (pos == GetSize()) return GetSize();
+
+  // Each range spans 2^m_orderBits slots; shift in size_t, not unsigned long.
+  return (static_cast<size_t>(1) << m_orderBits) * range + pos;
+}
+
+// Computes the fingerprint of `key`: the MurmurHash3 value (seed 100000)
+// reduced to its lowest m_fingerPrintBits bits.
+size_t BlockHashIndex::GetFprint(const char* key) const
+{
+  // Zero-initialise: the reference MurmurHash3_x86_32 stores a 32-bit result,
+  // so on platforms with a wider size_t the remaining bytes would otherwise be
+  // indeterminate.
+  size_t hash = 0;
+  MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash);
+  // Build the mask in size_t so the shift does not depend on the width of
+  // unsigned long.
+  hash &= (static_cast<size_t>(1) << m_fingerPrintBits) - 1;
+  return hash;
+}
+
+// Looks up `key` inside range `i`.
+//
+// The perfect hash maps the key to a slot; the slot stores an (order,
+// fingerprint) pair. The order is returned only when the stored fingerprint
+// equals the fingerprint of `key`, otherwise GetSize() is returned.
+// Locking and on-demand LoadRange(i) are disabled in this version, so range
+// `i` must already be resident. Also stamps m_clocks[i] with the access time.
+size_t BlockHashIndex::GetHash(size_t i, const char* key)
+{
+#ifdef HAVE_CMPH
+  const size_t slot = cmph_search((cmph_t*) m_hashes[i], key,
+                                  (cmph_uint32) strlen(key));
+#else
+  assert(0);
+  const size_t slot = 0;
+#endif
+
+  const std::pair<size_t, size_t> entry =
+    m_arrays[i]->Get(slot, m_orderBits, m_fingerPrintBits);
+  m_clocks[i] = clock();
+
+  return (GetFprint(key) == entry.second) ? entry.first : GetSize();
+}
+
+// std::string convenience overload; forwards to GetHash(const char*).
+size_t BlockHashIndex::GetHash(std::string key)
+{
+  const char* rawKey = key.c_str();
+  return GetHash(rawKey);
+}
+
+// Subscript lookup by std::string; same result as GetHash(std::string).
+size_t BlockHashIndex::operator[](std::string key)
+{
+  const size_t position = GetHash(key);
+  return position;
+}
+
+// Subscript lookup by C string; same result as GetHash(const char*).
+size_t BlockHashIndex::operator[](char* key)
+{
+  const size_t position = GetHash(key);
+  return position;
+}
+
+// Writes the complete index to the file `filename`.
+//
+// Returns the byte count reported by Save(std::FILE*).
+// Throws when the file cannot be opened; any exception raised while writing
+// is re-thrown after the file has been closed.
+size_t BlockHashIndex::Save(std::string filename)
+{
+  // The index is binary data, so open in binary mode.
+  std::FILE* mphf = std::fopen(filename.c_str(), "wb");
+  UTIL_THROW_IF2(!mphf, "Could not open " << filename << " for writing");
+
+  size_t size = 0;
+  try {
+    size = Save(mphf);
+  } catch (...) {
+    std::fclose(mphf);
+    throw;
+  }
+  std::fclose(mphf);
+  return size;
+}
+
+// Starts an incremental save into `mphf`.
+//
+// Writes the two bit widths, remembers the file position that follows them
+// (all later offsets are relative to it) and reserves one size_t slot that
+// FinalizeSave() overwrites with the relative position of the index tables.
+void BlockHashIndex::BeginSave(std::FILE * mphf)
+{
+  m_fileHandle = mphf;
+
+  ThrowingFwrite(&m_orderBits, sizeof(m_orderBits), 1, mphf);
+  ThrowingFwrite(&m_fingerPrintBits, sizeof(m_fingerPrintBits), 1, mphf);
+  m_fileHandleStart = std::ftell(mphf);
+
+  const size_t placeholder = 0;
+  ThrowingFwrite(&placeholder, sizeof(placeholder), 1, mphf);
+}
+
+// Appends range `i` (its CMPH hash followed by its packed array) at the
+// current file position and records that position, relative to
+// m_fileHandleStart, in m_seekIndex[i]. Does nothing without CMPH support.
+void BlockHashIndex::SaveRange(size_t i)
+{
+#ifdef HAVE_CMPH
+  if (i >= m_seekIndex.size()) {
+    m_seekIndex.resize(i + 1);
+  }
+
+  const size_t offset = std::ftell(m_fileHandle) - m_fileHandleStart;
+  m_seekIndex[i] = offset;
+
+  cmph_dump((cmph_t*) m_hashes[i], m_fileHandle);
+  m_arrays[i]->Save(m_fileHandle);
+#endif
+}
+
+// Saves every finished range that directly continues the sequence already
+// written. m_queue stores negated range ids, so its top is the smallest
+// pending id; saving stops at the first gap.
+void BlockHashIndex::SaveLastRange()
+{
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_mutex);
+#endif
+
+  while (!m_queue.empty()) {
+    const int next = -m_queue.top();
+    if (next != m_lastSaved + 1) break;
+
+    m_queue.pop();
+    SaveRange(next);
+    m_lastSaved = next;
+  }
+}
+
+// Releases the in-memory data of range `i`: destroys its CMPH hash, deletes
+// its packed array and clears its clock stamp. Does nothing without CMPH
+// support.
+void BlockHashIndex::DropRange(size_t i)
+{
+#ifdef HAVE_CMPH
+  bool released = false;
+
+  if (m_hashes[i] != 0) {
+    cmph_destroy((cmph_t*) m_hashes[i]);
+    m_hashes[i] = 0;
+    released = true;
+  }
+  if (m_arrays[i] != 0) {
+    delete m_arrays[i];
+    m_arrays[i] = 0;
+    m_clocks[i] = 0;
+    released = true;
+  }
+
+  // Only count a drop when something was actually released, and never let the
+  // unsigned counter wrap: ranges created by CalcHash() are not counted by
+  // LoadRange(), so the counter may already be zero here.
+  if (released && m_numLoadedRanges > 0) m_numLoadedRanges--;
+#endif
+}
+
+// Drops, in ascending order, every range that has been saved but not yet
+// dropped, so that m_lastDropped catches up with m_lastSaved.
+void BlockHashIndex::DropLastRange()
+{
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_mutex);
+#endif
+
+  while (m_lastDropped != m_lastSaved) {
+    m_lastDropped += 1;
+    DropRange(m_lastDropped);
+  }
+}
+
+#ifdef WITH_THREADS
+void BlockHashIndex::WaitAll() // Stops the thread pool via Stop(true).
+{
+ m_threadPool.Stop(true); // NOTE(review): presumably `true` lets queued hash tasks finish first - confirm against ThreadPool
+}
+#endif
+
+// Completes a save started with BeginSave().
+//
+// Stops the thread pool (threaded builds), writes any pending ranges, patches
+// the slot reserved by BeginSave() with the relative position of the index
+// tables, then appends the landmarks, the seek index and the key count.
+// Returns the number of bytes between the start of the header (the two bit
+// widths) and the final file position.
+size_t BlockHashIndex::FinalizeSave()
+{
+#ifdef WITH_THREADS
+  m_threadPool.Stop(true);
+#endif
+
+  SaveLastRange();
+
+  size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
+
+  // Go back and fill in the slot reserved by BeginSave().
+  std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET);
+  ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
+
+  // Return to the end of the range data and append the index tables.
+  std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
+  m_landmarks.save(m_fileHandle);
+
+  size_t seekIndexSize = m_seekIndex.size();
+  ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
+  // Indexing element 0 of an empty vector is undefined, so skip the payload
+  // when there are no ranges; no bytes would be written for it anyway.
+  if (seekIndexSize > 0) {
+    ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
+  }
+
+  ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
+
+  size_t fileHandleStop = std::ftell(m_fileHandle);
+  return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
+         + sizeof(m_fingerPrintBits);
+}
+
+// Writes the whole index to `mphf` in one go: clears the pending-range queue,
+// writes the header, saves every range in order and finalizes the file.
+// Returns the byte count reported by FinalizeSave().
+size_t BlockHashIndex::Save(std::FILE * mphf)
+{
+  const std::priority_queue<int> emptyQueue;
+  m_queue = emptyQueue;
+
+  BeginSave(mphf);
+
+  size_t range = 0;
+  while (range < m_hashes.size()) {
+    SaveRange(range);
+    ++range;
+  }
+
+  return FinalizeSave();
+}
+
+// Reads the index tables (but no range data) from `mphf`.
+//
+// Reads the two bit widths and the relative position of the tables, seeks
+// there, then loads the landmarks, the seek index and the key count. The
+// per-range vectors are sized to match the seek index and left empty.
+// Returns the distance between the file position on entry and on exit.
+// Throws when any of the fixed-size fields cannot be read completely.
+size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
+{
+  m_fileHandle = mphf;
+
+  size_t beginning = std::ftell(mphf);
+
+  size_t read = 0;
+  read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf);
+  read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
+  m_fileHandleStart = std::ftell(m_fileHandle);
+
+  size_t relIndexPos = 0;
+  read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf);
+  // Check before seeking: a short read would leave relIndexPos meaningless.
+  UTIL_THROW_IF2(read != 3, "Could not read block hash index header");
+  std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
+
+  m_landmarks.load(mphf);
+
+  size_t seekIndexSize = 0;
+  UTIL_THROW_IF2(
+    std::fread(&seekIndexSize, sizeof(size_t), 1, m_fileHandle) != 1,
+    "Could not read block hash seek index size");
+  m_seekIndex.resize(seekIndexSize);
+  // Indexing element 0 of an empty vector is undefined, so only read the
+  // payload when there is one.
+  if (seekIndexSize > 0) {
+    UTIL_THROW_IF2(
+      std::fread(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle)
+      != seekIndexSize,
+      "Could not read block hash seek index");
+  }
+  m_hashes.resize(seekIndexSize, 0);
+  m_clocks.resize(seekIndexSize, 0);
+  m_arrays.resize(seekIndexSize, 0);
+
+  UTIL_THROW_IF2(std::fread(&m_size, sizeof(size_t), 1, m_fileHandle) != 1,
+                 "Could not read block hash index size");
+
+  size_t end = std::ftell(mphf);
+
+  return end - beginning;
+}
+
+// Loads range `i` from the index file: seeks to its recorded offset, reads
+// the CMPH hash and the packed array, stamps the load time and increments the
+// loaded-range counter. Does nothing without CMPH support.
+void BlockHashIndex::LoadRange(size_t i)
+{
+#ifdef HAVE_CMPH
+  const size_t absolutePos = m_fileHandleStart + m_seekIndex[i];
+  std::fseek(m_fileHandle, absolutePos, SEEK_SET);
+
+  cmph_t* loadedHash = cmph_load(m_fileHandle);
+
+  PairedPackedArray<>* packed =
+    new PairedPackedArray<>(0, m_orderBits, m_fingerPrintBits);
+  m_arrays[i] = packed;
+  packed->Load(m_fileHandle);
+
+  m_hashes[i] = (void*) loadedHash;
+  m_clocks[i] = clock();
+
+  ++m_numLoadedRanges;
+#endif
+}
+
+// Loads the complete index from the file `filename`.
+//
+// Returns the byte count reported by Load(std::FILE*).
+// Throws when the file cannot be opened; any exception raised while reading
+// is re-thrown after the file has been closed.
+size_t BlockHashIndex::Load(std::string filename)
+{
+  // The index is binary data, so open in binary mode.
+  std::FILE* mphf = std::fopen(filename.c_str(), "rb");
+  UTIL_THROW_IF2(!mphf, "Could not open " << filename << " for reading");
+
+  size_t size = 0;
+  try {
+    size = Load(mphf);
+  } catch (...) {
+    std::fclose(mphf);
+    throw;
+  }
+  std::fclose(mphf);
+  return size;
+}
+
+// Loads the index tables and then every range from `mphf`.
+// The file position is restored to where LoadIndex() left it, and the value
+// returned by LoadIndex() is passed through.
+size_t BlockHashIndex::Load(std::FILE * mphf)
+{
+  const size_t indexBytes = LoadIndex(mphf);
+  const size_t afterIndex = std::ftell(mphf);
+
+  size_t range = 0;
+  while (range < m_seekIndex.size()) {
+    LoadRange(range);
+    ++range;
+  }
+
+  std::fseek(m_fileHandle, afterIndex, SEEK_SET);
+  return indexBytes;
+}
+
+size_t BlockHashIndex::GetSize() const // Number of keys in the index; the GetHash overloads also return this value to signal "not found".
+{
+ return m_size;
+}
+
+void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance) // Currently a no-op: the implementation below is commented out, so both parameters are ignored.
+{
+ /*
+ Disabled implementation, kept for reference: once more than
+ n * (1 + tolerance) ranges are loaded (n = number of ranges * ratio), it
+ sorts the loaded ranges by their clock stamp and drops part of them.
+
+ #ifdef WITH_THREADS
+ boost::mutex::scoped_lock lock(m_mutex);
+ #endif
+ size_t n = m_hashes.size() * ratio;
+ size_t max = n * (1 + tolerance);
+ if(m_numLoadedRanges > max) {
+ typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
+ LastLoaded lastLoaded;
+ for(size_t i = 0; i < m_hashes.size(); i++)
+ if(m_hashes[i] != 0)
+ lastLoaded.push_back(std::make_pair(m_clocks[i], i));
+
+ std::sort(lastLoaded.begin(), lastLoaded.end());
+ for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
+ it != lastLoaded.rend(); it++)
+ DropRange(it->second);
+ }*/
+}
+
+void BlockHashIndex::CalcHash(size_t current, void* source_void) // Builds the perfect hash and the (order, fingerprint) array for range `current` from a CMPH key source, then registers both under the mutex.
+{
+#ifdef HAVE_CMPH
+ cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void; // source_void is a type-erased cmph_io_adapter_t* (see the vectorAdapter overloads)
+ cmph_config_t *config = cmph_config_new(source);
+ cmph_config_set_algo(config, CMPH_CHD);
+
+ cmph_t* hash = cmph_new(config);
+ PairedPackedArray<> *pv = new PairedPackedArray<>(source->nkeys, m_orderBits,
+ m_fingerPrintBits);
+
+ size_t i = 0; // position of the current key within the (sorted) range
+
+ source->rewind(source->data); // restart the key source so the keys can be read again
+
+ std::string lastKey = "";
+ while (i < source->nkeys) {
+ unsigned keylen;
+ char* key;
+ source->read(source->data, &key, &keylen);
+ std::string temp(key, keylen); // copy the key before handing the buffer back to the adapter
+ source->dispose(source->data, key, keylen);
+
+ if (lastKey > temp) { // keys must arrive in ascending byte order
+ if (source->nkeys != 2 || temp != "###DUMMY_KEY###") { // the dummy key appended by AddRange() to single-key ranges is exempt
+ util::StringStream strme;
+ strme
+ << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n";
+ strme << "1: " << lastKey << "\n";
+ strme << "2: " << temp << "\n";
+ UTIL_THROW2(strme.str()); // NOTE(review): hash, pv and config look leaked on this path - confirm
+ }
+ }
+ lastKey = temp;
+
+ size_t fprint = GetFprint(temp.c_str());
+ size_t idx = cmph_search(hash, temp.c_str(), (cmph_uint32) temp.size()); // slot assigned to this key by the perfect hash
+
+ pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits); // store the key's order and fingerprint at its slot
+ i++;
+ }
+
+ cmph_config_destroy(config); // NOTE(review): `source` itself is not released in this function - confirm who owns it
+
+#ifdef WITH_THREADS
+ boost::mutex::scoped_lock lock(m_mutex); // everything below touches members shared with other hash tasks
+#endif
+
+ if (m_hashes.size() <= current) { // grow the per-range tables so slot `current` exists
+ m_hashes.resize(current + 1, 0);
+ m_arrays.resize(current + 1, 0);
+ m_clocks.resize(current + 1, 0);
+ }
+
+ m_hashes[current] = (void*) hash;
+ m_arrays[current] = pv;
+ m_clocks[current] = clock();
+ m_queue.push(-current); // ids are negated so the max-heap yields the smallest pending id first (see SaveLastRange)
+#endif
+}
+
+#ifdef HAVE_CMPH
+// Wraps a vector of strings in a CMPH key source and returns it type-erased.
+void* BlockHashIndex::vectorAdapter(std::vector<std::string>& v)
+{
+  cmph_io_adapter_t* adapter = CmphVectorAdapter(v);
+  return (void*) adapter;
+}
+
+// Wraps a std::allocator-backed StringVector in a CMPH key source and returns
+// it type-erased.
+void* BlockHashIndex::vectorAdapter(
+  StringVector<unsigned, size_t, std::allocator>& sv)
+{
+  void* adapter = (void*) CmphStringVectorAdapter(sv);
+  return adapter;
+}
+
+// Wraps a MmapAllocator-backed StringVector in a CMPH key source and returns
+// it type-erased.
+void* BlockHashIndex::vectorAdapter(
+  StringVector<unsigned, size_t, MmapAllocator>& sv)
+{
+  void* adapter = (void*) CmphStringVectorAdapter(sv);
+  return adapter;
+}
+#endif
+
+}
diff --git a/moses2/TranslationModel/CompactPT/BlockHashIndex.h b/moses2/TranslationModel/CompactPT/BlockHashIndex.h
new file mode 100644
index 000000000..b91ef8f6c
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/BlockHashIndex.h
@@ -0,0 +1,200 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_BlockHashIndex_h
+#define moses_BlockHashIndex_h
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <queue>
+#include <cstring>
+#include <cstdio>
+#include <boost/thread.hpp>
+
+#include "MurmurHash3.h"
+#include "StringVector.h"
+#include "PackedArray.h"
+#include "util/exception.hh"
+#include "util/string_stream.hh"
+
+#ifdef WITH_THREADS
+#include "../../legacy/ThreadPool.h"
+#else
+#include <ctime>
+#endif
+
+#include <boost/shared_ptr.hpp>
+
+namespace Moses2
+{
+
+class BlockHashIndex // Key -> position index split into ranges; each range has its own CMPH perfect hash plus a packed (order, fingerprint) array.
+{
+private:
+ std::priority_queue<int> m_queue; // negated ids of ranges whose hash is built but not yet saved
+
+ size_t m_orderBits; // bit width of the per-key order stored in the packed arrays
+ size_t m_fingerPrintBits; // bit width of the per-key fingerprint
+
+ std::FILE* m_fileHandle; // file currently used for saving or loading (not opened or closed by the FILE* overloads)
+ size_t m_fileHandleStart; // file position right after the two bit-width fields; range offsets are relative to it
+
+ StringVector<unsigned char, unsigned long> m_landmarks; // first key of every range, in insertion order
+
+ std::vector<void*> m_hashes; // per range: cmph_t* stored as void*, 0 when not resident
+ std::vector<clock_t> m_clocks; // per range: clock() value of the last build, load or lookup
+ std::vector<PairedPackedArray<>*> m_arrays; // per range: (order, fingerprint) pairs indexed by hash slot, 0 when not resident
+
+ std::vector<size_t> m_seekIndex; // per range: file offset relative to m_fileHandleStart
+
+ size_t m_size; // total number of keys added through AddRange()
+ int m_lastSaved; // id of the last range written to file, -1 before the first
+ int m_lastDropped; // id of the last range released after saving, -1 before the first
+ size_t m_numLoadedRanges; // incremented by LoadRange(), decremented by DropRange()
+
+#ifdef WITH_THREADS
+ ThreadPool m_threadPool;
+ boost::mutex m_mutex;
+
+ template<typename Keys>
+ class HashTask: public Task // Thread-pool task that builds the hash of one range from its own copy of the keys.
+ {
+ public:
+ HashTask(int id, BlockHashIndex& hash, Keys& keys) :
+ m_id(id), m_hash(hash), m_keys(new Keys(keys)) // the keys are copied, so the caller's container may change afterwards
+ {
+ }
+
+ virtual void Run()
+ {
+ m_hash.CalcHash(m_id, *m_keys);
+ }
+
+ virtual ~HashTask()
+ {
+ delete m_keys;
+ }
+
+ private:
+ int m_id;
+ BlockHashIndex& m_hash;
+ Keys* m_keys;
+ };
+#endif
+
+ size_t GetFprint(const char* key) const; // fingerprint of key, limited to m_fingerPrintBits bits
+ size_t GetHash(size_t i, const char* key); // lookup of key inside range i
+
+public:
+#ifdef WITH_THREADS
+ BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
+ size_t threadsNum = 2);
+#else
+ BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
+#endif
+
+ ~BlockHashIndex();
+
+ size_t GetHash(const char* key); // position of key, or GetSize() when it is not found
+ size_t GetHash(std::string key);
+
+ size_t operator[](std::string key);
+ size_t operator[](char* key);
+
+ void BeginSave(std::FILE* mphf); // incremental save: BeginSave, then SaveRange/SaveLastRange, then FinalizeSave
+ void SaveRange(size_t i);
+ void SaveLastRange();
+ size_t FinalizeSave();
+
+#ifdef WITH_THREADS
+ void WaitAll();
+#endif
+
+ void DropRange(size_t i);
+ void DropLastRange();
+
+ size_t LoadIndex(std::FILE* mphf); // reads the tables only; ranges are read by LoadRange()
+ void LoadRange(size_t i);
+
+ size_t Save(std::string filename);
+ size_t Save(std::FILE * mphf);
+
+ size_t Load(std::string filename);
+ size_t Load(std::FILE * mphf);
+
+ size_t GetSize() const;
+
+ void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
+
+ template<typename Keys>
+ void AddRange(Keys &keys) // Adds one range of sorted keys; its first key must be greater than the previous range's first key, otherwise this throws.
+ {
+ size_t current = m_landmarks.size(); // id of the new range
+
+ if (m_landmarks.size() && m_landmarks.back().str() >= keys[0]) {
+ util::StringStream strme;
+ strme
+ << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n";
+ strme << "1: " << m_landmarks.back().str() << "\n";
+ strme << "2: " << keys[0] << "\n";
+ UTIL_THROW2(strme.str());
+ }
+
+ m_landmarks.push_back(keys[0]);
+ m_size += keys.size(); // counted before the dummy key below is appended
+
+ if (keys.size() == 1) {
+ // add dummy key to avoid null hash
+ keys.push_back("###DUMMY_KEY###");
+ }
+
+#ifdef WITH_THREADS
+// Threaded build: the task copies the keys and computes the hash on the pool.
+ boost::shared_ptr<HashTask<Keys> > ht(
+ new HashTask<Keys>(current, *this, keys));
+ m_threadPool.Submit(ht);
+#else
+ CalcHash(current, keys);
+#endif
+ }
+
+ template<typename Keys>
+ void CalcHash(size_t current, Keys &keys) // Wraps keys in a CMPH key source and builds the hash of range `current`; does nothing without CMPH support.
+ {
+#ifdef HAVE_CMPH
+ void* source = vectorAdapter(keys);
+ CalcHash(current, source);
+#endif
+ }
+
+ void CalcHash(size_t current, void* source); // source is a type-erased cmph_io_adapter_t*
+
+#ifdef HAVE_CMPH
+ void* vectorAdapter(std::vector<std::string>& v);
+ void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
+ void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
+#endif
+};
+
+}
+#endif
diff --git a/moses2/TranslationModel/CompactPT/CanonicalHuffman.h b/moses2/TranslationModel/CompactPT/CanonicalHuffman.h
new file mode 100644
index 000000000..ffb6488c0
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/CanonicalHuffman.h
@@ -0,0 +1,345 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_CanonicalHuffman_h
+#define moses_CanonicalHuffman_h
+
+#include <string>
+#include <algorithm>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "ThrowingFwrite.h"
+
+namespace Moses2
+{
+
+template<typename Data> // Data is the symbol type being encoded
+class CanonicalHuffman // Canonical Huffman coder: built from (symbol, frequency) pairs or loaded from file; encodes to and decodes from a bit stream.
+{
+private:
+ std::vector<Data> m_symbols; // symbols; after CalcCodes() they are grouped by code length in code order
+ std::vector<size_t> m_firstCodes; // indexed by code length: integer value of the first code of that length
+ std::vector<size_t> m_lengthIndex; // indexed by code length: index in m_symbols of the first symbol of that length
+
+ typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
+ EncodeMap m_encodeMap; // symbol -> code bits; only filled when CreateCodeMap() has been called
+
+ struct MinHeapSorter // Orders indices by the value they refer to in m_vec; `>` makes the std heap functions keep the smallest value on top.
+ {
+ std::vector<size_t>& m_vec;
+
+ MinHeapSorter(std::vector<size_t>& vec) :
+ m_vec(vec)
+ {
+ }
+
+ bool operator()(size_t a, size_t b)
+ {
+ return m_vec[a] > m_vec[b];
+ }
+ };
+
+ template<class Iterator>
+ void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths) // Fills m_symbols from it->first and computes one code length per symbol from the frequencies in it->second.
+ {
+ size_t n = std::distance(begin, end);
+ std::vector<size_t> A(2 * n, 0); // first half: heap of indices into A; second half: frequencies, later parent links and depths
+
+ m_symbols.resize(n);
+ size_t i = 0;
+ for (Iterator it = begin; it != end; it++) {
+ m_symbols[i] = it->first;
+
+ A[i] = n + i;
+ A[n + i] = it->second;
+ i++;
+ }
+
+ if (n == 1) { // a single symbol gets a one-bit code
+ lengths.push_back(1);
+ return;
+ }
+
+ MinHeapSorter hs(A);
+ std::make_heap(A.begin(), A.begin() + n, hs);
+
+ size_t h = n;
+ size_t m1, m2;
+ while (h > 1) { // repeatedly merge the two lightest nodes until one root is left
+ m1 = A[0];
+ std::pop_heap(A.begin(), A.begin() + h, hs);
+
+ h--;
+
+ m2 = A[0];
+ std::pop_heap(A.begin(), A.begin() + h, hs);
+
+ A[h] = A[m1] + A[m2]; // weight of the merged node
+ A[h - 1] = h; // heap entry pointing at the merged node
+ A[m1] = A[m2] = h; // both children now store the index of their parent
+
+ std::push_heap(A.begin(), A.begin() + h, hs);
+ }
+
+ A[1] = 0; // the root has depth 0
+ for (size_t i = 2; i < 2 * n; i++)
+ A[i] = A[A[i]] + 1; // depth = depth of parent + 1
+
+ lengths.resize(n);
+ for (size_t i = 0; i < n; i++)
+ lengths[i] = A[i + n]; // the leaves live in the second half of A
+ }
+
+ void CalcCodes(std::vector<size_t>& lengths) // Derives m_lengthIndex and m_firstCodes from the code lengths and reorders m_symbols to match.
+ {
+ std::vector<size_t> numLength; // numLength[l] = number of symbols with code length l
+ for (std::vector<size_t>::iterator it = lengths.begin();
+ it != lengths.end(); it++) {
+ size_t length = *it;
+ if (numLength.size() <= length) numLength.resize(length + 1, 0);
+ numLength[length]++;
+ }
+
+ m_lengthIndex.resize(numLength.size());
+ m_lengthIndex[0] = 0;
+ for (size_t l = 1; l < numLength.size(); l++)
+ m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1]; // running count of symbols with shorter codes
+
+ size_t maxLength = numLength.size() - 1;
+
+ m_firstCodes.resize(maxLength + 1, 0);
+ for (size_t l = maxLength - 1; l > 0; l--) // from the second-longest length down to 1; the longest length starts at code 0
+ m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
+
+ std::vector<Data> t_symbols;
+ t_symbols.resize(lengths.size());
+
+ std::vector<size_t> nextCode = m_firstCodes; // next unassigned code per length
+ for (size_t i = 0; i < lengths.size(); i++) {
+ Data data = m_symbols[i];
+ size_t length = lengths[i];
+
+ size_t pos = m_lengthIndex[length]
+ + (nextCode[length] - m_firstCodes[length]);
+ t_symbols[pos] = data;
+
+ nextCode[length] = nextCode[length] + 1;
+ }
+
+ m_symbols.swap(t_symbols);
+ }
+
+ void CreateCodeMap() // Fills m_encodeMap: symbols of one length get consecutive codes starting at m_firstCodes[length].
+ {
+ for (size_t l = 1; l < m_lengthIndex.size(); l++) {
+ size_t intCode = m_firstCodes[l];
+ size_t num = (
+ (l + 1 < m_lengthIndex.size()) ?
+ m_lengthIndex[l + 1] : m_symbols.size()) - m_lengthIndex[l]; // number of symbols with code length l
+
+ for (size_t i = 0; i < num; i++) {
+ Data data = m_symbols[m_lengthIndex[l] + i];
+ boost::dynamic_bitset<> bitCode(l, intCode);
+ m_encodeMap[data] = bitCode;
+ intCode++;
+ }
+ }
+ }
+
+ const boost::dynamic_bitset<>& Encode(Data data) const // Returns the code of data; throws when the symbol is not in the encoding map.
+ {
+ typename EncodeMap::const_iterator it = m_encodeMap.find(data);
+ UTIL_THROW_IF2(it == m_encodeMap.end(),
+ "Cannot find symbol in encoding map");
+ return it->second;
+ }
+
+ template<class BitWrapper>
+ void PutCode(BitWrapper& bitWrapper, const boost::dynamic_bitset<>& code) // Writes the code from its highest bit index down to bit 0.
+ {
+ for (int j = code.size() - 1; j >= 0; j--)
+ bitWrapper.Put(code[j]);
+ }
+
+public:
+
+ template<class Iterator>
+ CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) // Builds the code from (symbol, frequency) pairs; the encoding map is built only when forEncoding is true.
+ {
+ std::vector<size_t> lengths;
+ CalcLengths(begin, end, lengths);
+ CalcCodes(lengths);
+
+ if (forEncoding) CreateCodeMap();
+ }
+
+ CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) // Loads a previously saved code; the encoding map is built only when forEncoding is true.
+ {
+ Load(pFile);
+
+ if (forEncoding) CreateCodeMap();
+ }
+
+ template<class BitWrapper>
+ void Put(BitWrapper& bitWrapper, Data data) // Appends the code of data to the bit stream; throws for unknown symbols (see Encode).
+ {
+ PutCode(bitWrapper, Encode(data));
+ }
+
+ template<class BitWrapper>
+ Data Read(BitWrapper& bitWrapper) // Decodes one symbol; returns Data() when TellFromEnd() reports no bits left.
+ {
+ if (bitWrapper.TellFromEnd()) {
+ size_t intCode = bitWrapper.Read();
+ size_t len = 1;
+ while (intCode < m_firstCodes[len]) { // append bits until the value reaches the first code of the current length
+ intCode = 2 * intCode + bitWrapper.Read();
+ len++;
+ }
+ return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])];
+ }
+ return Data();
+ }
+
+ size_t Load(std::FILE* pFile) // Reads three size-prefixed arrays (symbols, first codes, length index); returns the number of bytes consumed.
+ {
+ size_t start = std::ftell(pFile);
+ size_t read = 0; // accumulated fread results; not checked
+
+ size_t size;
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_symbols.resize(size);
+ read += std::fread(&m_symbols[0], sizeof(Data), size, pFile);
+
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_firstCodes.resize(size);
+ read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile);
+
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_lengthIndex.resize(size);
+ read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
+
+ return std::ftell(pFile) - start;
+ }
+
+ size_t Save(std::FILE* pFile) // Writes the three arrays in the layout expected by Load(); returns the number of bytes written.
+ {
+ size_t start = std::ftell(pFile);
+
+ size_t size = m_symbols.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile);
+
+ size = m_firstCodes.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile);
+
+ size = m_lengthIndex.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
+
+ return std::ftell(pFile) - start;
+ }
+};
+
+/**
+ * Bit-level cursor over a container of integral values.
+ *
+ * Bits are written and read starting with the least significant bit of each
+ * element. The wrapper keeps a reference to the container, so the container
+ * must outlive it. One position (m_bitPos) is shared by reading and writing.
+ */
+template<class Container = std::string>
+class BitWrapper
+{
+private:
+  Container& m_data;
+
+  typename Container::iterator m_iterator;       // next element to load for reading
+  typename Container::value_type m_currentValue; // element being read, shifted so the current bit is bit 0
+
+  size_t m_valueBits;                            // number of bits per container element
+  typename Container::value_type m_mask;         // always 1: selects the lowest bit
+  size_t m_bitPos;                               // number of bits read or written so far
+
+public:
+
+  /** Wraps `data` and positions the cursor at bit 0. */
+  BitWrapper(Container &data) :
+    m_data(data), m_iterator(m_data.begin()), m_currentValue(0),
+    m_valueBits(sizeof(typename Container::value_type) * 8), m_mask(1),
+    m_bitPos(0)
+  {
+  }
+
+  /**
+   * Returns the bit at the current position and advances by one bit.
+   * When the container is exhausted no new element is loaded and the result
+   * comes from the stale current value; use TellFromEnd() to avoid that.
+   */
+  bool Read()
+  {
+    if (m_bitPos % m_valueBits == 0) {
+      if (m_iterator != m_data.end()) m_currentValue = *m_iterator++;
+    }
+    else m_currentValue = m_currentValue >> 1;
+
+    m_bitPos++;
+    return (m_currentValue & m_mask);
+  }
+
+  /** Appends one bit, adding a new zero element whenever the last one is full. */
+  void Put(bool bit)
+  {
+    if (m_bitPos % m_valueBits == 0) m_data.push_back(0);
+
+    if (bit) m_data[m_data.size() - 1] |= m_mask << (m_bitPos % m_valueBits);
+
+    m_bitPos++;
+  }
+
+  /** Current position in bits from the start. */
+  size_t Tell()
+  {
+    return m_bitPos;
+  }
+
+  /** Number of bits between the current position and the end of the container (0 when past the end). */
+  size_t TellFromEnd()
+  {
+    if (m_data.size() * m_valueBits < m_bitPos) return 0;
+    return m_data.size() * m_valueBits - m_bitPos;
+  }
+
+  /**
+   * Moves the read cursor so that the next Read() returns bit `bitPos`.
+   * `bitPos` must not exceed the number of bits stored in the container.
+   */
+  void Seek(size_t bitPos)
+  {
+    // Position 0 has no preceding bit: the general formula below would compute
+    // (0 - 1) on an unsigned value and step before begin(). Start over instead.
+    if (bitPos == 0) {
+      Reset();
+      return;
+    }
+
+    m_bitPos = bitPos;
+    // Load the element holding bit (bitPos - 1) and shift it so that bit sits
+    // at bit 0; the next Read() then shifts once more or loads the next element.
+    m_iterator = m_data.begin()
+                 + static_cast<typename Container::difference_type>(
+                   (m_bitPos - 1) / m_valueBits);
+    m_currentValue = (*m_iterator) >> ((m_bitPos - 1) % m_valueBits);
+    m_iterator++;
+  }
+
+  /** Seeks to the position `bitPosFromEnd` bits before the end of the container. */
+  void SeekFromEnd(size_t bitPosFromEnd)
+  {
+    size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd;
+    Seek(bitPos);
+  }
+
+  /** Rewinds the cursor to bit 0. */
+  void Reset()
+  {
+    m_iterator = m_data.begin();
+    m_currentValue = 0;
+    m_bitPos = 0;
+  }
+
+  /** The wrapped container. */
+  Container& GetContainer()
+  {
+    return m_data;
+  }
+};
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp b/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
new file mode 100644
index 000000000..a51dc5a45
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
@@ -0,0 +1,95 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifdef HAVE_CMPH
+
+#include "CmphStringVectorAdapter.h"
+
+namespace Moses2
+{
+
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) // Dispose callback: frees a key buffer with delete[]; data and keylen are unused.
+{
+ delete[] key;
+}
+
+// Rewind callback: resets the cursor so that the next read returns key 0.
+void CmphStringVectorAdapterRewind(void *data)
+{
+  static_cast<cmph_vector_t*>(data)->position = 0;
+}
+
+//************************************************************************//
+
+// Allocates a CMPH key source over `v` together with its cursor.
+// Only `data` and `nkeys` are set here; the read/dispose/rewind callbacks are
+// filled in by CmphVectorAdapter(). The adapter keeps a pointer to `v`, so the
+// vector must outlive it.
+// NOTE(review): allocation failure is only caught by assert.
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
+{
+  cmph_io_adapter_t* adapter =
+    static_cast<cmph_io_adapter_t*>(malloc(sizeof(cmph_io_adapter_t)));
+  cmph_vector_t* cursor =
+    static_cast<cmph_vector_t*>(malloc(sizeof(cmph_vector_t)));
+  assert(adapter);
+  assert(cursor);
+
+  cursor->position = 0;
+  cursor->vector = static_cast<void*>(&v);
+
+  adapter->nkeys = v.size();
+  adapter->data = static_cast<void*>(cursor);
+  return adapter;
+}
+
+// Read callback: hands out the key at the cursor position and advances the
+// cursor.
+//
+// *key receives a new[]-allocated, NUL-terminated copy of the key (released by
+// the dispose callback) and *keylen its length in bytes, which is also the
+// return value. The caller must not read past the last key.
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+  cmph_vector_t *cmph_vector = (cmph_vector_t *) data;
+  std::vector<std::string>* v = (std::vector<std::string>*) cmph_vector->vector;
+
+  // Bind by reference: no temporary copy of the key is needed.
+  const std::string& current = (*v)[cmph_vector->position];
+  const size_t size = current.size();
+
+  *keylen = (cmph_uint32) size;
+  *key = new char[size + 1];
+  // Copy exactly `size` bytes. strcpy would stop at an embedded NUL and leave
+  // the rest of the buffer uninitialised although *keylen covers it.
+  memcpy(*key, current.data(), size);
+  (*key)[size] = '\0';
+
+  cmph_vector->position = cmph_vector->position + 1;
+  return (int) (*keylen);
+}
+
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) // Dispose callback: frees a key buffer with delete[] (CmphVectorAdapterRead allocates it with new[]); data and keylen are unused.
+{
+ delete[] key;
+}
+
+// Rewind callback: resets the cursor so that the next read returns key 0.
+void CmphVectorAdapterRewind(void *data)
+{
+  static_cast<cmph_vector_t*>(data)->position = 0;
+}
+
+// Creates a complete CMPH key source over a vector of strings: allocates the
+// adapter and installs the three vector callbacks.
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
+{
+  cmph_io_adapter_t* const adapter = CmphVectorAdapterNew(v);
+
+  adapter->rewind = CmphVectorAdapterRewind;
+  adapter->dispose = CmphVectorAdapterDispose;
+  adapter->read = CmphVectorAdapterRead;
+
+  return adapter;
+}
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h b/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h
new file mode 100644
index 000000000..20d43a80c
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/CmphStringVectorAdapter.h
@@ -0,0 +1,108 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_CmphStringVectorAdapterNew_h
+#define moses_CmphStringVectorAdapterNew_h
+
+#include <cassert>
+#include <cstring>
+
+#ifdef HAVE_CMPH
+#include "cmph.h"
+
+#include "StringVector.h"
+
+namespace Moses2
+{
+
+typedef struct
+{
+ void *vector; // type-erased pointer to the key container (a StringVector or a std::vector<std::string>)
+ cmph_uint32 position; // index of the next key handed out by the read callback
+} cmph_vector_t;
+
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+cmph_io_adapter_t *CmphStringVectorAdapterNew(
+ StringVector<ValueT, PosT, Allocator>& sv) // allocates an adapter over sv
+{
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *) malloc(
+ sizeof(cmph_io_adapter_t));
+ cmph_vector_t * cmph_vector = (cmph_vector_t *) malloc(sizeof(cmph_vector_t));
+ assert(key_source); // NOTE(review): allocation failure is only caught when assert is enabled
+ assert(cmph_vector);
+ // Only the address of sv is stored, so sv has to stay alive while the adapter is in use.
+ cmph_vector->vector = (void *) &sv;
+ cmph_vector->position = 0; // iteration cursor starts at the first key
+ key_source->data = (void *) cmph_vector;
+ key_source->nkeys = sv.size();
+ // read/dispose/rewind are not assigned here; CmphStringVectorAdapter() sets them.
+ return key_source;
+}
+
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) // cmph read callback
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *) data;
+ StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT,
+ Allocator>*) cmph_vector->vector;
+ size_t size;
+ *keylen = (*sv)[cmph_vector->position].size(); // length reported to cmph
+ size = *keylen;
+ *key = new char[size + 1]; // +1 for the terminating '\0' written by strcpy
+ std::string temp = (*sv)[cmph_vector->position];
+ std::strcpy(*key, temp.c_str()); // NOTE(review): stops at the first '\0' inside a key
+ cmph_vector->position = cmph_vector->position + 1; // advance to the next key
+ return (int) (*keylen);
+}
+
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
+
+void CmphStringVectorAdapterRewind(void *data);
+
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+cmph_io_adapter_t* CmphStringVectorAdapter(
+ StringVector<ValueT, PosT, Allocator>& sv) // adapter over sv with callbacks installed
+{
+ cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
+ // Install the callbacks; read is instantiated for this StringVector type.
+ key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
+ key_source->dispose = CmphStringVectorAdapterDispose;
+ key_source->rewind = CmphStringVectorAdapterRewind;
+ return key_source;
+}
+
+//************************************************************************//
+
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
+
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
+
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
+
+void CmphVectorAdapterRewind(void *data);
+
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
+
+}
+
+#endif
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
new file mode 100644
index 000000000..1d32b9a6f
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
@@ -0,0 +1,173 @@
+// -*- c++ -*-
+// vim:tabstop=2
+// $Id$
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "LexicalReorderingTableCompact.h"
+#include "../../SubPhrase.h"
+#include "../../legacy/Util2.h"
+
+namespace Moses2
+{
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+
+bool LexicalReorderingTableCompact::s_inMemoryByDefault = false;
+
+LexicalReorderingTableCompact::LexicalReorderingTableCompact(
+ const std::string& filePath, const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors) :
+ LexicalReorderingTable(f_factors, e_factors, c_factors), m_inMemory(
+ s_inMemoryByDefault), m_numScoreComponent(6), m_multipleScoreTrees(
+ true), m_hash(10, 16), m_scoreTrees(1)
+{
+ Load(filePath); // overwrites the score-component defaults above with the values read from the file
+}
+
+LexicalReorderingTableCompact::LexicalReorderingTableCompact(
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors) :
+ LexicalReorderingTable(f_factors, e_factors, c_factors), m_inMemory(
+ s_inMemoryByDefault), m_numScoreComponent(6), m_multipleScoreTrees(
+ true), m_hash(10, 16), m_scoreTrees(1)
+{ // no file is read by this overload; Load() can be called separately
+}
+
+LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
+{
+ for (std::vector<ScoreTree*>::iterator tree = m_scoreTrees.begin();
+ tree != m_scoreTrees.end(); ++tree) delete *tree; // frees the trees created with new in Load()
+}
+
+std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase<Moses2::Word>& f,
+ const Phrase<Moses2::Word>& e, const Phrase<Moses2::Word>& c)
+{
+ std::string key;
+ std::vector<float> scores;
+ const size_t notFound = m_hash.GetSize(); // m_hash[] yields GetSize() for an unknown key
+ if (0 == c.GetSize()) key = MakeKey(f, e, c);
+ else {
+ for (size_t i = 0; i <= c.GetSize(); ++i) {
+ SubPhrase<Moses2::Word> sub_c = c.GetSubPhrase(i, c.GetSize() - i);
+ key = MakeKey(f, e, sub_c);
+ if (notFound != m_hash[key]) break; // back off: keep the longest context suffix that is in the table
+ }
+ }
+ size_t index = m_hash[key];
+ if (m_hash.GetSize() != index) {
+ std::string scoresString;
+ if (m_inMemory) scoresString = m_scoresMemory[index].str();
+ else scoresString = m_scoresMapped[index].str();
+ // Decode one score per component from the Huffman-coded bit stream.
+ BitWrapper<> bitStream(scoresString);
+ for (size_t i = 0; i < m_numScoreComponent; i++)
+ scores.push_back(
+ m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
+
+ return scores;
+ }
+ // No key matched: an empty vector signals "no scores".
+ return std::vector<float>();
+}
+
+std::string LexicalReorderingTableCompact::MakeKey(const Phrase<Moses2::Word>& f,
+ const Phrase<Moses2::Word>& e, const Phrase<Moses2::Word>& c) const
+{ // renders each phrase with its factor list, trims it and delegates to the string overload
+ return MakeKey(Trim(f.GetString(m_FactorsF)), Trim(e.GetString(m_FactorsE)),
+ Trim(c.GetString(m_FactorsC)));
+}
+
+std::string LexicalReorderingTableCompact::MakeKey(const std::string& f,
+ const std::string& e, const std::string& c) const
+{
+ // Key layout: f, then e and c when their factor lists are non-empty, joined by " ||| " with a trailing " ||| ".
+ std::string key(f);
+ if (!m_FactorsE.empty()) {
+ if (!key.empty()) key.append(" ||| ");
+ key.append(e);
+ }
+ if (!m_FactorsC.empty()) {
+ if (!key.empty()) key.append(" ||| ");
+ key.append(c);
+ }
+ key.append(" ||| ");
+ return key;
+}
+
+LexicalReorderingTable*
+LexicalReorderingTableCompact::CheckAndLoad(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+{
+#ifdef HAVE_CMPH
+ std::string minlexr = ".minlexr";
+ // Case 1: the file name was given without the ".minlexr" suffix.
+ if (FileExists(filePath + minlexr)) {
+ // A compact binary version exists, so use it.
+ std::cerr << "Using compact lexical reordering table" << std::endl;
+ return new LexicalReorderingTableCompact(filePath + minlexr, f_factors,
+ e_factors, c_factors);
+ }
+ // Case 2: the name already ends in the suffix (length guard: a shorter path used to make substr throw).
+ if (filePath.length() >= minlexr.length() && filePath.compare(
+ filePath.length() - minlexr.length(), minlexr.length(), minlexr) == 0 && FileExists(filePath)) {
+ // A compact binary version exists, so use it.
+ std::cerr << "Using compact lexical reordering table" << std::endl;
+ return new LexicalReorderingTableCompact(filePath, f_factors, e_factors,
+ c_factors);
+ }
+#endif
+ return 0; // no compact table found (or built without HAVE_CMPH); the caller owns any returned table
+}
+
+void LexicalReorderingTableCompact::Load(std::string filePath)
+{
+ std::FILE* pFile = std::fopen(filePath.c_str(), "rb"); // binary data: never open in text mode
+ UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened");
+ // File layout as read below: hash index, score header, Huffman tree(s), score strings.
+ //if(m_inMemory)
+ m_hash.Load(pFile);
+ //else
+ //m_hash.LoadIndex(pFile);
+
+ size_t read = 0;
+ read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1,
+ pFile);
+ read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1,
+ pFile);
+ UTIL_THROW_IF2(read != 2, "File " << filePath << " is truncated or corrupt");
+ if (m_multipleScoreTrees) {
+ m_scoreTrees.resize(m_numScoreComponent);
+ for (size_t i = 0; i < m_numScoreComponent; i++)
+ m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
+ }
+ else {
+ m_scoreTrees.resize(1);
+ m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
+ }
+ // NOTE(review): pFile is not closed here; the mapped variant presumably still needs it - confirm.
+ if (m_inMemory) m_scoresMemory.load(pFile, false);
+ else m_scoresMapped.load(pFile, true);
+}
+
+}
diff --git a/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
new file mode 100644
index 000000000..90abf4197
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
@@ -0,0 +1,143 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_LexicalReorderingTableCompact_h
+#define moses_LexicalReorderingTableCompact_h
+
+#include "BlockHashIndex.h"
+#include "CanonicalHuffman.h"
+#include "StringVector.h"
+#include "../../TypeDef.h"
+#include "../../Phrase.h"
+
+namespace Moses2
+{
+
+//! additional types
+class LexicalReorderingTable // abstract base: stores three factor lists and declares the score lookup
+{
+public:
+ LexicalReorderingTable(const FactorList& f_factors,
+ const FactorList& e_factors, const FactorList& c_factors) :
+ m_FactorsF(f_factors), m_FactorsE(e_factors), m_FactorsC(c_factors)
+ {
+ }
+
+ virtual ~LexicalReorderingTable()
+ {
+ }
+
+public:
+ // Returns the scores stored for (f, e, c); presumably source, target and context phrase - confirm.
+ virtual std::vector<float>
+ GetScore(const Phrase<Moses2::Word>& f, const Phrase<Moses2::Word>& e, const Phrase<Moses2::Word>& c) = 0;
+
+ virtual
+ void InitializeForInput()
+ {
+ /* override for on-demand loading */
+ }
+ ;
+
+ virtual
+ void InitializeForInputPhrase(const Phrase<Moses2::Word>&) // no-op hook; subclasses may override
+ {
+ }
+ // Accessors for the factor lists given to the constructor.
+ const FactorList& GetFFactorMask() const
+ {
+ return m_FactorsF;
+ }
+ const FactorList& GetEFactorMask() const
+ {
+ return m_FactorsE;
+ }
+ const FactorList& GetCFactorMask() const
+ {
+ return m_FactorsC;
+ }
+
+ virtual
+ void DbgDump(std::ostream* out) const // default only prints a placeholder message
+ {
+ *out << "Overwrite in subclass...\n";
+ }
+ ;
+ // why is this not a pure virtual function? - UG
+
+protected:
+ FactorList m_FactorsF; // factors used when rendering f
+ FactorList m_FactorsE; // factors used when rendering e
+ FactorList m_FactorsC; // factors used when rendering c
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+class LexicalReorderingTableCompact: public LexicalReorderingTable // table backed by a hash index and Huffman-coded scores
+{
+private:
+ static bool s_inMemoryByDefault; // initial value of m_inMemory for new instances
+ bool m_inMemory; // true: scores are read into m_scoresMemory, false: m_scoresMapped is used
+
+ size_t m_numScoreComponent; // number of scores decoded per entry
+ bool m_multipleScoreTrees; // true: one Huffman tree per score component, false: a single shared tree
+
+ BlockHashIndex m_hash; // maps a key string to the index of its score string
+
+ typedef CanonicalHuffman<float> ScoreTree;
+ std::vector<ScoreTree*> m_scoreTrees; // owned; deleted in the destructor
+
+ StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
+ StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
+ // Build the lookup key used with m_hash.
+ std::string MakeKey(const Phrase<Moses2::Word>& f, const Phrase<Moses2::Word>& e, const Phrase<Moses2::Word>& c) const;
+ std::string MakeKey(const std::string& f, const std::string& e,
+ const std::string& c) const;
+
+public:
+ LexicalReorderingTableCompact(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors); // constructs and loads filePath
+
+ LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors); // constructs without loading a file
+
+ virtual
+ ~LexicalReorderingTableCompact();
+
+ virtual std::vector<float>
+ GetScore(const Phrase<Moses2::Word>& f, const Phrase<Moses2::Word>& e, const Phrase<Moses2::Word>& c);
+ // Returns a new table when a ".minlexr" file is found for filePath, otherwise 0.
+ static LexicalReorderingTable*
+ CheckAndLoad(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ void
+ Load(std::string filePath);
+
+};
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/ListCoders.h b/moses2/TranslationModel/CompactPT/ListCoders.h
new file mode 100644
index 000000000..5a01274d9
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/ListCoders.h
@@ -0,0 +1,394 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_ListCoders_h
+#define moses_ListCoders_h
+
+#include <cmath>
+#include <cassert>
+
+namespace Moses2
+{
+
+template<typename T = unsigned int>
+class VarIntType // variable-length integer codec; T is the storage unit of the encoded stream
+{
+private:
+ template<typename IntType, typename OutIt>
+ static void EncodeSymbol(IntType input, OutIt output) // writes input as a chain of T units
+ {
+ if (input == 0) {
+ *output = 0;
+ output++;
+ return;
+ }
+
+ T msb = 1 << (sizeof(T) * 8 - 1); // top bit of T, used as the "another unit follows" flag
+ IntType mask = ~msb; // selects the payload bits below the flag
+ IntType shift = (sizeof(T) * 8 - 1); // payload bits per unit
+
+ while (input) { // the least significant payload group is emitted first
+ T res = input & mask;
+ input >>= shift;
+ if (input) res |= msb;
+ *output = res;
+ output++;
+ }
+ }
+ ;
+
+ template<typename InIt, typename IntType>
+ static void DecodeSymbol(InIt &it, InIt end, IntType &output) // reads one value into output and advances it
+ {
+ T msb = 1 << (sizeof(T) * 8 - 1);
+ IntType shift = (sizeof(T) * 8 - 1);
+
+ output = 0;
+ size_t i = 0;
+ while (it != end && *it & msb) { // a unit with the flag set is continued by the next unit
+ IntType temp = *it & ~msb;
+ temp <<= shift * i;
+ output |= temp;
+ it++;
+ i++;
+ }
+ assert(it != end); // a value has to end with a unit whose flag is clear
+
+ IntType temp = *it;
+ temp <<= shift * i;
+ output |= temp;
+ it++;
+ }
+
+public:
+ // Encodes every value in [it, end) to outIt.
+ template<typename InIt, typename OutIt>
+ static void Encode(InIt it, InIt end, OutIt outIt)
+ {
+ while (it != end) {
+ EncodeSymbol(*it, outIt); // outIt is passed by value to EncodeSymbol
+ it++;
+ }
+ }
+ // Decodes values until it reaches end; it is advanced by reference.
+ template<typename InIt, typename OutIt>
+ static void Decode(InIt &it, InIt end, OutIt outIt)
+ {
+ while (it != end) {
+ size_t output;
+ DecodeSymbol(it, end, output);
+ *outIt = output;
+ outIt++;
+ }
+ }
+ // Decodes at most num values (fewer if end is reached) and returns their sum.
+ template<typename InIt>
+ static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
+ {
+ size_t sum = 0;
+ size_t curr = 0;
+
+ while (it != end && curr < num) {
+ size_t output;
+ DecodeSymbol(it, end, output);
+ sum += output;
+ curr++;
+ }
+
+ return sum;
+ }
+
+};
+
+typedef VarIntType<unsigned char> VarByte;
+
+typedef VarByte VarInt8;
+typedef VarIntType<unsigned short> VarInt16;
+typedef VarIntType<unsigned int> VarInt32;
+
+class Simple9
+{
+private:
+ typedef unsigned int uint;
+
+ template<typename InIt>
+ inline static void EncodeSymbol(uint &output, InIt it, InIt end) // packs [it, end) into one 32-bit word
+ {
+ uint length = end - it;
+
+ uint type = 0;
+ uint bitlength = 0;
+ // Value count -> selector (type) and bits per value; any other count leaves both at 0.
+ switch (length) {
+ case 1:
+ type = 1;
+ bitlength = 28;
+ break;
+ case 2:
+ type = 2;
+ bitlength = 14;
+ break;
+ case 3:
+ type = 3;
+ bitlength = 9;
+ break;
+ case 4:
+ type = 4;
+ bitlength = 7;
+ break;
+ case 5:
+ type = 5;
+ bitlength = 5;
+ break;
+ case 7:
+ type = 6;
+ bitlength = 4;
+ break;
+ case 9:
+ type = 7;
+ bitlength = 3;
+ break;
+ case 14:
+ type = 8;
+ bitlength = 2;
+ break;
+ case 28:
+ type = 9;
+ bitlength = 1;
+ break;
+ }
+
+ output = 0;
+ output |= (type << 28); // the selector occupies the top 4 bits
+
+ uint i = 0;
+ while (it != end) {
+ UTIL_THROW_IF2(*it > 268435455,
+ "You are trying to encode " << *it
+ << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");
+
+ uint l = bitlength * (length - i - 1); // the first value lands in the highest slot
+ output |= *it << l;
+ it++;
+ i++;
+ }
+ }
+
+ template<typename OutIt>
+ static inline void DecodeSymbol(uint input, OutIt outIt) // unpacks every value stored in one word
+ {
+ uint type = (input >> 28); // selector from the top 4 bits
+
+ uint bitlen = 0;
+ uint shift = 0;
+ uint mask = 0;
+ // Per selector: bits per value, shift of the first value, and the value mask.
+ switch (type) {
+ case 1:
+ bitlen = 28;
+ shift = 0;
+ mask = 268435455;
+ break;
+ case 2:
+ bitlen = 14;
+ shift = 14;
+ mask = 16383;
+ break;
+ case 3:
+ bitlen = 9;
+ shift = 18;
+ mask = 511;
+ break;
+ case 4:
+ bitlen = 7;
+ shift = 21;
+ mask = 127;
+ break;
+ case 5:
+ bitlen = 5;
+ shift = 20;
+ mask = 31;
+ break;
+ case 6:
+ bitlen = 4;
+ shift = 24;
+ mask = 15;
+ break;
+ case 7:
+ bitlen = 3;
+ shift = 24;
+ mask = 7;
+ break;
+ case 8:
+ bitlen = 2;
+ shift = 26;
+ mask = 3;
+ break;
+ case 9:
+ bitlen = 1;
+ shift = 27;
+ mask = 1;
+ break;
+ }
+ // Emit from the highest slot down; the slot at shift 0 is handled after the loop.
+ while (shift > 0) {
+ *outIt = (input >> shift) & mask;
+ shift -= bitlen;
+ outIt++;
+ }
+ *outIt = input & mask;
+ outIt++;
+ }
+
+ static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) // sums values of one word until curr reaches num
+ {
+ uint type = (input >> 28); // selector from the top 4 bits
+
+ uint bitlen = 0;
+ uint shift = 0;
+ uint mask = 0;
+ // Same selector table as in DecodeSymbol.
+ switch (type) {
+ case 1:
+ bitlen = 28;
+ shift = 0;
+ mask = 268435455;
+ break;
+ case 2:
+ bitlen = 14;
+ shift = 14;
+ mask = 16383;
+ break;
+ case 3:
+ bitlen = 9;
+ shift = 18;
+ mask = 511;
+ break;
+ case 4:
+ bitlen = 7;
+ shift = 21;
+ mask = 127;
+ break;
+ case 5:
+ bitlen = 5;
+ shift = 20;
+ mask = 31;
+ break;
+ case 6:
+ bitlen = 4;
+ shift = 24;
+ mask = 15;
+ break;
+ case 7:
+ bitlen = 3;
+ shift = 24;
+ mask = 7;
+ break;
+ case 8:
+ bitlen = 2;
+ shift = 26;
+ mask = 3;
+ break;
+ case 9:
+ bitlen = 1;
+ shift = 27;
+ mask = 1;
+ break;
+ }
+
+ size_t sum = 0;
+ while (shift > 0) {
+ sum += (input >> shift) & mask;
+ shift -= bitlen;
+ if (++curr == num) return sum; // stop in the middle of a word once num values were consumed
+ }
+ sum += input & mask;
+ curr++;
+ return sum;
+ }
+
+public:
+ template<typename InIt, typename OutIt>
+ static void Encode(InIt it, InIt end, OutIt outIt) // packs [it, end) into 32-bit Simple9 words
+ {
+ uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; // value counts that fit into one word
+
+ uint buffer[28];
+ for (InIt i = it; i < end; i++) {
+ uint lastbit = 1; // widest slot needed so far
+ uint lastpos = 0; // values looked at so far
+ uint lastyes = 0; // index of the last value of the largest layout that fits
+ uint j = 0;
+
+ double log2 = log(2);
+ while (j < 9 && lastpos < 28 && (i + lastpos) < end) {
+ if (lastpos >= parts[j]) j++;
+
+ buffer[lastpos] = *(i + lastpos);
+
+ uint reqbit = ceil(log(buffer[lastpos] + 1) / log2);
+ if (reqbit == 0) reqbit = 1; // a zero still occupies one bit; also prevents 28 / 0 below
+ assert(reqbit <= 28);
+ uint bit = 28 / floor(28 / reqbit); // slot width of the layout that can hold reqbit bits
+ if (lastbit < bit) lastbit = bit;
+
+ if (parts[j] > 28 / lastbit) break;
+ else if (lastpos == parts[j] - 1) lastyes = lastpos;
+
+ lastpos++;
+ }
+ i += lastyes; // the loop increment then steps past the last packed value
+
+ uint length = lastyes + 1;
+ uint output;
+ EncodeSymbol(output, buffer, buffer + length);
+
+ *outIt = output;
+ outIt++;
+ }
+ }
+
+ template<typename InIt, typename OutIt>
+ static void Decode(InIt &it, InIt end, OutIt outIt) // decodes every word in [it, end); it is advanced
+ {
+ while (it != end) {
+ DecodeSymbol(*it, outIt); // outIt is passed by value to DecodeSymbol
+ it++;
+ }
+ }
+
+ template<typename InIt>
+ static size_t DecodeAndSum(InIt &it, InIt end, size_t num) // sum of the first num values starting at it
+ {
+ size_t sum = 0;
+ size_t curr = 0; // values consumed so far, updated by DecodeAndSumSymbol
+ while (it != end && curr < num) {
+ sum += DecodeAndSumSymbol(*it, num, curr);
+ it++;
+ }
+ assert(curr == num); // fails when the range holds fewer than num values
+ return sum;
+ }
+};
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/MmapAllocator.h b/moses2/TranslationModel/CompactPT/MmapAllocator.h
new file mode 100644
index 000000000..1e40d8d41
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/MmapAllocator.h
@@ -0,0 +1,217 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_MmapAllocator_h
+#define moses_MmapAllocator_h
+
+#include <limits>
+#include <iostream>
+#include <cstdio>
+#include <unistd.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
+#endif
+
+#include "util/mmap.hh"
+
+namespace Moses2
+{
+template<class T>
+class MmapAllocator // allocator whose storage is a memory-mapped region of a file
+{
+protected:
+ std::FILE* m_file_ptr; // backing file
+ size_t m_file_desc; // fileno(m_file_ptr)
+
+ size_t m_page_size; // util::SizePage()
+ size_t m_map_size; // byte size requested by the last allocate()
+
+ char* m_data_ptr; // start of the mapping created by allocate()
+ size_t m_data_offset; // file offset of the data in fixed mode
+ bool m_fixed; // true: map an existing file region read-only at m_data_offset
+ size_t* m_count; // counter shared by all copies of this allocator
+
+public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+ // Backed by a fresh temporary file.
+ MmapAllocator() throw () :
+ m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(
+ util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed(
+ false), m_count(new size_t(0))
+ {
+ }
+ // Backed by an already opened file, not fixed.
+ MmapAllocator(std::FILE* f_ptr) throw () :
+ m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), m_page_size(
+ util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed(
+ false), m_count(new size_t(0))
+ {
+ }
+ // Fixed mode: maps the region of f_ptr that starts at data_offset.
+ MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw () :
+ m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), m_page_size(
+ util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(
+ data_offset), m_fixed(true), m_count(new size_t(0))
+ {
+ }
+ // Backed by fileName, opened with "wb+".
+ MmapAllocator(std::string fileName) throw () :
+ m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(
+ fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(
+ 0), m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ {
+ }
+ // Copies share the file, the mapping and the counter; the counter is incremented.
+ MmapAllocator(const MmapAllocator& c) throw () :
+ m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc), m_page_size(
+ c.m_page_size), m_map_size(c.m_map_size), m_data_ptr(c.m_data_ptr), m_data_offset(
+ c.m_data_offset), m_fixed(c.m_fixed), m_count(c.m_count)
+ {
+ (*m_count)++;
+ }
+
+ ~MmapAllocator() throw ()
+ {
+ if (m_data_ptr && *m_count == 0) { // last holder of an existing mapping
+ util::UnmapOrThrow(m_data_ptr, m_map_size);
+ if (!m_fixed && std::ftell(m_file_ptr) != -1) std::fclose(m_file_ptr);
+ }
+ (*m_count)--; // NOTE(review): the counter object itself is never deleted here
+ }
+
+ template<class U>
+ struct rebind
+ {
+ typedef MmapAllocator<U> other;
+ };
+
+ pointer address(reference value) const
+ {
+ return &value;
+ }
+
+ const_pointer address(const_reference value) const
+ {
+ return &value;
+ }
+
+ size_type max_size() const throw ()
+ {
+ return std::numeric_limits<size_t>::max() / sizeof(value_type);
+ }
+ // Maps num * sizeof(T) bytes of the backing file and returns the start of the data.
+ pointer allocate(size_type num, const void* = 0)
+ {
+ m_map_size = num * sizeof(T);
+
+#if defined(_WIN32) || defined(_WIN64)
+ // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
+ const int map_shared = 0;
+#else
+ const int map_shared = MAP_SHARED;
+#endif
+ if (!m_fixed) {
+ size_t read = 0;
+ read += ftruncate(m_file_desc, m_map_size); // NOTE(review): the result is stored but never inspected
+ m_data_ptr = (char *) util::MapOrThrow(m_map_size, true, map_shared,
+ false, m_file_desc, 0);
+ return (pointer) m_data_ptr;
+ }
+ else {
+ const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; // round down to a page boundary
+ const size_t relative_offset = m_data_offset - map_offset; // data start inside the mapping
+ const size_t adjusted_map_size = m_map_size + relative_offset;
+
+ m_data_ptr = (char *) util::MapOrThrow(adjusted_map_size, false,
+ map_shared, false, m_file_desc, map_offset);
+
+ return (pointer) (m_data_ptr + relative_offset);
+ }
+ }
+ // Unmaps a region returned by allocate(); in fixed mode the page-alignment offset is undone first.
+ void deallocate(pointer p, size_type num)
+ {
+ if (!m_fixed) {
+ util::UnmapOrThrow(p, num * sizeof(T));
+ }
+ else {
+ const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+ const size_t relative_offset = m_data_offset - map_offset;
+ const size_t adjusted_map_size = m_map_size + relative_offset; // uses m_map_size, not num
+
+ util::UnmapOrThrow((pointer) ((char*) p - relative_offset),
+ adjusted_map_size);
+ }
+ }
+ // construct/destroy do nothing in fixed mode, so the mapped file contents are left untouched.
+ void construct(pointer p, const T& value)
+ {
+ if (!m_fixed) new (p) value_type(value);
+ }
+ void destroy(pointer p)
+ {
+ if (!m_fixed) p->~T();
+ }
+
+ template<class T1, class T2>
+ friend bool operator==(const MmapAllocator<T1>&,
+ const MmapAllocator<T2>&) throw ();
+
+ template<class T1, class T2>
+ friend bool operator!=(const MmapAllocator<T1>&,
+ const MmapAllocator<T2>&) throw ();
+};
+
+template<class T1, class T2>
+bool operator==(const MmapAllocator<T1>& a1,
+ const MmapAllocator<T2>& a2) throw ()
+{
+ // Equal when both describe the same file and the same mapping; m_count is not compared.
+ // The operands are plain values without side effects, so && yields the same result as &=.
+ return a1.m_file_ptr == a2.m_file_ptr
+ && a1.m_file_desc == a2.m_file_desc
+ && a1.m_page_size == a2.m_page_size
+ && a1.m_map_size == a2.m_map_size
+ && a1.m_data_ptr == a2.m_data_ptr
+ && a1.m_data_offset == a2.m_data_offset
+ && a1.m_fixed == a2.m_fixed;
+}
+
+template<class T1, class T2>
+bool operator!=(const MmapAllocator<T1>& a1,
+ const MmapAllocator<T2>& a2) throw ()
+{
+ return !(a1 == a2); // defined as the negation of operator==
+}
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/MonotonicVector.h b/moses2/TranslationModel/CompactPT/MonotonicVector.h
new file mode 100644
index 000000000..586397db8
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/MonotonicVector.h
@@ -0,0 +1,247 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_MonotonicVector_h
+#define moses_MonotonicVector_h
+
+// MonotonicVector - Represents a monotonic increasing function that maps
+// positive integers of any size onto a given number type. Each value has to be
+// equal or larger than the previous one. Depending on the stepSize it can save
+// up to 90% of memory compared to a std::vector<long>. Time complexity is roughly
+// constant, in the worst case, however, stepSize times slower than a normal
+// std::vector.
+
+#include <vector>
+#include <limits>
+#include <algorithm>
+#include <cstdio>
+#include <cassert>
+
+#include "ThrowingFwrite.h"
+#include "ListCoders.h"
+#include "MmapAllocator.h"
+
+namespace Moses2
+{
+
+template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
+ template<typename > class Allocator = std::allocator>
+class MonotonicVector
+{
+private:
+ typedef std::vector<NumT, Allocator<NumT> > Anchors;
+ typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
+
+ Anchors m_anchors;
+ Diffs m_diffs;
+ std::vector<unsigned int> m_tempDiffs;
+
+ size_t m_size;
+ PosT m_last;
+ bool m_final;
+
+public:
+ typedef PosT value_type;
+
+ MonotonicVector() :
+ m_size(0), m_last(0), m_final(false)
+ { // starts empty and open for push_back()
+ }
+
+ size_t size() const
+ {
+ return m_size + m_tempDiffs.size(); // committed values plus diffs still buffered in m_tempDiffs
+ }
+
+ PosT at(size_t i) const // value at index i: the block's first value plus the diffs before i
+ {
+ PosT s = stepSize;
+ PosT j = m_anchors[i / s]; // offset in m_diffs where the block containing i starts
+ PosT r = i % s; // number of diffs to add on top of the block's first value
+
+ typename Diffs::const_iterator it = m_diffs.begin() + j;
+
+ PosT k = 0;
+ k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1); // the block's first value is stored in full
+ if (i < m_size) k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
+ else if (i < m_size + m_tempDiffs.size()) for (size_t l = 0; l < r; l++)
+ k += m_tempDiffs[l]; // not committed yet: the diffs are still in m_tempDiffs
+
+ return k;
+ }
+
+ PosT operator[](PosT i) const
+ {
+ return at(i); // plain forward to at(); read-only access
+ }
+
+ PosT back() const
+ {
+ return at(size() - 1); // NOTE(review): size() - 1 wraps around when the vector is empty
+ }
+
+ void push_back(PosT i) // appends i; only allowed before commit()
+ {
+ assert(m_final != true);
+ // Very first value: opens block 0 and is stored in full as a VarInt32.
+ if (m_anchors.size() == 0 && m_tempDiffs.size() == 0) {
+ m_anchors.push_back(0);
+ VarInt32::Encode(&i, &i + 1, std::back_inserter(m_diffs));
+ m_last = i;
+ m_size++;
+
+ return;
+ }
+ // Block complete: flush the buffered diffs and start a new block with i stored in full.
+ if (m_tempDiffs.size() == stepSize - 1) {
+ Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
+ std::back_inserter(m_diffs));
+ m_anchors.push_back(m_diffs.size());
+ VarInt32::Encode(&i, &i + 1, std::back_inserter(m_diffs));
+
+ m_size += m_tempDiffs.size() + 1;
+ m_tempDiffs.clear();
+ }
+ else {
+ PosT last = m_last;
+ PosT diff = i - last; // difference to the previously pushed value
+ m_tempDiffs.push_back(diff);
+ }
+ m_last = i;
+ }
+
+ void commit() // flushes the buffered diffs and marks the vector as final
+ {
+ assert(m_final != true);
+ Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
+ std::back_inserter(m_diffs));
+ m_size += m_tempDiffs.size();
+ m_tempDiffs.clear();
+ m_final = true; // push_back() and commit() assert on this flag
+ }
+
+ size_t usage() // bytes held by the elements of m_diffs and m_anchors
+ {
+ return m_diffs.size() * sizeof(unsigned int)
+ + m_anchors.size() * sizeof(NumT);
+ }
+
+ size_t load(std::FILE* in, bool map = false) // reads the layout written by save()
+ {
+ size_t byteSize = 0;
+ // Header: m_final, m_size, m_last. NOTE(review): the fread results are not checked for short reads.
+ byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
+ byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
+ byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
+ // Payload: diffs first, then anchors.
+ byteSize += loadVector(m_diffs, in, map);
+ byteSize += loadVector(m_anchors, in, map);
+
+ return byteSize;
+ }
+
+ template<typename ValueT>
+ size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
+ std::FILE* in, bool map = false) // overload for heap-backed vectors
+ {
+ // Can only be read into memory. Mapping not possible with std:allocator.
+ assert(map == false);
+
+ size_t byteSize = 0;
+ // The element count is stored in front of the elements.
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ v.resize(valSize, 0);
+ byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ template<typename ValueT>
+ size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
+ std::FILE* in, bool map = false) // overload for MmapAllocator-backed vectors
+ {
+ size_t byteSize = 0;
+ // The element count is stored in front of the elements.
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ if (map == false) {
+ // Read data into temporary file (default constructor of MmapAllocator)
+ // and map memory onto temporary file. Can be resized.
+
+ v.resize(valSize, 0);
+ byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in)
+ * sizeof(ValueT);
+ }
+ else {
+ // Map it directly on specified region of file "in" starting at valPos
+ // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
+
+ size_t valPos = std::ftell(in);
+ // The temporary vector allocates through the fixed-offset allocator and is then swapped into v.
+ Allocator<ValueT> alloc(in, valPos);
+ std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
+ vTemp.resize(valSize);
+ v.swap(vTemp);
+ // Skip the mapped bytes so the next read starts after this vector.
+ std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
+ byteSize += valSize * sizeof(ValueT);
+ }
+
+ return byteSize;
+ }
+
+ size_t save(std::FILE* out) // writes the vector to out in the layout load() reads; returns the accumulated byte size
+ {
+ if (!m_final) commit(); // pending diffs have to be encoded into m_diffs first
+ // byteSize has to be a size_t: declared as bool it collapsed the total to 0 or 1.
+ size_t byteSize = 0;
+ byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
+ byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out)
+ * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
+ // Each array is written as its element count followed by the raw elements.
+ size_t size = m_diffs.size();
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out)
+ * sizeof(unsigned int);
+
+ size = m_anchors.size();
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out)
+ * sizeof(NumT);
+
+ return byteSize;
+ }
+
+ void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv) // exchanges the encoded data with mv
+ {
+ if (!m_final) commit(); // only this vector is committed; mv is left as it is
+ // NOTE(review): m_size, m_last, m_final and m_tempDiffs are not exchanged - confirm this is intended.
+ m_diffs.swap(mv.m_diffs);
+ m_anchors.swap(mv.m_anchors);
+ }
+};
+
+}
+#endif
diff --git a/moses2/TranslationModel/CompactPT/MurmurHash3.cpp b/moses2/TranslationModel/CompactPT/MurmurHash3.cpp
new file mode 100644
index 000000000..c3e567af6
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/MurmurHash3.cpp
@@ -0,0 +1,424 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "MurmurHash3.h"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+
+#include <cstdlib>
+
+#define ROTL32(x,y) _rotl(x,y)
+#define ROTL64(x,y) _rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline))
+
+// Rotate the 32-bit value x left by r bits (0 <= r < 32). The right-shift
+// count is masked so that r == 0 no longer shifts by the full width of the
+// type, which is undefined behaviour.
+inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+  return (x << r) | (x >> ((32 - r) & 31));
+}
+
+// Rotate the 64-bit value x left by r bits (0 <= r < 64). The right-shift
+// count is masked so that r == 0 no longer shifts by the full width of the
+// type, which is undefined behaviour.
+inline uint64_t rotl64(uint64_t x, int8_t r)
+{
+  return (x << r) | (x >> ((64 - r) & 63));
+}
+
+#define ROTL32(x,y) rotl32(x,y)
+#define ROTL64(x,y) rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+// Return the 32-bit block at offset i (in blocks) from p.
+FORCE_INLINE uint32_t getblock(const uint32_t * p, int i)
+{
+  const uint32_t * block = p + i;
+  return *block;
+}
+
+// Return the 64-bit block at offset i (in blocks) from p.
+FORCE_INLINE uint64_t getblock(const uint64_t * p, int i)
+{
+  const uint64_t * block = p + i;
+  return *block;
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+// 32-bit finalization mix: three xor-shift steps interleaved with two
+// multiplications.
+FORCE_INLINE uint32_t fmix(uint32_t h)
+{
+  h = (h ^ (h >> 16)) * 0x85ebca6b;
+  h = (h ^ (h >> 13)) * 0xc2b2ae35;
+  return h ^ (h >> 16);
+}
+
+//----------
+
+// 64-bit finalization mix: three xor-shift steps interleaved with two
+// multiplications.
+FORCE_INLINE uint64_t fmix(uint64_t k)
+{
+  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
+  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  return k ^ (k >> 33);
+}
+
+//-----------------------------------------------------------------------------
+
+// MurmurHash3, x86 32-bit variant: hashes `len` bytes starting at `key` with
+// the given `seed` and stores the 32-bit result through `out`.
+void MurmurHash3_x86_32(const void * key, int len, uint32_t seed, void * out)
+{
+  const uint8_t * bytes = (const uint8_t*) key;
+  const int blockCount = len / 4;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  uint32_t hash = seed;
+
+  //----------
+  // body: whole 4-byte blocks, addressed with negative indices from the end
+  // of the block region
+
+  const uint32_t * blockEnd = (const uint32_t *) (bytes + blockCount * 4);
+
+  for (int i = -blockCount; i != 0; ++i) {
+    uint32_t k = getblock(blockEnd, i);
+
+    k = ROTL32(k * c1, 15) * c2;
+
+    hash = ROTL32(hash ^ k, 13);
+    hash = hash * 5 + 0xe6546b64;
+  }
+
+  //----------
+  // tail: the remaining 0-3 bytes, byte j placed at bit position 8 * j
+
+  const uint8_t * rest = bytes + blockCount * 4;
+  const int restLen = len & 3;
+
+  if (restLen != 0) {
+    uint32_t k = 0;
+    for (int j = restLen - 1; j >= 0; --j) {
+      k ^= uint32_t(rest[j]) << (8 * j);
+    }
+    k = ROTL32(k * c1, 15) * c2;
+    hash ^= k;
+  }
+
+  //----------
+  // finalization
+
+  hash ^= len;
+  hash = fmix(hash);
+
+  *(uint32_t*) out = hash;
+}
+
+//-----------------------------------------------------------------------------
+
+/**
+ * MurmurHash3, x86 128-bit variant.
+ *
+ * @param key  Start of the data to hash.
+ * @param len  Number of bytes to hash.
+ * @param seed Initial value of all four 32-bit hash lanes.
+ * @param out  Receives four uint32_t values (16 bytes).
+ */
+void MurmurHash3_x86_128(const void * key, const int len, uint32_t seed,
+                         void * out)
+{
+  const uint8_t * data = (const uint8_t*) key;
+  const int nblocks = len / 16;
+
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  uint32_t c1 = 0x239b961b;
+  uint32_t c2 = 0xab0e9789;
+  uint32_t c3 = 0x38b34ae5;
+  uint32_t c4 = 0xa1e38b93;
+
+  //----------
+  // body: whole 16-byte blocks, addressed with negative indices from the end
+  // of the block region
+
+  const uint32_t * blocks = (const uint32_t *) (data + nblocks * 16);
+
+  for (int i = -nblocks; i; i++) {
+    uint32_t k1 = getblock(blocks, i * 4 + 0);
+    uint32_t k2 = getblock(blocks, i * 4 + 1);
+    uint32_t k3 = getblock(blocks, i * 4 + 2);
+    uint32_t k4 = getblock(blocks, i * 4 + 3);
+
+    k1 *= c1;
+    k1 = ROTL32(k1, 15);
+    k1 *= c2;
+    h1 ^= k1;
+
+    h1 = ROTL32(h1, 19);
+    h1 += h2;
+    h1 = h1 * 5 + 0x561ccd1b;
+
+    k2 *= c2;
+    k2 = ROTL32(k2, 16);
+    k2 *= c3;
+    h2 ^= k2;
+
+    h2 = ROTL32(h2, 17);
+    h2 += h3;
+    h2 = h2 * 5 + 0x0bcaa747;
+
+    k3 *= c3;
+    k3 = ROTL32(k3, 17);
+    k3 *= c4;
+    h3 ^= k3;
+
+    h3 = ROTL32(h3, 15);
+    h3 += h4;
+    h3 = h3 * 5 + 0x96cd1c35;
+
+    k4 *= c4;
+    k4 = ROTL32(k4, 18);
+    k4 *= c1;
+    h4 ^= k4;
+
+    h4 = ROTL32(h4, 13);
+    h4 += h1;
+    h4 = h4 * 5 + 0x32ac3b17;
+  }
+
+  //----------
+  // tail: the remaining 0-15 bytes. Every case deliberately falls through to
+  // the next one. Each byte is widened to uint32_t before shifting: the
+  // former `tail[n] << 24` shifted a promoted (signed) int into its sign bit.
+
+  const uint8_t * tail = (const uint8_t*) (data + nblocks * 16);
+
+  uint32_t k1 = 0;
+  uint32_t k2 = 0;
+  uint32_t k3 = 0;
+  uint32_t k4 = 0;
+
+  switch (len & 15) {
+  case 15:
+    k4 ^= uint32_t(tail[14]) << 16;
+  case 14:
+    k4 ^= uint32_t(tail[13]) << 8;
+  case 13:
+    k4 ^= uint32_t(tail[12]) << 0;
+    k4 *= c4;
+    k4 = ROTL32(k4, 18);
+    k4 *= c1;
+    h4 ^= k4;
+
+  case 12:
+    k3 ^= uint32_t(tail[11]) << 24;
+  case 11:
+    k3 ^= uint32_t(tail[10]) << 16;
+  case 10:
+    k3 ^= uint32_t(tail[9]) << 8;
+  case 9:
+    k3 ^= uint32_t(tail[8]) << 0;
+    k3 *= c3;
+    k3 = ROTL32(k3, 17);
+    k3 *= c4;
+    h3 ^= k3;
+
+  case 8:
+    k2 ^= uint32_t(tail[7]) << 24;
+  case 7:
+    k2 ^= uint32_t(tail[6]) << 16;
+  case 6:
+    k2 ^= uint32_t(tail[5]) << 8;
+  case 5:
+    k2 ^= uint32_t(tail[4]) << 0;
+    k2 *= c2;
+    k2 = ROTL32(k2, 16);
+    k2 *= c3;
+    h2 ^= k2;
+
+  case 4:
+    k1 ^= uint32_t(tail[3]) << 24;
+  case 3:
+    k1 ^= uint32_t(tail[2]) << 16;
+  case 2:
+    k1 ^= uint32_t(tail[1]) << 8;
+  case 1:
+    k1 ^= uint32_t(tail[0]) << 0;
+    k1 *= c1;
+    k1 = ROTL32(k1, 15);
+    k1 *= c2;
+    h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+  h2 ^= len;
+  h3 ^= len;
+  h4 ^= len;
+
+  h1 += h2;
+  h1 += h3;
+  h1 += h4;
+  h2 += h1;
+  h3 += h1;
+  h4 += h1;
+
+  h1 = fmix(h1);
+  h2 = fmix(h2);
+  h3 = fmix(h3);
+  h4 = fmix(h4);
+
+  h1 += h2;
+  h1 += h3;
+  h1 += h4;
+  h2 += h1;
+  h3 += h1;
+  h4 += h1;
+
+  ((uint32_t*) out)[0] = h1;
+  ((uint32_t*) out)[1] = h2;
+  ((uint32_t*) out)[2] = h3;
+  ((uint32_t*) out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+// MurmurHash3, x64 128-bit variant: hashes `len` bytes starting at `key` with
+// the given `seed` and stores two uint64_t values (16 bytes) through `out`.
+void MurmurHash3_x64_128(const void * key, const int len, const uint32_t seed,
+                         void * out)
+{
+  const uint8_t * bytes = (const uint8_t*) key;
+  const int blockCount = len / 16;
+
+  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
+
+  //----------
+  // body: whole 16-byte blocks, two 64-bit lanes per block
+
+  const uint64_t * blocks = (const uint64_t *) bytes;
+
+  for (int i = 0; i < blockCount; ++i) {
+    uint64_t k1 = getblock(blocks, 2 * i);
+    uint64_t k2 = getblock(blocks, 2 * i + 1);
+
+    k1 = ROTL64(k1 * c1, 31) * c2;
+    h1 = ROTL64(h1 ^ k1, 27) + h2;
+    h1 = h1 * 5 + 0x52dce729;
+
+    k2 = ROTL64(k2 * c2, 33) * c1;
+    h2 = ROTL64(h2 ^ k2, 31) + h1;
+    h2 = h2 * 5 + 0x38495ab5;
+  }
+
+  //----------
+  // tail: the remaining 0-15 bytes
+
+  const uint8_t * rest = bytes + blockCount * 16;
+  const int restLen = len & 15;
+
+  // bytes 8..14 go into the second lane, byte j at bit position 8 * (j - 8)
+  if (restLen > 8) {
+    uint64_t k2 = 0;
+    for (int j = restLen - 1; j >= 8; --j) {
+      k2 ^= uint64_t(rest[j]) << (8 * (j - 8));
+    }
+    k2 = ROTL64(k2 * c2, 33) * c1;
+    h2 ^= k2;
+  }
+
+  // bytes 0..7 go into the first lane, byte j at bit position 8 * j
+  if (restLen > 0) {
+    const int upper = restLen < 8 ? restLen : 8;
+    uint64_t k1 = 0;
+    for (int j = upper - 1; j >= 0; --j) {
+      k1 ^= uint64_t(rest[j]) << (8 * j);
+    }
+    k1 = ROTL64(k1 * c1, 31) * c2;
+    h1 ^= k1;
+  }
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+  h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix(h1);
+  h2 = fmix(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  uint64_t * result = (uint64_t*) out;
+  result[0] = h1;
+  result[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
+
diff --git a/moses2/TranslationModel/CompactPT/MurmurHash3.h b/moses2/TranslationModel/CompactPT/MurmurHash3.h
new file mode 100644
index 000000000..f513008cf
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/MurmurHash3.h
@@ -0,0 +1,37 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+typedef unsigned char uint8_t;
+typedef unsigned long uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32(const void * key, int len, uint32_t seed, void * out); // hashes len bytes at key; stores one uint32_t through out
+ // Hashes len bytes at key; stores four uint32_t values (16 bytes) through out.
+void MurmurHash3_x86_128(const void * key, int len, uint32_t seed, void * out);
+ // Hashes len bytes at key; stores two uint64_t values (16 bytes) through out.
+void MurmurHash3_x64_128(const void * key, int len, uint32_t seed, void * out);
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
diff --git a/moses2/TranslationModel/CompactPT/PackedArray.h b/moses2/TranslationModel/CompactPT/PackedArray.h
new file mode 100644
index 000000000..409c3cca8
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/PackedArray.h
@@ -0,0 +1,207 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_PackedArray_h
+#define moses_PackedArray_h
+
+#include <vector>
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+
+#include "ThrowingFwrite.h"
+
+namespace Moses2
+{
+
+/**
+ * Array of m_size values stored back to back with a caller-chosen bit width
+ * inside storage units of type D. The width is not remembered: callers pass
+ * the same `bits` to the sizing constructor, Get() and Set().
+ */
+template<typename T = size_t, typename D = unsigned char>
+class PackedArray
+{
+protected:
+  static size_t m_dataBits;  // bits per storage unit (sizeof(D) * 8)
+
+  size_t m_size;             // number of packed elements
+  size_t m_storageSize;      // number of D units in m_storage
+  D* m_storage;              // owned; released in the destructor
+
+  // Mask with the low `bits` bits set. Unlike `(1ul << bits) - 1` it never
+  // shifts by the full width of the type (undefined behaviour) and does not
+  // depend on `unsigned long` being as wide as size_t.
+  static size_t LowMask(size_t bits)
+  {
+    const size_t width = sizeof(size_t) * 8;
+    return bits >= width ? ~size_t(0) : (size_t(1) << bits) - 1;
+  }
+
+public:
+  PackedArray()
+  {
+    m_size = 0;
+    m_storageSize = 0;
+    m_storage = new D[0];
+  }
+
+  // Allocate room for `size` elements of `bits` bits each.
+  PackedArray(size_t size, size_t bits) :
+    m_size(size)
+  {
+    // Integer round-up; the former float-based ceil() lost precision for
+    // large bit counts and could under-allocate.
+    m_storageSize = (bits * size + m_dataBits - 1) / m_dataBits;
+    // Value-initialised so padding bits written by Save() are deterministic.
+    m_storage = new D[m_storageSize]();
+  }
+
+  PackedArray(const PackedArray<T, D> &c)
+  {
+    m_size = c.m_size;
+
+    m_storageSize = c.m_storageSize;
+    m_storage = new D[m_storageSize];
+
+    std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
+  }
+
+  // Deep-copying assignment; without it the implicit member-wise assignment
+  // would leave two objects deleting the same buffer.
+  PackedArray<T, D>& operator=(const PackedArray<T, D> &c)
+  {
+    if (this != &c) {
+      D* fresh = new D[c.m_storageSize];
+      std::memcpy(fresh, c.m_storage, c.m_storageSize * sizeof(D));
+
+      delete[] m_storage;
+      m_storage = fresh;
+      m_size = c.m_size;
+      m_storageSize = c.m_storageSize;
+    }
+    return *this;
+  }
+
+  virtual ~PackedArray()
+  {
+    delete[] m_storage;
+    m_size = 0;
+    m_storageSize = 0;
+    m_storage = 0;
+  }
+
+  // Return element i, reading `bits` bits starting at bit offset i * bits.
+  T Get(size_t i, size_t bits) const
+  {
+    T out = 0;
+
+    size_t bitstart = (i * bits);
+    size_t bitpos = bitstart;
+
+    // Gather one storage unit per iteration; bits beyond the element are
+    // removed by the final mask.
+    while (bitpos - bitstart < bits) {
+      size_t pos = bitpos / m_dataBits;
+      size_t off = bitpos % m_dataBits;
+
+      out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
+
+      bitpos += (m_dataBits - off);
+    }
+
+    out &= LowMask(bits);
+    return out;
+  }
+
+  // Store v as element i using `bits` bits starting at bit offset i * bits.
+  void Set(size_t i, T v, size_t bits)
+  {
+    size_t bitstart = (i * bits);
+    size_t bitpos = bitstart;
+
+    while (bitpos - bitstart < bits) {
+      size_t pos = bitpos / m_dataBits;
+      size_t off = bitpos % m_dataBits;
+
+      size_t rest = bits - (bitpos - bitstart);
+      // Keep the bits of this unit that lie outside [off, off + rest).
+      D zero = D(~LowMask(rest + off) | LowMask(off));
+
+      m_storage[pos] &= zero;
+      m_storage[pos] |= v << off;
+      v = v >> (m_dataBits - off);
+      bitpos += (m_dataBits - off);
+    }
+  }
+
+  virtual D*& GetStorage()
+  {
+    return m_storage;
+  }
+
+  virtual size_t GetStorageSize() const
+  {
+    return m_storageSize;
+  }
+
+  virtual size_t Size() const
+  {
+    return m_size;
+  }
+
+  // Replace the contents with data read from `in` (element count, storage
+  // size, storage units). Returns the distance the stream position moved.
+  // NOTE(review): the fread results are only accumulated, never checked.
+  virtual size_t Load(std::FILE* in)
+  {
+    size_t a1 = std::ftell(in);
+
+    size_t read = 0;
+    read += std::fread(&m_size, sizeof(m_size), 1, in);
+    read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
+    delete[] m_storage;
+    m_storage = new D[m_storageSize];
+    read += std::fread(m_storage, sizeof(D), m_storageSize, in);
+
+    size_t a2 = std::ftell(in);
+    return a2 - a1;
+  }
+
+  // Write element count, storage size and storage units to `out`. Returns
+  // the distance the stream position moved.
+  virtual size_t Save(std::FILE* out)
+  {
+    size_t a1 = std::ftell(out);
+
+    ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
+    ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
+    ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
+
+    size_t a2 = std::ftell(out);
+    return a2 - a1;
+  }
+
+};
+
+template<typename T, typename D>
+size_t PackedArray<T, D>::m_dataBits = sizeof(D) * 8;
+
+/**************************************************************************/
+
+/**
+ * PackedArray whose elements are pairs: the first component occupies the low
+ * `bits1` bits of an element and the second the `bits2` bits above it.
+ */
+template<typename T = size_t, typename D = unsigned char>
+class PairedPackedArray: public PackedArray<T, D>
+{
+public:
+  PairedPackedArray() :
+    PackedArray<T, D>()
+  {
+  }
+
+  PairedPackedArray(size_t size, size_t bits1, size_t bits2) :
+    PackedArray<T, D>(size, bits1 + bits2)
+  {
+  }
+
+  // Store (a, b) as element i: a in the low bits1 bits, b above it.
+  void Set(size_t i, T a, T b, size_t bits1, size_t bits2)
+  {
+    T c = a | (b << bits1);
+    PackedArray<T, D>::Set(i, c, bits1 + bits2);
+  }
+
+  // Pair overload. It previously called the base Set() without its `bits`
+  // argument (so it could not be instantiated) and packed the components in
+  // the opposite order to Get(); it now round-trips with Get().
+  void Set(size_t i, std::pair<T, T> p, size_t bits1, size_t bits2)
+  {
+    Set(i, p.first, p.second, bits1, bits2);
+  }
+
+  // Return element i as (low bits1 bits, remaining high bits).
+  std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2)
+  {
+    T v = PackedArray<T, D>::Get(i, bits1 + bits2);
+    // T(1) rather than int 1: the shift must happen in T's width.
+    T a = v & ((T(1) << bits1) - 1);
+    T b = v >> bits1;
+    return std::pair<T, T>(a, b);
+  }
+};
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses2/TranslationModel/CompactPT/PhraseDecoder.cpp
new file mode 100644
index 000000000..7860fed94
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/PhraseDecoder.cpp
@@ -0,0 +1,466 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <deque>
+
+#include "PhraseDecoder.h"
+#include "../../System.h"
+#include "../../SubPhrase.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Build a decoder with no coding selected and no Huffman trees loaded; the
+// real configuration is read later by Load(). `input` and `output` are
+// stored as pointers, not copied.
+PhraseDecoder::PhraseDecoder(
+  PhraseTableCompact &phraseDictionary,
+  const std::vector<FactorType>* input,
+  const std::vector<FactorType>* output,
+  size_t numScoreComponent
+  // , const std::vector<float>* weight
+)
+  : m_coding(None), m_numScoreComponent(numScoreComponent),
+    m_containsAlignmentInfo(true), m_maxRank(0),
+    // previously left uninitialised, so GetMaxSourcePhraseLength() returned
+    // an indeterminate value before Load()
+    m_maxPhraseLength(0),
+    m_symbolTree(0), m_multipleScoreTrees(false),
+    m_scoreTrees(1), m_alignTree(0),
+    m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
+    // m_weight(weight),
+    m_separator(" ||| ")
+{ }
+
+// Release the Huffman trees owned by the decoder. Deleting a null pointer is
+// a no-op, so no explicit null checks are needed.
+PhraseDecoder::~PhraseDecoder()
+{
+  delete m_symbolTree;
+
+  typedef std::vector<CanonicalHuffman<float>*>::iterator TreeIter;
+  for(TreeIter tree = m_scoreTrees.begin(); tree != m_scoreTrees.end(); ++tree)
+    delete *tree;
+
+  delete m_alignTree;
+}
+
+// Return the id of `symbol`, consulting m_sourceSymbolsMap first and
+// recording the result of m_sourceSymbols.find() there on a miss.
+inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
+{
+  typedef boost::unordered_map<std::string, unsigned> SymbolCache;
+
+  SymbolCache::iterator cached = m_sourceSymbolsMap.find(symbol);
+  if(cached == m_sourceSymbolsMap.end()) {
+    size_t found = m_sourceSymbols.find(symbol);
+    m_sourceSymbolsMap[symbol] = found;
+    return found;
+  }
+
+  return cached->second;
+}
+
+// Return the target symbol stored at `idx`, or the marker string
+// "##ERROR##" when `idx` is out of range.
+inline std::string PhraseDecoder::GetTargetSymbol(unsigned idx) const
+{
+  if(idx >= m_targetSymbols.size())
+    return std::string("##ERROR##");
+
+  return m_targetSymbols[idx];
+}
+
+// REnc symbol type: the top two bits of the symbol, plus one (1..4).
+inline size_t PhraseDecoder::GetREncType(unsigned encodedSymbol)
+{
+  const unsigned tag = encodedSymbol >> 30;
+  return tag + 1;
+}
+
+// PREnc symbol type: the top bit of the symbol, plus one (1 or 2).
+inline size_t PhraseDecoder::GetPREncType(unsigned encodedSymbol)
+{
+  const unsigned tag = encodedSymbol >> 31;
+  return tag + 1;
+}
+
+// Look up the lexical-table entry `rank` places after the first entry of
+// source symbol `srcIdx` and return its second (target) component.
+inline unsigned PhraseDecoder::GetTranslation(unsigned srcIdx, size_t rank)
+{
+  const size_t firstEntry = m_lexicalTableIndex[srcIdx];
+  const SrcTrg &entry = m_lexicalTable[firstEntry + rank];
+  return entry.second;
+}
+
+size_t PhraseDecoder::GetMaxSourcePhraseLength() // Returns m_maxPhraseLength, the value Load() reads from the file.
+{
+ return m_maxPhraseLength;
+}
+
+// Strip the two type bits (bits 30-31) from a type-1 REnc symbol.
+// The mask is built from an unsigned literal: `3 << 30` overflowed int.
+inline unsigned PhraseDecoder::DecodeREncSymbol1(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(3u << 30);
+}
+
+// Rank of a type-2 REnc symbol: the low 24 bits (top byte cleared).
+// The mask is built from an unsigned literal: `255 << 24` overflowed int.
+inline unsigned PhraseDecoder::DecodeREncSymbol2Rank(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(255u << 24);
+}
+
+// Source position of a type-2 REnc symbol: bits 24-29, i.e. the top byte
+// without the two type bits.
+// The mask is built from an unsigned literal: `3 << 30` overflowed int.
+inline unsigned PhraseDecoder::DecodeREncSymbol2Position(unsigned encodedSymbol)
+{
+  return (encodedSymbol & ~(3u << 30)) >> 24;
+}
+
+// Strip the two type bits (bits 30-31) from a type-3 REnc symbol.
+// The mask is built from an unsigned literal: `3 << 30` overflowed int.
+inline unsigned PhraseDecoder::DecodeREncSymbol3(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(3u << 30);
+}
+
+// Strip the type bit (bit 31) from a type-1 PREnc symbol.
+// The mask is built from an unsigned literal: `1 << 31` overflowed int.
+inline unsigned PhraseDecoder::DecodePREncSymbol1(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(1u << 31);
+}
+
+// Left offset of a PREnc sub-phrase pointer: the 6-bit field at bits 25-30,
+// stored with a bias of 32.
+inline int PhraseDecoder::DecodePREncSymbol2Left(unsigned encodedSymbol)
+{
+  const unsigned field = (encodedSymbol >> 25) & 63;
+  return field - 32;
+}
+
+// Right offset of a PREnc sub-phrase pointer: the 6-bit field at bits 19-24,
+// stored with a bias of 32.
+inline int PhraseDecoder::DecodePREncSymbol2Right(unsigned encodedSymbol)
+{
+  const unsigned field = (encodedSymbol >> 19) & 63;
+  return field - 32;
+}
+
+// Rank of a PREnc sub-phrase pointer: the low 19 bits of the symbol.
+inline unsigned PhraseDecoder::DecodePREncSymbol2Rank(unsigned encodedSymbol)
+{
+  const unsigned rankMask = (1u << 19) - 1; // 524287
+  return encodedSymbol & rankMask;
+}
+
+/**
+ * Read the decoder state from `in`: the coding parameters, for REnc the
+ * source symbols and lexical table, then the target symbols and the Huffman
+ * trees for symbols, scores and (if present) alignment points.
+ *
+ * @param in Stream positioned at the start of the decoder section.
+ * @return Distance the stream position moved, in bytes.
+ */
+size_t PhraseDecoder::Load(std::FILE* in)
+{
+  size_t start = std::ftell(in);
+  size_t read = 0;
+
+  read += std::fread(&m_coding, sizeof(m_coding), 1, in);
+  read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in);
+  read += std::fread(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in);
+  read += std::fread(&m_maxRank, sizeof(m_maxRank), 1, in);
+  read += std::fread(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in);
+
+  if(m_coding == REnc) {
+    m_sourceSymbols.load(in);
+
+    // Initialised so a failed read cannot feed an indeterminate length to
+    // resize(); element 0 is only addressed when the vector is non-empty.
+    size_t size = 0;
+    read += std::fread(&size, sizeof(size_t), 1, in);
+    m_lexicalTableIndex.resize(size);
+    if(size)
+      read += std::fread(&m_lexicalTableIndex[0], sizeof(size_t), size, in);
+
+    size = 0;
+    read += std::fread(&size, sizeof(size_t), 1, in);
+    m_lexicalTable.resize(size);
+    if(size)
+      read += std::fread(&m_lexicalTable[0], sizeof(SrcTrg), size, in);
+  }
+
+  m_targetSymbols.load(in);
+
+  m_symbolTree = new CanonicalHuffman<unsigned>(in);
+
+  // Either one score tree per score component or a single shared tree.
+  read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in);
+  if(m_multipleScoreTrees) {
+    m_scoreTrees.resize(m_numScoreComponent);
+    for(size_t i = 0; i < m_numScoreComponent; i++)
+      m_scoreTrees[i] = new CanonicalHuffman<float>(in);
+  } else {
+    m_scoreTrees.resize(1);
+    m_scoreTrees[0] = new CanonicalHuffman<float>(in);
+  }
+
+  if(m_containsAlignmentInfo)
+    m_alignTree = new CanonicalHuffman<AlignPoint>(in);
+
+  size_t end = std::ftell(in);
+  return end - start;
+}
+
+// Build the hash key for a source phrase: the phrase followed by m_separator.
+std::string PhraseDecoder::MakeSourceKey(std::string &source)
+{
+  std::string key(source);
+  key += m_separator;
+  return key;
+}
+
+// Look up `sourcePhrase` in the phrase table and decode its target phrases.
+// With PREnc coding the decoding cache is consulted first; a cached but
+// incomplete collection is copied and decoding resumes at the cached bit
+// offset. Returns an empty pointer when the source phrase is not in the hash.
+TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(
+  const ManagerBase &mgr,
+  const Phrase<Word> &sourcePhrase,
+  bool topLevel,
+  bool eval)
+{
+  // Not using TargetPhraseCollection avoiding "new" operator
+  // which can introduce heavy locking with multiple threads
+  TargetPhraseVectorPtr tpv(new TargetPhraseVector());
+  size_t bitsLeft = 0;
+
+  if(m_coding == PREnc) {
+    std::pair<TargetPhraseVectorPtr, size_t> cached
+      = m_decodingCache.Retrieve(sourcePhrase);
+
+    if(cached.first != NULL) {
+      // Only a top-level request with unread bits needs further decoding.
+      const bool needsCompletion = topLevel && cached.second != 0;
+      if(!needsCompletion)
+        return cached.first;
+
+      bitsLeft = cached.second;
+      tpv->resize(cached.first->size());
+      std::copy(cached.first->begin(), cached.first->end(), tpv->begin());
+    }
+  }
+
+  // Retrieve source phrase identifier
+  std::string sourcePhraseString = sourcePhrase.GetString(*m_input);
+  size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)];
+
+  // The hash returns its own size for unknown phrases.
+  if(sourcePhraseId == m_phraseDictionary.m_hash.GetSize())
+    return TargetPhraseVectorPtr();
+
+  // Retrieve compressed and encoded target phrase collection
+  std::string encodedPhraseCollection;
+  if(m_phraseDictionary.m_inMemory)
+    encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId].str();
+  else
+    encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId].str();
+
+  BitWrapper<> encodedBitStream(encodedPhraseCollection);
+  if(m_coding == PREnc && bitsLeft)
+    encodedBitStream.SeekFromEnd(bitsLeft);
+
+  // Decompress and decode target phrase collection
+  return DecodeCollection(mgr, tpv, encodedBitStream, sourcePhrase, topLevel, eval);
+}
+
+TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
+ const ManagerBase &mgr,
+ TargetPhraseVectorPtr tpv,
+ BitWrapper<> &encodedBitStream,
+ const Phrase<Word> &sourcePhrase,
+ bool topLevel,
+ bool eval)
+{ // Decodes phrases from encodedBitStream, appending them to tpv; returns tpv, or an empty pointer when a consistency check fails.
+ const System &system = mgr.system;
+ FactorCollection &vocab = system.GetVocab();
+ // `extending` is true when tpv already holds phrases on entry.
+ bool extending = tpv->size();
+ size_t bitsLeft = encodedBitStream.TellFromEnd();
+ // REnc only: map each source word to its source-symbol id.
+ std::vector<int> sourceWords;
+ if(m_coding == REnc) {
+ for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
+ std::string sourceWord
+ = sourcePhrase[i].GetString(*m_input);
+ unsigned idx = GetSourceSymbolId(sourceWord);
+ sourceWords.push_back(idx);
+ }
+ }
+ // Symbol 0 ends a phrase's symbol sequence; the pair built from (-1, -1) ends its alignment points.
+ unsigned phraseStopSymbol = 0;
+ AlignPoint alignStopSymbol(-1, -1);
+ // Per-phrase buffers, cleared whenever a new phrase is started.
+ std::vector<float> scores;
+ std::set<AlignPointSizeT> alignment;
+ // State order: New -> Symbol -> Score -> (Alignment ->) Add -> New ...
+ enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
+
+ size_t srcSize = sourcePhrase.GetSize();
+
+ TPCompact* targetPhrase = NULL;
+ while(encodedBitStream.TellFromEnd()) {
+ // Start a fresh, empty phrase at the back of tpv.
+ if(state == New) {
+ // Creating new TargetPhrase on the heap
+ tpv->push_back(TPCompact());
+ targetPhrase = &tpv->back();
+
+ alignment.clear();
+ scores.clear();
+
+ state = Symbol;
+ }
+ // Read one symbol and append the word(s) it stands for.
+ if(state == Symbol) {
+ unsigned symbol = m_symbolTree->Read(encodedBitStream);
+ if(symbol == phraseStopSymbol) {
+ state = Score;
+ } else {
+ if(m_coding == REnc) {
+ std::string wordString;
+ size_t type = GetREncType(symbol);
+ // type 1: plain target symbol; type 2: rank plus explicit source position; type 3: rank, source position = current target length.
+ if(type == 1) {
+ unsigned decodedSymbol = DecodeREncSymbol1(symbol);
+ wordString = GetTargetSymbol(decodedSymbol);
+ } else if (type == 2) {
+ size_t rank = DecodeREncSymbol2Rank(symbol);
+ size_t srcPos = DecodeREncSymbol2Position(symbol);
+
+ if(srcPos >= sourceWords.size())
+ return TargetPhraseVectorPtr();
+
+ wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
+ if(m_phraseDictionary.m_useAlignmentInfo) {
+ size_t trgPos = targetPhrase->words.size();
+ alignment.insert(AlignPoint(srcPos, trgPos));
+ }
+ } else if(type == 3) {
+ size_t rank = DecodeREncSymbol3(symbol);
+ size_t srcPos = targetPhrase->words.size();
+
+ if(srcPos >= sourceWords.size())
+ return TargetPhraseVectorPtr();
+
+ wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
+ if(m_phraseDictionary.m_useAlignmentInfo) {
+ size_t trgPos = srcPos;
+ alignment.insert(AlignPoint(srcPos, trgPos));
+ }
+ }
+
+ Word word;
+ word.CreateFromString(vocab, system, wordString);
+ targetPhrase->words.push_back(word);
+ } else if(m_coding == PREnc) {
+ // if the symbol is just a word
+ if(GetPREncType(symbol) == 1) {
+ unsigned decodedSymbol = DecodePREncSymbol1(symbol);
+
+ Word word;
+ word.CreateFromString(vocab, system, GetTargetSymbol(decodedSymbol));
+ targetPhrase->words.push_back(word);
+ }
+ // if the symbol is a subphrase pointer
+ else {
+ int left = DecodePREncSymbol2Left(symbol);
+ int right = DecodePREncSymbol2Right(symbol);
+ unsigned rank = DecodePREncSymbol2Rank(symbol);
+ // Source span covered by the referenced sub-phrase.
+ int srcStart = left + targetPhrase->words.size();
+ int srcEnd = srcSize - right - 1;
+
+ // false positive consistency check
+ if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
+ return TargetPhraseVectorPtr();
+
+ // false positive consistency check
+ if(m_maxRank && rank > m_maxRank)
+ return TargetPhraseVectorPtr();
+
+ // set subphrase by default to itself
+ TargetPhraseVectorPtr subTpv = tpv;
+
+ // if range smaller than source phrase retrieve subphrase
+ if(unsigned(srcEnd - srcStart + 1) != srcSize) {
+ SubPhrase<Word> subPhrase = sourcePhrase.GetSubPhrase(srcStart, srcEnd - srcStart + 1);
+ subTpv = CreateTargetPhraseCollection(mgr, subPhrase, false);
+ } else {
+ // false positive consistency check
+ if(rank >= tpv->size()-1)
+ return TargetPhraseVectorPtr();
+ }
+
+ // false positive consistency check
+ if(subTpv != NULL && rank < subTpv->size()) {
+ // insert the subphrase into the main target phrase
+ TPCompact& subTp = subTpv->at(rank);
+ if(m_phraseDictionary.m_useAlignmentInfo) {
+ // reconstruct the alignment data based on the alignment of the subphrase
+ for(std::set<AlignPointSizeT>::const_iterator it = subTp.alignment.begin();
+ it != subTp.alignment.end(); it++) {
+ alignment.insert(AlignPointSizeT(srcStart + it->first,
+ targetPhrase->words.size() + it->second));
+ }
+ }
+
+ std::copy(subTp.words.begin(), subTp.words.end(), std::back_inserter(targetPhrase->words));
+ } else
+ return TargetPhraseVectorPtr();
+ }
+ } else {
+ Word word;
+ word.CreateFromString(vocab, system, GetTargetSymbol(symbol));
+ targetPhrase->words.push_back(word);
+ }
+ }
+ } else if(state == Score) {
+ size_t idx = m_multipleScoreTrees ? scores.size() : 0;
+ float score = m_scoreTrees[idx]->Read(encodedBitStream);
+ scores.push_back(score);
+ // All score components read: continue with alignment points or finish the phrase.
+ if(scores.size() == m_numScoreComponent) {
+ targetPhrase->scores = scores;
+
+ if(m_containsAlignmentInfo)
+ state = Alignment;
+ else
+ state = Add;
+ }
+ } else if(state == Alignment) {
+ AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
+ if(alignPoint == alignStopSymbol) {
+ state = Add;
+ } else {
+ if(m_phraseDictionary.m_useAlignmentInfo)
+ alignment.insert(AlignPointSizeT(alignPoint));
+ }
+ }
+ // Phrase complete: validate and store its alignment, then decide whether to keep decoding.
+ if(state == Add) {
+ if(m_phraseDictionary.m_useAlignmentInfo) {
+ size_t sourceSize = sourcePhrase.GetSize();
+ size_t targetSize = targetPhrase->words.size();
+ for(std::set<AlignPointSizeT>::iterator it = alignment.begin(); it != alignment.end(); it++) {
+ if(it->first >= sourceSize || it->second >= targetSize)
+ return TargetPhraseVectorPtr();
+ }
+ targetPhrase->alignment = alignment;
+ }
+
+ if(m_coding == PREnc) {
+ if(!m_maxRank || tpv->size() <= m_maxRank)
+ bitsLeft = encodedBitStream.TellFromEnd();
+ // Non-top-level requests stop once m_maxRank phrases are decoded.
+ if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
+ break;
+ }
+ // Stop when at most 8 bits remain in the stream.
+ if(encodedBitStream.TellFromEnd() <= 8)
+ break;
+
+ state = New;
+ }
+ }
+ // PREnc: cache a freshly decoded collection together with the number of unread bits.
+ if(m_coding == PREnc && !extending) {
+ bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
+ m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
+ }
+
+ return tpv;
+}
+
+void PhraseDecoder::PruneCache() // Forwards to m_decodingCache.Prune().
+{
+ m_decodingCache.Prune();
+}
+
+}
diff --git a/moses2/TranslationModel/CompactPT/PhraseDecoder.h b/moses2/TranslationModel/CompactPT/PhraseDecoder.h
new file mode 100644
index 000000000..79faa38a6
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/PhraseDecoder.h
@@ -0,0 +1,142 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include <sstream>
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <sys/stat.h>
+
+#include "PhraseTableCompact.h"
+#include "StringVector.h"
+#include "CanonicalHuffman.h"
+#include "TargetPhraseCollectionCache.h"
+
+#include "../../Phrase.h"
+#include "../../ManagerBase.h"
+
+namespace Moses2
+{
+
+class PhraseTableCompact;
+
+/**
+ * Decoder for the compressed target phrase collections of a
+ * PhraseTableCompact. Load() reads the symbol tables and Huffman trees;
+ * CreateTargetPhraseCollection() looks a source phrase up and decodes its
+ * target phrases.
+ */
+class PhraseDecoder
+{
+protected:
+
+  friend class PhraseTableCompact;
+
+  typedef std::pair<unsigned char, unsigned char> AlignPoint;
+  typedef std::pair<unsigned, unsigned> SrcTrg;
+
+  // Symbol coding of the phrase table, read from the file by Load().
+  enum Coding { None, REnc, PREnc } m_coding;
+
+  size_t m_numScoreComponent;
+  bool m_containsAlignmentInfo;
+  size_t m_maxRank;
+  size_t m_maxPhraseLength;
+
+  // Cache in front of m_sourceSymbols lookups (see GetSourceSymbolId).
+  boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+  StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
+  StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
+
+  // REnc lexical table: m_lexicalTableIndex[src] is the first entry of
+  // source symbol `src` in m_lexicalTable.
+  std::vector<size_t> m_lexicalTableIndex;
+  std::vector<SrcTrg> m_lexicalTable;
+
+  // Huffman trees, allocated by Load() and deleted by the destructor.
+  CanonicalHuffman<unsigned>* m_symbolTree;
+
+  bool m_multipleScoreTrees;
+  std::vector<CanonicalHuffman<float>*> m_scoreTrees;
+
+  CanonicalHuffman<AlignPoint>* m_alignTree;
+
+  TargetPhraseCollectionCache m_decodingCache;
+
+  PhraseTableCompact& m_phraseDictionary;
+
+  // ***********************************************
+
+  const std::vector<FactorType>* m_input;
+  const std::vector<FactorType>* m_output;
+
+  std::string m_separator;
+
+  // ***********************************************
+
+  unsigned GetSourceSymbolId(std::string& s);
+  std::string GetTargetSymbol(unsigned id) const;
+
+  size_t GetREncType(unsigned encodedSymbol);
+  size_t GetPREncType(unsigned encodedSymbol);
+
+  unsigned GetTranslation(unsigned srcIdx, size_t rank);
+
+  size_t GetMaxSourcePhraseLength();
+
+  unsigned DecodeREncSymbol1(unsigned encodedSymbol);
+  unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
+  unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
+  unsigned DecodeREncSymbol3(unsigned encodedSymbol);
+
+  unsigned DecodePREncSymbol1(unsigned encodedSymbol);
+  int DecodePREncSymbol2Left(unsigned encodedSymbol);
+  int DecodePREncSymbol2Right(unsigned encodedSymbol);
+  unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
+
+  std::string MakeSourceKey(std::string &);
+
+public:
+
+  PhraseDecoder(
+    PhraseTableCompact &phraseDictionary,
+    const std::vector<FactorType>* input,
+    const std::vector<FactorType>* output,
+    size_t numScoreComponent
+  );
+
+  ~PhraseDecoder();
+
+  // Reads the decoder state from `in`; returns the number of bytes consumed.
+  size_t Load(std::FILE* in);
+
+  TargetPhraseVectorPtr CreateTargetPhraseCollection(
+    const ManagerBase &mgr,
+    const Phrase<Word> &sourcePhrase,
+    bool topLevel = false,
+    bool eval = true);
+
+  TargetPhraseVectorPtr DecodeCollection(
+    const ManagerBase &mgr,
+    TargetPhraseVectorPtr tpv,
+    BitWrapper<> &encodedBitStream,
+    const Phrase<Word> &sourcePhrase,
+    bool topLevel,
+    bool eval);
+
+  void PruneCache();
+
+private:
+  // The trees are owned through raw pointers that the destructor deletes, so
+  // a copy would delete them twice. Declared but intentionally not defined.
+  PhraseDecoder(const PhraseDecoder&);
+  PhraseDecoder& operator=(const PhraseDecoder&);
+};
+
+}
+
diff --git a/moses2/TranslationModel/CompactPT/PhraseTableCompact.cpp b/moses2/TranslationModel/CompactPT/PhraseTableCompact.cpp
new file mode 100644
index 000000000..49244df1b
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/PhraseTableCompact.cpp
@@ -0,0 +1,222 @@
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/thread/tss.hpp>
+#include "PhraseTableCompact.h"
+#include "PhraseDecoder.h"
+#include "../../PhraseBased/InputPath.h"
+#include "../../PhraseBased/Manager.h"
+#include "../../PhraseBased/TargetPhrases.h"
+#include "../../PhraseBased/TargetPhraseImpl.h"
+#include "../../PhraseBased/Sentence.h"
+
+using namespace std;
+using namespace boost::algorithm;
+
+namespace Moses2
+{
+bool PhraseTableCompact::s_inMemoryByDefault = false; // initial m_inMemory of every new table; false makes Load() memory-map the target phrases
+
+// Builds an unloaded compact phrase table from its configuration line.
+// The decoder pointer stays null until Load() creates the decoder.
+// ReadParameters() runs last, once every member has its default value.
+PhraseTableCompact::PhraseTableCompact(size_t startInd, const std::string &line) :
+  PhraseTable(startInd, line),
+  m_inMemory(s_inMemoryByDefault),
+  m_useAlignmentInfo(true),
+  m_hash(10, 16),
+  m_phraseDecoder(NULL)
+{
+  ReadParameters();
+}
+
+// Releases the decoder that Load() allocates with new. The constructor
+// initialises m_phraseDecoder to null and deleting a null pointer is a
+// no-op, so tables that were never loaded are handled as well.
+PhraseTableCompact::~PhraseTableCompact()
+{
+  delete m_phraseDecoder;
+}
+
+// Opens the ".minphr" file of this table and loads, in file order, the
+// source phrase hash index, the phrase decoder's tables and the target
+// phrase collections.
+//
+// ".minphr" is appended to m_path when the path does not already end with
+// it. Target phrase collections are read into memory when m_inMemory is
+// set; otherwise they are memory-mapped from the file.
+//
+// Throws std::runtime_error when the file does not exist or cannot be
+// opened, and raises through UTIL_THROW_IF2 when any of the three sections
+// reports a size of zero.
+void PhraseTableCompact::Load(System &system)
+{
+  std::string tFilePath = m_path;
+
+  std::string suffix = ".minphr";
+  if (!ends_with(tFilePath, suffix)) tFilePath += suffix;
+  if (!FileExists(tFilePath))
+    throw runtime_error("Error: File " + tFilePath + " does not exist.");
+
+  m_phraseDecoder
+    = new PhraseDecoder(*this, &m_input, &m_output, GetNumScores());
+
+  // The file holds binary data, so open it in binary mode; text mode would
+  // corrupt reads on platforms that translate line endings.
+  std::FILE* pFile = std::fopen(tFilePath.c_str(), "rb");
+
+  // An existing file can still fail to open (e.g. missing permissions).
+  // Without this check the loaders below would read from a null stream.
+  if (pFile == NULL)
+    throw runtime_error("Error: File " + tFilePath + " could not be opened.");
+
+  // NOTE(review): pFile is intentionally left open, as before. The loaders
+  // receive the stream and may keep using it after this function returns;
+  // confirm against BlockHashIndex/MmapAllocator before adding an fclose.
+
+  // Source phrase index: always loaded into memory. (Keeping the index on
+  // disk via m_hash.LoadIndex(pFile) is currently disabled.)
+  size_t indexSize = m_hash.Load(pFile);
+
+  size_t coderSize = m_phraseDecoder->Load(pFile);
+
+  size_t phraseSize;
+  if (m_inMemory) {
+    // Load target phrase collections into memory
+    phraseSize = m_targetPhrasesMemory.load(pFile, false);
+  }
+  else {
+    // Keep target phrase collections on disk
+    phraseSize = m_targetPhrasesMapped.load(pFile, true);
+  }
+
+  UTIL_THROW_IF2(indexSize == 0 || coderSize == 0 || phraseSize == 0,
+                 "Not successfully loaded");
+}
+
+// Applies one key/value pair from the feature's configuration line.
+// "blah" is a placeholder key that is accepted and ignored; every other
+// key is delegated to the PhraseTable base class.
+void PhraseTableCompact::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key != "blah") {
+    PhraseTable::SetParameter(key, value);
+  }
+}
+
+// Asks the phrase decoder to prune its cache.
+// m_phraseDecoder is dereferenced unconditionally, so Load() must have
+// created the decoder before this is called.
+void PhraseTableCompact::CleanUpAfterSentenceProcessing() const
+{
+  // The sentence-cache handling that surrounded this call is disabled
+  // (it was commented out); pruning the decoder cache is all that is left.
+  m_phraseDecoder->PruneCache();
+}
+
+
+// pb
+// Looks up target phrases for every contiguous span of the input sentence
+// and attaches the result (possibly NULL) to the span's InputPath.
+// Spans are visited shortest first; the second matrix index is the span's
+// end position minus its start position.
+void PhraseTableCompact::Lookup(const Manager &mgr, InputPathsBase &inputPaths) const
+{
+  const size_t numWords = static_cast<const Sentence&>(mgr.GetInput()).GetSize();
+  InputPaths &paths = static_cast<InputPaths&>(inputPaths);
+
+  for (size_t extra = 0; extra < numWords; ++extra) {
+    // Stop as soon as the span would run past the end of the sentence.
+    for (size_t startPos = 0; startPos + extra < numWords; ++startPos) {
+      InputPath *path = paths.GetMatrix().GetValue(startPos, extra);
+      TargetPhrases *found = Lookup(mgr, mgr.GetPool(), *path);
+      path->AddTargetPhrases(*this, found);
+    }
+  }
+}
+
+// Retrieves the target phrases for a single input path.
+//
+// Returns NULL when the source phrase is longer than the longest source
+// phrase known to the decoder, or when the decoder yields no collection or
+// an empty one. Otherwise returns a pool-allocated TargetPhrases holding one
+// TargetPhraseImpl per decoded entry, sorted and pruned to m_tableLimit and
+// then passed through EvaluateAfterTablePruning.
+TargetPhrases *PhraseTableCompact::Lookup(const Manager &mgr, MemPool &pool,
+    InputPath &inputPath) const
+{
+  TargetPhrases *ret = NULL;
+
+  const Phrase<Word> &sourcePhrase = inputPath.subPhrase;
+  //cerr << "sourcePhrase=" << sourcePhrase.Debug(mgr.system) << endl;
+
+  // There is no such source phrase if the source phrase is longer than the
+  // longest source phrase observed during compilation
+  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
+    return ret;
+
+  // Retrieve target phrase collection from phrase table
+  TargetPhraseVectorPtr decodedPhraseColl
+    = m_phraseDecoder->CreateTargetPhraseCollection(mgr, sourcePhrase, true, true);
+
+  if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
+    // The decoded collection is only read from here on; the unused deep
+    // copy that used to be made of it has been removed.
+    ret = new (pool.Allocate<TargetPhrases>()) TargetPhrases(pool, decodedPhraseColl->size());
+
+    for (size_t i = 0; i < decodedPhraseColl->size(); ++i) {
+      const TPCompact &tpCompact = decodedPhraseColl->at(i);
+      const TargetPhraseImpl *tp = CreateTargetPhrase(mgr, tpCompact, sourcePhrase);
+
+      ret->AddTargetPhrase(*tp);
+    }
+
+    ret->SortAndPrune(m_tableLimit);
+    mgr.system.featureFunctions.EvaluateAfterTablePruning(pool, *ret, sourcePhrase);
+
+    //cerr << "RET2=" << ret->Debug(mgr.system) << endl;
+    // Caching the phrase pairs for clean-up or retrieval with PREnc is
+    // currently disabled.
+  }
+
+  return ret;
+
+}
+
+// Converts one decoded compact entry into a TargetPhraseImpl.
+//
+// Words, scores and alignment are copied from tpCompact, then the phrase is
+// scored with EvaluateInIsolation against sourcePhrase. The returned object
+// lives in the manager's memory pool.
+const TargetPhraseImpl *PhraseTableCompact::CreateTargetPhrase(
+  const Manager &mgr,
+  const TPCompact &tpCompact,
+  const Phrase<Word> &sourcePhrase) const
+{
+  MemPool &pool = mgr.GetPool();
+
+  size_t size = tpCompact.words.size();
+
+  // Construct the phrase inside the pool, like the TargetPhrases container
+  // that receives it. A plain `new` here produced a heap object that nothing
+  // in this file ever deleted.
+  // NOTE(review): assumes TargetPhraseImpl has no class-specific operator
+  // new and that no caller deletes the returned pointer -- confirm.
+  TargetPhraseImpl *ret = new (pool.Allocate<TargetPhraseImpl>())
+      TargetPhraseImpl(pool, *this, mgr.system, size);
+
+  // words
+  for (size_t i = 0; i < size; ++i) {
+    const Word &compactWord = tpCompact.words[i];
+    Word &tpWord = (*ret)[i];
+    tpWord = compactWord;
+  }
+
+  // scores
+  Scores &scores = ret->GetScores();
+  scores.Assign(mgr.system, *this, tpCompact.scores);
+
+  // align
+  ret->SetAlignTerm(tpCompact.alignment);
+
+  // score
+  mgr.system.featureFunctions.EvaluateInIsolation(pool, mgr.system, sourcePhrase, *ret);
+
+  //cerr << "ret=" << ret->Debug(mgr.system) << endl;
+  return ret;
+}
+
+
+// scfg
+// The three SCFG (chart decoding) hooks below are not supported by the
+// compact phrase table: each one throws "Not implemented" and ignores its
+// arguments.
+void PhraseTableCompact::InitActiveChart(MemPool & /*pool*/,
+    const SCFG::Manager & /*mgr*/, SCFG::InputPath & /*path*/) const
+{
+  UTIL_THROW2("Not implemented");
+}
+
+void PhraseTableCompact::Lookup(MemPool & /*pool*/,
+    const SCFG::Manager & /*mgr*/, size_t /*maxChartSpan*/,
+    const SCFG::Stacks & /*stacks*/, SCFG::InputPath & /*path*/) const
+{
+  UTIL_THROW2("Not implemented");
+}
+
+void PhraseTableCompact::LookupGivenNode(MemPool & /*pool*/,
+    const SCFG::Manager & /*mgr*/,
+    const SCFG::ActiveChartEntry & /*prevEntry*/,
+    const SCFG::Word & /*wordSought*/,
+    const Moses2::Hypotheses * /*hypos*/,
+    const Moses2::Range & /*subPhraseRange*/,
+    SCFG::InputPath & /*outPath*/) const
+{
+  UTIL_THROW2("Not implemented");
+}
+
+}
diff --git a/moses2/TranslationModel/CompactPT/PhraseTableCompact.h b/moses2/TranslationModel/CompactPT/PhraseTableCompact.h
new file mode 100644
index 000000000..84ea7e4b2
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/PhraseTableCompact.h
@@ -0,0 +1,68 @@
+#pragma once
+#include "../PhraseTable.h"
+#include "BlockHashIndex.h"
+
+namespace Moses2
+{
+class PhraseDecoder;
+struct TPCompact; // defined as a struct in TargetPhraseCollectionCache.h; the class-key is kept consistent to avoid mismatched-tag warnings
+
+// Phrase table that serves translations out of a compact ".minphr" file.
+// m_hash holds the source phrase index, m_phraseDecoder decodes the target
+// phrase collections, and the collections themselves sit in one of the two
+// StringVector stores (memory-mapped or fully in memory).
+class PhraseTableCompact : public PhraseTable
+{
+public:
+  PhraseTableCompact(size_t startInd, const std::string &line);
+  virtual ~PhraseTableCompact();
+
+  // Opens the phrase table file and loads index, decoder and target phrases.
+  void Load(System &system);
+
+  virtual void SetParameter(const std::string& key, const std::string& value);
+
+  // Prunes the phrase decoder's cache.
+  virtual void CleanUpAfterSentenceProcessing() const;
+
+  // Phrase-based lookup for a single input path; the result may be NULL.
+  virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+                                InputPath &inputPath) const;
+
+  // scfg -- throws "Not implemented"
+  virtual void InitActiveChart(MemPool &pool, const SCFG::Manager &mgr,
+                               SCFG::InputPath &path) const;
+
+  // Phrase-based lookup for every span of the input sentence.
+  virtual void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const;
+
+  // scfg -- throws "Not implemented"
+  virtual void Lookup(MemPool &pool, const SCFG::Manager &mgr,
+                      size_t maxChartSpan, const SCFG::Stacks &stacks,
+                      SCFG::InputPath &path) const;
+
+protected:
+  static bool s_inMemoryByDefault;  // initial value of m_inMemory
+  bool m_inMemory;                  // selects m_targetPhrasesMemory over m_targetPhrasesMapped in Load()
+  bool m_useAlignmentInfo;
+
+  BlockHashIndex m_hash;            // source phrase index
+
+  // Target phrase collections; Load() fills exactly one of the two.
+  StringVector<unsigned char, size_t, MmapAllocator> m_targetPhrasesMapped;
+  StringVector<unsigned char, size_t, std::allocator> m_targetPhrasesMemory;
+
+  friend class PhraseDecoder;
+  PhraseDecoder* m_phraseDecoder;   // null until Load() has run
+
+  // Turns one decoded entry into a scored TargetPhraseImpl.
+  const TargetPhraseImpl *CreateTargetPhrase(const Manager &mgr,
+      const TPCompact &tpCompact, const Phrase<Word> &sourcePhrase) const;
+
+  // SCFG -- throws "Not implemented"
+  virtual void LookupGivenNode(MemPool &pool, const SCFG::Manager &mgr,
+                               const SCFG::ActiveChartEntry &prevEntry,
+                               const SCFG::Word &wordSought,
+                               const Moses2::Hypotheses *hypos,
+                               const Moses2::Range &subPhraseRange,
+                               SCFG::InputPath &outPath) const;
+
+};
+
+}
diff --git a/moses2/TranslationModel/CompactPT/StringVector.h b/moses2/TranslationModel/CompactPT/StringVector.h
new file mode 100644
index 000000000..87d6388bf
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/StringVector.h
@@ -0,0 +1,662 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_StringVector_h
+#define moses_StringVector_h
+
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <iterator>
+#include <cstdio>
+#include <cassert>
+
+#include <boost/iterator/iterator_facade.hpp>
+
+#include "ThrowingFwrite.h"
+#include "MonotonicVector.h"
+#include "MmapAllocator.h"
+
+namespace Moses2
+{
+
+// ********** ValueIteratorRange **********
+
+// Non-owning view of the values between two iterators. A range can be
+// compared with strings (or anything offering begin()/end()) and copied
+// into a std::string.
+template<typename ValueIteratorT>
+class ValueIteratorRange
+{
+private:
+  ValueIteratorT m_begin;
+  ValueIteratorT m_end;
+
+public:
+  ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);
+
+  const ValueIteratorT& begin() const;
+  const ValueIteratorT& end() const;
+
+  // Copy of the range as a std::string.
+  const std::string str() const;
+
+  // Implicit conversion to a string copy of the range. Const-qualified so
+  // that it also works on const ranges; it only calls the const str().
+  operator const std::string() const
+  {
+    return str();
+  }
+
+  // Number of values in the range. Const-qualified because it does not
+  // modify the range and is needed on const ranges as well.
+  size_t size() const
+  {
+    return std::distance(m_begin, m_end);
+  }
+
+  // Equality: same length and same values.
+  template<typename StringT>
+  bool operator==(const StringT& o) const;
+  bool operator==(const char* c) const;
+
+  // Lexicographical ordering on the values.
+  template<typename StringT>
+  bool operator<(const StringT& o) const;
+  bool operator<(const char* c) const;
+};
+
+// ********** StringVector **********
+
+// Stores many variable-length strings back to back in one value array
+// (m_charArray) plus a vector of start offsets (m_positions), one offset
+// per string. Strings are handed out as ranges of const ValueT* into the
+// value array.
+//
+//   ValueT    - element type of the stored strings
+//   PosT      - integer type used for offsets and string indices
+//   Allocator - allocator template of the value array; with MmapAllocator
+//               the array can be mapped directly onto a region of a file
+//
+// The object owns m_charArray through a raw pointer and declares no copy
+// constructor or assignment operator, so a copy would make two objects
+// delete the same array. Pass StringVectors by reference.
+template<typename ValueT = unsigned char, typename PosT = unsigned int,
+         template<typename > class Allocator = std::allocator>
+class StringVector
+{
+protected:
+  bool m_sorted;        // cleared by push_back when a string is not greater than its predecessor; read back by load()
+  bool m_memoryMapped;  // mapping mode requested by the most recent load()
+
+  std::vector<ValueT, Allocator<ValueT> >* m_charArray; // owned; null until allocated or loaded
+  MonotonicVector<PosT, unsigned int, 32> m_positions;  // start offset of every string
+
+  virtual const ValueT* value_ptr(PosT i) const;
+
+public:
+  typedef ValueIteratorRange<const ValueT *> range;
+
+  // ********** RangeIterator **********
+
+  // Random-access iterator whose dereference yields a `range` pointing
+  // into the value array.
+  class RangeIterator: public boost::iterator_facade<RangeIterator, range,
+    std::random_access_iterator_tag, range, PosT>
+  {
+
+  private:
+    PosT m_index;
+    StringVector<ValueT, PosT, Allocator>* m_container;
+
+  public:
+    RangeIterator();
+    RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index = 0);
+
+    PosT get_index();
+
+  private:
+    friend class boost::iterator_core_access;
+
+    range dereference() const;
+    bool equal(RangeIterator const& other) const;
+    void increment();
+    void decrement();
+    void advance(PosT n);
+
+    PosT distance_to(RangeIterator const& other) const;
+  };
+
+  // ********** StringIterator **********
+
+  // Random-access iterator whose dereference yields a std::string copy of
+  // the element.
+  class StringIterator: public boost::iterator_facade<StringIterator,
+    std::string, std::random_access_iterator_tag, const std::string, PosT>
+  {
+
+  private:
+    PosT m_index;
+    StringVector<ValueT, PosT, Allocator>* m_container;
+
+  public:
+    StringIterator();
+    StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index = 0);
+
+    PosT get_index();
+
+  private:
+    friend class boost::iterator_core_access;
+
+    const std::string dereference() const;
+    bool equal(StringIterator const& other) const;
+    void increment();
+    void decrement();
+    void advance(PosT n);
+    PosT distance_to(StringIterator const& other) const;
+  };
+
+  typedef RangeIterator iterator;
+  typedef StringIterator string_iterator;
+
+  // With allocate == false no value array exists until load() creates one.
+  StringVector(bool allocate = false);
+  StringVector(Allocator<ValueT>& alloc);
+
+  virtual ~StringVector()
+  {
+    delete m_charArray;
+  }
+
+  // Exchanges the contents of this object and c. Both objects need an
+  // allocated value array because the arrays are swapped through the
+  // pointers.
+  void swap(StringVector<ValueT, PosT, Allocator> &c)
+  {
+    m_positions.commit();
+    m_positions.swap(c.m_positions);
+    m_charArray->swap(*c.m_charArray);
+
+    std::swap(m_sorted, c.m_sorted);
+    // The mapping flag describes the value array, so it moves with it.
+    std::swap(m_memoryMapped, c.m_memoryMapped);
+  }
+
+  bool is_sorted() const;
+  PosT size() const;          // number of strings
+  virtual PosT size2() const; // number of values in the value array
+
+  template<class Iterator> Iterator begin() const;
+  template<class Iterator> Iterator end() const;
+
+  iterator begin() const;
+  iterator end() const;
+
+  PosT length(PosT i) const;
+  const ValueT* begin(PosT i) const;
+  const ValueT* end(PosT i) const;
+
+  // Removes all strings. Requires an allocated value array.
+  void clear()
+  {
+    m_charArray->clear();
+    m_sorted = true;
+    m_positions = MonotonicVector<PosT, unsigned int, 32>();
+  }
+
+  range at(PosT i) const;
+  range operator[](PosT i) const;
+  range back() const;
+
+  template<typename StringT>
+  void push_back(StringT s);
+  void push_back(const char* c);
+
+  template<typename StringT>
+  PosT find(StringT &s) const;
+  PosT find(const char* c) const;
+
+  // Reads a vector written by save() from the current position of `in`:
+  // the sorted flag, the offsets, then the value array. Returns the number
+  // of bytes accounted for. With memoryMapped == true the value array is
+  // mapped onto the file rather than read (MmapAllocator only).
+  virtual size_t load(std::FILE* in, bool memoryMapped = false)
+  {
+    size_t size = 0;
+    m_memoryMapped = memoryMapped;
+
+    size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
+    size += m_positions.load(in, false);
+
+    // loadCharArray() always installs a freshly allocated vector. Release
+    // the array held so far, otherwise it would leak when this object was
+    // constructed with an array or has been loaded before.
+    delete m_charArray;
+    m_charArray = 0;
+
+    size += loadCharArray(m_charArray, in, m_memoryMapped);
+    return size;
+  }
+
+  // Value array loader for std::allocator: reads the element count and then
+  // the elements into a new vector stored in c. Returns the bytes read.
+  size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >*& c,
+                       std::FILE* in, bool map = false)
+  {
+    // Can only be read into memory. Mapping not possible with std:allocator.
+    assert(map == false);
+
+    size_t byteSize = 0;
+
+    size_t valSize;
+    byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+    c = new std::vector<ValueT, std::allocator<ValueT> >(valSize, 0);
+    byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in)
+                * sizeof(ValueT);
+
+    return byteSize;
+  }
+
+  // Value array loader for MmapAllocator: reads the element count, then
+  // either reads the elements (map == false) or maps them from `in`
+  // (map == true). Returns the bytes read or mapped.
+  size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >*& c,
+                       std::FILE* in, bool map = false)
+  {
+    size_t byteSize = 0;
+
+    size_t valSize;
+    byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+    if (map == false) {
+      // Read data into temporary file (default constructor of MmapAllocator)
+      // and map memory onto temporary file. Can be resized.
+      c = new std::vector<ValueT, MmapAllocator<ValueT> >(valSize, 0);
+      byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in)
+                  * sizeof(ValueT);
+    }
+    else {
+      // Map it directly on specified region of file "in" starting at valPos
+      // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
+
+      size_t valPos = std::ftell(in);
+      Allocator<ValueT> alloc(in, valPos);
+      c = new std::vector<ValueT, Allocator<ValueT> >(alloc);
+      c->resize(valSize, 0);
+
+      byteSize += valSize * sizeof(ValueT);
+    }
+
+    return byteSize;
+  }
+
+  // Loads from the named file. Returns the number of bytes loaded, or 0
+  // when the file cannot be opened (nothing is modified in that case).
+  size_t load(std::string filename, bool memoryMapped = false)
+  {
+    // Binary mode: the file holds raw bytes, not text.
+    std::FILE* pFile = fopen(filename.c_str(), "rb");
+    if (!pFile) return 0;
+
+    size_t byteSize = load(pFile, memoryMapped);
+    fclose(pFile);
+    return byteSize;
+  }
+
+  // Writes the sorted flag, the offsets, the element count and the value
+  // array to `out`. Returns the number of bytes written.
+  size_t save(std::FILE* out)
+  {
+    size_t byteSize = 0;
+    byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
+
+    byteSize += m_positions.save(out);
+
+    size_t valSize = size2();
+    byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out)
+                * sizeof(size_t);
+    byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out)
+                * sizeof(ValueT);
+
+    return byteSize;
+  }
+
+  // Saves to the named file. Returns the number of bytes written, or 0
+  // when the file cannot be opened for writing.
+  size_t save(std::string filename)
+  {
+    // Binary mode: the file holds raw bytes, not text.
+    std::FILE* pFile = fopen(filename.c_str(), "wb");
+    if (!pFile) return 0;
+
+    size_t byteSize = save(pFile);
+    fclose(pFile);
+    return byteSize;
+  }
+
+};
+
+// ********** Implementation **********
+
+// ValueIteratorRange
+
+// Wraps the range delimited by the two iterators; nothing is copied.
+template<typename ValueIteratorT>
+ValueIteratorRange<ValueIteratorT>::ValueIteratorRange(ValueIteratorT begin,
+    ValueIteratorT end)
+  : m_begin(begin), m_end(end)
+{
+}
+
+// Iterator to the first value of the range.
+template<typename ValueIteratorT>
+const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::begin() const
+{
+  return m_begin;
+}
+
+// Iterator one past the last value of the range.
+template<typename ValueIteratorT>
+const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::end() const
+{
+  return m_end;
+}
+
+// Copies the range into a std::string, one value per character.
+template<typename ValueIteratorT>
+const std::string ValueIteratorRange<ValueIteratorT>::str() const
+{
+  std::string text;
+  ValueIteratorT cursor = m_begin;
+  while (cursor != m_end) {
+    text.push_back(*cursor);
+    ++cursor;
+  }
+  return text;
+}
+
+// Equal when both sides have the same length and the same values.
+template<typename ValueIteratorT>
+template<typename StringT>
+bool ValueIteratorRange<ValueIteratorT>::operator==(const StringT& o) const
+{
+  if (std::distance(m_begin, m_end) != std::distance(o.begin(), o.end()))
+    return false;
+  return std::equal(m_begin, m_end, o.begin());
+}
+
+// C-string overload: compares against a std::string built from c.
+template<typename ValueIteratorT>
+bool ValueIteratorRange<ValueIteratorT>::operator==(const char* c) const
+{
+  const std::string other(c);
+  return *this == other;
+}
+
+// Lexicographical "less than", comparing values with std::less.
+template<typename ValueIteratorT>
+template<typename StringT>
+bool ValueIteratorRange<ValueIteratorT>::operator<(const StringT &s2) const
+{
+  typedef typename std::iterator_traits<ValueIteratorT>::value_type value_type;
+  return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
+                                      std::less<value_type>());
+}
+
+// C-string overload: compares against a std::string built from c.
+template<typename ValueIteratorT>
+bool ValueIteratorRange<ValueIteratorT>::operator<(const char* c) const
+{
+  const std::string other(c);
+  return *this < other;
+}
+
+// string < range: lexicographical comparison with the string on the left.
+template<typename StringT, typename ValueIteratorT>
+bool operator<(const StringT &s1, const ValueIteratorRange<ValueIteratorT> &s2)
+{
+  typedef typename std::iterator_traits<ValueIteratorT>::value_type value_type;
+  return std::lexicographical_compare(s1.begin(), s1.end(),
+                                      s2.begin(), s2.end(),
+                                      std::less<value_type>());
+}
+
+// C-string < range: the left side spans c up to its terminating zero.
+template<typename ValueIteratorT>
+bool operator<(const char* c, const ValueIteratorRange<ValueIteratorT> &s2)
+{
+  typedef typename std::iterator_traits<ValueIteratorT>::value_type value_type;
+  const char* cEnd = c + std::char_traits<char>::length(c);
+  return std::lexicographical_compare(c, cEnd, s2.begin(), s2.end(),
+                                      std::less<value_type>());
+}
+
+// Streams every value of the range to os, one after the other.
+template<typename OStream, typename ValueIteratorT>
+OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
+{
+  for (ValueIteratorT it = cr.begin(); it != cr.end(); ++it)
+    os << *it;
+  return os;
+}
+
+// StringVector
+
+// Creates an empty, sorted, non-mapped vector. The value array is only
+// allocated when `allocate` is true; otherwise m_charArray stays null.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+StringVector<ValueT, PosT, Allocator>::StringVector(bool allocate)
+  : m_sorted(true), m_memoryMapped(false), m_charArray(0)
+{
+  if (allocate)
+    m_charArray = new std::vector<ValueT, Allocator<ValueT> >();
+}
+
+// Creates an empty, sorted, non-mapped vector whose value array uses the
+// given allocator instance.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> &alloc)
+  : m_sorted(true),
+    m_memoryMapped(false),
+    m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc))
+{
+}
+
+// Appends s. The vector stops counting as sorted as soon as a string is
+// appended that is not greater than the current last string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+template<typename StringT>
+void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
+{
+  if (is_sorted() && size()) {
+    if (!(back() < s))
+      m_sorted = false;
+  }
+
+  // The new string starts where the value array currently ends.
+  m_positions.push_back(size2());
+  std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
+}
+
+// C-string overload: appends the characters of c up to its terminator.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::push_back(const char* c)
+{
+  push_back(std::string(c));
+}
+
+// Iterator of the requested type positioned at the first string.
+// The iterators store a non-const container pointer, hence the const_cast.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+template<typename Iterator>
+Iterator StringVector<ValueT, PosT, Allocator>::begin() const
+{
+  return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
+}
+
+// Iterator of the requested type positioned one past the last string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+template<typename Iterator>
+Iterator StringVector<ValueT, PosT, Allocator>::end() const
+{
+  return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this),
+                  size());
+}
+
+// Default iterator (RangeIterator) at the first string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT,
+         PosT, Allocator>::begin() const
+{
+  return begin<iterator>();
+}
+
+// Default iterator (RangeIterator) one past the last string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT,
+         PosT, Allocator>::end() const
+{
+  return end<iterator>();
+}
+
+// Value of the sorted flag (see push_back and load).
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+bool StringVector<ValueT, PosT, Allocator>::is_sorted() const
+{
+  return m_sorted;
+}
+
+// Number of stored strings, i.e. the number of recorded start offsets.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::size() const
+{
+  return m_positions.size();
+}
+
+// Total number of values in the value array, converted to PosT.
+// Requires an allocated value array.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::size2() const
+{
+  return m_charArray->size();
+}
+
+// Range covering string i. The index is not bounds-checked.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT,
+         Allocator>::at(PosT i) const
+{
+  const ValueT* first = begin(i);
+  const ValueT* last = end(i);
+  return range(first, last);
+}
+
+// Same as at(i).
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT,
+         Allocator>::operator[](PosT i) const
+{
+  return at(i);
+}
+
+// Range covering the last string; the vector must not be empty, since the
+// index is computed as size() - 1.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT,
+         Allocator>::back() const
+{
+  const PosT lastIndex = size() - 1;
+  return at(lastIndex);
+}
+
+// Number of values in string i: the distance from its start offset to the
+// start of the next string, or to the end of the value array for the last
+// string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
+{
+  if (i + 1 < size()) {
+    return m_positions[i + 1] - m_positions[i];
+  }
+  return size2() - m_positions[i];
+}
+
+// Address of the first value of string i inside the value array.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
+{
+  return &(*m_charArray)[m_positions[i]];
+}
+
+// Pointer to the first value of string i.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+const ValueT* StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
+{
+  return value_ptr(i);
+}
+
+// Pointer one past the last value of string i.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+const ValueT* StringVector<ValueT, PosT, Allocator>::end(PosT i) const
+{
+  const ValueT* first = value_ptr(i);
+  return first + length(i);
+}
+
+// Returns the index at which s is located.
+//
+// Sorted vector: the index of the first string that is not less than s
+// (std::lower_bound). No equality check is made, so the caller has to
+// verify that the string at that index really equals s.
+// Unsorted vector: the index of the first string equal to s (std::find),
+// or size() when there is none.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+template<typename StringT>
+PosT StringVector<ValueT, PosT, Allocator>::find(StringT &s) const
+{
+  if (!m_sorted) {
+    return std::distance(begin(), std::find(begin(), end(), s));
+  }
+  return std::distance(begin(), std::lower_bound(begin(), end(), s));
+}
+
+// C-string overload: searches for a std::string built from c.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::find(const char* c) const
+{
+  std::string needle(c);
+  return find(needle);
+}
+
+// RangeIterator
+
+// Detached iterator: index 0 and no container.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator()
+  : m_index(0), m_container(0)
+{
+}
+
+// Iterator over sv, positioned at string `index`.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(
+  StringVector<ValueT, PosT, Allocator> &sv, PosT index)
+  : m_index(index), m_container(&sv)
+{
+}
+
+// Index of the string the iterator points at.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::get_index()
+{
+  return m_index;
+}
+
+// Range covering the current string; the values are not copied.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT,
+         Allocator>::RangeIterator::dereference() const
+{
+  const ValueT* first = m_container->begin(m_index);
+  const ValueT* last = m_container->end(m_index);
+  return typename StringVector<ValueT, PosT, Allocator>::range(first, last);
+}
+
+// Iterators are equal when they share container and index.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+bool StringVector<ValueT, PosT, Allocator>::RangeIterator::equal(
+  RangeIterator const& other) const
+{
+  if (m_index != other.m_index) return false;
+  return m_container == other.m_container;
+}
+
+// Step to the next string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::RangeIterator::increment()
+{
+  ++m_index;
+}
+
+// Step to the previous string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::RangeIterator::decrement()
+{
+  --m_index;
+}
+
+// Move n strings; n is a PosT and is added to the index as such.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::RangeIterator::advance(PosT n)
+{
+  m_index += n;
+}
+
+// Number of steps from this iterator to `other`, computed in PosT.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::distance_to(
+  RangeIterator const& other) const
+{
+  return other.m_index - m_index;
+}
+
+// StringIterator
+
+// Detached iterator: index 0 and no container.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator()
+  : m_index(0), m_container(0)
+{
+}
+
+// Iterator over sv, positioned at string `index`.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator(
+  StringVector<ValueT, PosT, Allocator> &sv, PosT index)
+  : m_index(index), m_container(&sv)
+{
+}
+
+// Index of the string the iterator points at.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::StringIterator::get_index()
+{
+  return m_index;
+}
+
+// std::string copy of the current string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+const std::string StringVector<ValueT, PosT, Allocator>::StringIterator::dereference() const
+{
+  const ValueT* first = m_container->begin(m_index);
+  const ValueT* last = m_container->end(m_index);
+  return typename StringVector<ValueT, PosT, Allocator>::range(first, last).str();
+}
+
+// Iterators are equal when they share container and index.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+bool StringVector<ValueT, PosT, Allocator>::StringIterator::equal(
+  StringIterator const& other) const
+{
+  if (m_index != other.m_index) return false;
+  return m_container == other.m_container;
+}
+
+// Step to the next string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::StringIterator::increment()
+{
+  ++m_index;
+}
+
+// Step to the previous string.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::StringIterator::decrement()
+{
+  --m_index;
+}
+
+// Move n strings; n is a PosT and is added to the index as such.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+void StringVector<ValueT, PosT, Allocator>::StringIterator::advance(PosT n)
+{
+  m_index += n;
+}
+
+// Number of steps from this iterator to `other`, computed in PosT.
+template<typename ValueT, typename PosT, template<typename > class Allocator>
+PosT StringVector<ValueT, PosT, Allocator>::StringIterator::distance_to(
+  StringIterator const& other) const
+{
+  return other.m_index - m_index;
+}
+
+// ********** Some typedefs **********
+
+typedef StringVector<unsigned char, unsigned int> MediumStringVector; // byte strings; offsets and indices are unsigned int
+typedef StringVector<unsigned char, unsigned long> LongStringVector; // byte strings; offsets and indices are unsigned long
+
+}
+
+#endif
diff --git a/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp b/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp
new file mode 100644
index 000000000..07d0469e0
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp
@@ -0,0 +1,39 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "TargetPhraseCollectionCache.h"
+
+namespace Moses2
+{
+
+boost::thread_specific_ptr<TargetPhraseCollectionCache::CacheMap>
+TargetPhraseCollectionCache::m_phraseCache; // definition of the static member; thread_specific_ptr keeps a separate CacheMap pointer for each thread
+
+// Copies every word of `copy`, in order, into this vector.
+PhraseCompact::PhraseCompact(const Phrase<Word> &copy)
+{
+  const size_t numWords = copy.GetSize();
+
+  // The final size is known up front: allocate once instead of letting the
+  // vector grow step by step.
+  reserve(numWords);
+
+  for (size_t i = 0; i < numWords; ++i) {
+    push_back(copy[i]);
+  }
+}
+
+}
+
diff --git a/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h b/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
new file mode 100644
index 000000000..3a9e6f170
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
@@ -0,0 +1,176 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include <boost/thread/tss.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "../../Word.h"
+#include "../../Phrase.h"
+
+namespace Moses2
+{
+// Pair of size_t positions; element type of TPCompact::alignment.
+// NOTE(review): presumably (source position, target position) - confirm.
+typedef std::pair<size_t, size_t> AlignPointSizeT;
+
+/** A phrase held as a plain std::vector<Word>. It is the key type of the
+ * map inside TargetPhraseCollectionCache. */
+struct PhraseCompact : public std::vector<Word>
+{
+public:
+ // Copies every word of 'copy' into this vector. The constructor is not
+ // explicit, so a Phrase<Word> converts implicitly wherever a PhraseCompact
+ // key is expected.
+ PhraseCompact(const Phrase<Word> &copy);
+};
+
+/** Plain-data record for one cached target phrase: its words, a set of
+ * alignment points and a list of scores. */
+struct TPCompact
+{
+ std::vector<Word> words; // words of the phrase, in order
+ std::set<AlignPointSizeT> alignment; // alignment points, kept ordered by std::set
+ std::vector<float> scores; // NOTE(review): presumably feature scores - confirm
+
+};
+
+// Avoid using new due to locking
+// TargetPhraseVector is the list of TPCompact records stored per cache entry;
+// TargetPhraseVectorPtr shares ownership of one such list.
+typedef std::vector<TPCompact> TargetPhraseVector;
+typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
+
+/** Implementation of Persistent Cache.
+ *
+ * Maps source phrases to target phrase vectors and remembers when each entry
+ * was last touched so that Prune() can drop the oldest ones.
+ *
+ * The map (m_phraseCache) is a static boost::thread_specific_ptr: every
+ * thread has one map of its own, shared by all cache objects used on that
+ * thread and created lazily on first access. m_max and m_tolerance are
+ * per-object and are only consulted by Prune().
+ **/
+class TargetPhraseCollectionCache
+{
+private:
+  size_t m_max;       // size around which Prune() keeps the map
+  float m_tolerance;  // relative slack around m_max used by Prune()
+
+  struct LastUsed {
+    clock_t m_clock;              // clock() value of the most recent access
+    TargetPhraseVectorPtr m_tpv;  // cached target phrases
+    size_t m_bitsLeft;            // stored and handed back unchanged with m_tpv
+
+    LastUsed() : m_clock(0), m_bitsLeft(0) {}
+
+    LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
+      : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
+  };
+
+  typedef std::map<PhraseCompact, LastUsed> CacheMap;
+  static boost::thread_specific_ptr<CacheMap> m_phraseCache;
+
+  /** Returns the calling thread's map, creating it on first use. */
+  static CacheMap &GetCache() {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    return *m_phraseCache;
+  }
+
+public:
+
+  typedef CacheMap::iterator iterator;
+  typedef CacheMap::const_iterator const_iterator;
+
+  /** @param max        size around which Prune() keeps the map
+   *  @param tolerance  relative slack: pruning starts above max*(1+tolerance)
+   *                    and stops below max*(1-tolerance) */
+  TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2f)
+    : m_max(max), m_tolerance(tolerance) {
+  }
+
+  iterator Begin() {
+    return GetCache().begin();
+  }
+
+  const_iterator Begin() const {
+    return GetCache().begin();
+  }
+
+  iterator End() {
+    return GetCache().end();
+  }
+
+  const_iterator End() const {
+    return GetCache().end();
+  }
+
+  /** store translations for source phrase in persistent cache.
+   *
+   * If the phrase is already cached only its timestamp is refreshed; tpv and
+   * bitsLeft are ignored in that case. Otherwise tpv is stored, cut down to
+   * its first maxRank entries when maxRank is non-zero and tpv is longer.
+   **/
+  void Cache(const Phrase<Word> &sourcePhrase, TargetPhraseVectorPtr tpv,
+             size_t bitsLeft = 0, size_t maxRank = 0) {
+    CacheMap &cache = GetCache();
+    // convert once and reuse the key for both the lookup and the insertion
+    const PhraseCompact key(sourcePhrase);
+
+    iterator it = cache.find(key);
+    if(it != cache.end()) {
+      // if found, just update clock
+      it->second.m_clock = clock();
+      return;
+    }
+
+    // else, add to cache
+    TargetPhraseVectorPtr stored = tpv;
+    if(maxRank && tpv->size() > maxRank) {
+      // copy only the leading maxRank entries; the caller's vector is untouched
+      stored.reset(new TargetPhraseVector(tpv->begin(), tpv->begin() + maxRank));
+    }
+    cache[key] = LastUsed(clock(), stored, bitsLeft);
+  }
+
+  /** retrieve translations for source phrase from persistent cache.
+   *
+   * @return the cached vector and its bitsLeft value, refreshing the entry's
+   *         timestamp; a null pointer and 0 when the phrase is not cached.
+   **/
+  std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase<Word> &sourcePhrase) {
+    CacheMap &cache = GetCache();
+    iterator it = cache.find(sourcePhrase);
+    if(it == cache.end())
+      return std::make_pair(TargetPhraseVectorPtr(), 0);
+
+    LastUsed &lu = it->second;
+    lu.m_clock = clock();
+    return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
+  }
+
+  // if cache full, reduce
+  // Entries are removed in ascending (timestamp, phrase) order until the map
+  // is smaller than m_max * (1 - m_tolerance).
+  void Prune() {
+    CacheMap &cache = GetCache();
+    if(cache.size() > m_max * (1 + m_tolerance)) {
+      typedef std::set<std::pair<clock_t, PhraseCompact > > Cands;
+      Cands cands;
+      for(CacheMap::iterator it = cache.begin(); it != cache.end(); ++it) {
+        LastUsed &lu = it->second;
+        cands.insert(std::make_pair(lu.m_clock, it->first));
+      }
+
+      for(Cands::iterator it = cands.begin(); it != cands.end(); ++it) {
+        const PhraseCompact& p = it->second;
+        cache.erase(p);
+
+        if(cache.size() < (m_max * (1 - m_tolerance)))
+          break;
+      }
+    }
+  }
+
+  /** Empties the calling thread's map. */
+  void CleanUp() {
+    GetCache().clear();
+  }
+
+};
+
+}
+
diff --git a/moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp b/moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp
new file mode 100644
index 000000000..d9fec5013
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/ThrowingFwrite.cpp
@@ -0,0 +1,30 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "ThrowingFwrite.h"
+
+/** std::fwrite wrapper that reports a short write as an exception.
+ *
+ * @param ptr     start of the data to write
+ * @param size    size in bytes of one element; asserted to be non-zero
+ * @param count   number of elements to write
+ * @param stream  destination stream
+ * @return        number of elements written, which equals 'count'
+ *
+ * Throws through UTIL_THROW_IF2 when fewer than 'count' elements were
+ * written. The message names the element size, the requested count and the
+ * number actually written.
+ */
+size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
+{
+  assert(size);
+  const size_t written = std::fwrite(ptr, size, count, stream);
+  UTIL_THROW_IF2(count != written,
+                 "Short fwrite; requested size " << size
+                 << ", count " << count
+                 << ", wrote " << written);
+  return written;
+}
diff --git a/moses2/TranslationModel/CompactPT/ThrowingFwrite.h b/moses2/TranslationModel/CompactPT/ThrowingFwrite.h
new file mode 100644
index 000000000..2a0c71a27
--- /dev/null
+++ b/moses2/TranslationModel/CompactPT/ThrowingFwrite.h
@@ -0,0 +1,31 @@
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_ThrowingFwrite_h
+#define moses_ThrowingFwrite_h
+
+#include <cassert>
+#include <cstdio>
+#include "util/exception.hh"
+
+// Writes 'count' elements of 'size' bytes each from 'ptr' to 'stream' with
+// std::fwrite and throws (via UTIL_THROW_IF2) unless all 'count' elements
+// were written. Returns the number of elements written.
+size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream);
+
+#endif
diff --git a/moses2/TranslationModel/Memory/Node.h b/moses2/TranslationModel/Memory/Node.h
new file mode 100644
index 000000000..97fa9618e
--- /dev/null
+++ b/moses2/TranslationModel/Memory/Node.h
@@ -0,0 +1,138 @@
+/*
+ * Node.h
+ *
+ * Created on: 22 Apr 2016
+ * Author: hieu
+ */
+#pragma once
+#include <boost/unordered_map.hpp>
+#include <boost/foreach.hpp>
+#include "../../PhraseBased/TargetPhrases.h"
+#include "../../System.h"
+#include "../../Phrase.h"
+
+namespace Moses2
+{
+class System;
+
+namespace PtMem
+{
+
+/** Trie node of the in-memory phrase table.
+ *
+ * Every edge is keyed by WORD::hash(factors) of one source word. Rules are
+ * first collected with AddRule() and then converted into a sorted, pruned
+ * TPS by SortAndPrune().
+ *
+ * WORD: word type, SP: source phrase type, TP: target phrase type,
+ * TPS: target phrase collection type.
+ */
+template<class WORD, class SP, class TP, class TPS>
+class Node
+{
+public:
+  typedef boost::unordered_map<size_t, Node> Children;
+
+  // All pointers start out NULL, including m_source.
+  Node()
+    :m_targetPhrases(NULL)
+    ,m_source(NULL)
+    ,m_unsortedTPS(NULL)
+  {}
+
+  ~Node()
+  {}
+
+  /** Registers 'target' under the path spelled by 'source', creating nodes
+   * as needed. The address of 'source' is kept by the final node and is
+   * dereferenced in SortAndPrune(), so it must stay valid until then. */
+  void AddRule(const std::vector<FactorType> &factors, SP &source, TP *target)
+  {
+    AddRule(factors, source, target, 0);
+  }
+
+  /** Follows 'source' from position 'pos' downwards.
+   * @return the target phrases of the node reached (may be NULL), or NULL
+   *         when some word has no matching child. */
+  TPS *Find(const std::vector<FactorType> &factors, const SP &source, size_t pos = 0) const
+  {
+    assert(source.GetSize());
+    if (pos == source.GetSize()) {
+      return m_targetPhrases;
+    }
+    else {
+      const WORD &word = source[pos];
+      //cerr << "word=" << word << endl;
+      typename Children::const_iterator iter = m_children.find(word.hash(factors));
+      if (iter == m_children.end()) {
+        return NULL;
+      }
+      else {
+        const Node &child = iter->second;
+        return child.Find(factors, source, pos + 1);
+      }
+    }
+  }
+
+  /** Single-step lookup: the child reached by 'word', or NULL. */
+  const Node *Find(const std::vector<FactorType> &factors, const WORD &word) const
+  {
+    typename Children::const_iterator iter = m_children.find(word.hash(factors));
+    if (iter == m_children.end()) {
+      return NULL;
+    }
+    else {
+      const Node &child = iter->second;
+      return &child;
+    }
+  }
+
+  const TPS *GetTargetPhrases() const
+  { return m_targetPhrases; }
+
+  /** Recursively finalises the subtree: for every node that collected rules,
+   * builds a TPS in 'pool', sorts and prunes it to 'tableLimit', runs
+   * EvaluateAfterTablePruning on it and releases the staging vector. */
+  void SortAndPrune(size_t tableLimit, MemPool &pool, System &system)
+  {
+    BOOST_FOREACH(typename Children::value_type &val, m_children){
+      Node &child = val.second;
+      child.SortAndPrune(tableLimit, pool, system);
+    }
+
+    // prune target phrases in this node
+    if (m_unsortedTPS) {
+      m_targetPhrases = new (pool.Allocate<TPS>()) TPS(pool, m_unsortedTPS->size());
+
+      for (size_t i = 0; i < m_unsortedTPS->size(); ++i) {
+        TP *tp = (*m_unsortedTPS)[i];
+        m_targetPhrases->AddTargetPhrase(*tp);
+      }
+
+      m_targetPhrases->SortAndPrune(tableLimit);
+      system.featureFunctions.EvaluateAfterTablePruning(system.GetSystemPool(), *m_targetPhrases, *m_source);
+
+      delete m_unsortedTPS;
+      // reset so a repeated call cannot touch or free the vector again
+      m_unsortedTPS = NULL;
+    }
+  }
+
+  const Children &GetChildren() const
+  { return m_children; }
+
+  // Currently produces no output; 'out' and 'system' are unused.
+  // NOTE(review): valPair.first is the size_t hash key, not a WORD - confirm
+  // this still compiles if Debug() is ever instantiated.
+  void Debug(std::ostream &out, const System &system) const {
+    BOOST_FOREACH(const typename Children::value_type &valPair, m_children) {
+      const WORD &word = valPair.first;
+      //std::cerr << word << "(" << word.hash() << ") ";
+    }
+  }
+protected:
+  Children m_children;              // child nodes keyed by word hash
+  TPS *m_targetPhrases;             // set by SortAndPrune(); NULL until then
+  Phrase<WORD> *m_source;           // source phrase of the first rule added here
+  std::vector<TP*> *m_unsortedTPS;  // rules collected before SortAndPrune()
+
+  /** Recursive worker for the public AddRule(); 'pos' is the index of the
+   * next source word to consume. Returns the node that received the rule. */
+  Node &AddRule(const std::vector<FactorType> &factors, SP &source, TP *target, size_t pos)
+  {
+    if (pos == source.GetSize()) {
+      if (m_unsortedTPS == NULL) {
+        m_unsortedTPS = new std::vector<TP*>();
+        m_source = &source;
+      }
+
+      m_unsortedTPS->push_back(target);
+      return *this;
+    }
+    else {
+      const WORD &word = source[pos];
+      Node &child = m_children[word.hash(factors)];
+      //std::cerr << "added " << word << " " << &child << " from " << this << std::endl;
+
+      return child.AddRule(factors, source, target, pos + 1);
+    }
+  }
+
+};
+
+
+}
+} // namespace
+
diff --git a/moses2/TranslationModel/Memory/PhraseTableMemory.cpp b/moses2/TranslationModel/Memory/PhraseTableMemory.cpp
new file mode 100644
index 000000000..09eead137
--- /dev/null
+++ b/moses2/TranslationModel/Memory/PhraseTableMemory.cpp
@@ -0,0 +1,268 @@
+/*
+ * PhraseTableMemory.cpp
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+
+#include <cassert>
+#include <boost/foreach.hpp>
+#include "PhraseTableMemory.h"
+#include "../../PhraseBased/PhraseImpl.h"
+#include "../../Phrase.h"
+#include "../../System.h"
+#include "../../Scores.h"
+#include "../../InputPathsBase.h"
+#include "../../legacy/InputFileStream.h"
+#include "util/exception.hh"
+
+#include "../../PhraseBased/InputPath.h"
+#include "../../PhraseBased/TargetPhraseImpl.h"
+#include "../../PhraseBased/TargetPhrases.h"
+
+#include "../../SCFG/PhraseImpl.h"
+#include "../../SCFG/TargetPhraseImpl.h"
+#include "../../SCFG/InputPath.h"
+#include "../../SCFG/Stack.h"
+#include "../../SCFG/Stacks.h"
+#include "../../SCFG/Manager.h"
+
+
+using namespace std;
+
+namespace Moses2
+{
+
+
+////////////////////////////////////////////////////////////////////////
+
+// Forwards startInd/line to PhraseTable, starts with both trie roots unset
+// and then calls ReadParameters().
+PhraseTableMemory::PhraseTableMemory(size_t startInd, const std::string &line)
+:PhraseTable(startInd, line)
+,m_rootPb(NULL)
+,m_rootSCFG(NULL)
+{
+ ReadParameters();
+}
+
+// Frees whichever trie root Load() allocated; deleting the one that is still
+// NULL is a no-op.
+PhraseTableMemory::~PhraseTableMemory()
+{
+ delete m_rootPb;
+ delete m_rootSCFG;
+}
+
+/** Reads the phrase table at m_path into the in-memory trie.
+ *
+ * Each line is split on "|||": field 0 is the source phrase, field 1 the
+ * target phrase, field 2 the scores and field 3 the alignment. Depending on
+ * system.isPb the rules go into the phrase-based or the SCFG trie, which is
+ * sorted and pruned to m_tableLimit once the whole file has been read.
+ *
+ * Source phrases live in a pool local to this function; target phrases are
+ * allocated from the system pool.
+ *
+ * Throws through UTIL_THROW_IF2 on lines with fewer than 3 fields and, for
+ * SCFG tables, on lines without the alignment field.
+ */
+void PhraseTableMemory::Load(System &system)
+{
+  FactorCollection &vocab = system.GetVocab();
+  MemPool &systemPool = system.GetSystemPool();
+  MemPool tmpSourcePool;
+
+  if (system.isPb) {
+    m_rootPb = new PBNODE();
+  }
+  else {
+    m_rootSCFG = new SCFGNODE();
+    //cerr << "m_rootSCFG=" << m_rootSCFG << endl;
+  }
+
+  vector<string> toks;
+  size_t lineNum = 0;
+  InputFileStream strme(m_path);
+  string line;
+  while (getline(strme, line)) {
+    // progress indicator
+    if (++lineNum % 1000000 == 0) {
+      cerr << lineNum << " ";
+    }
+    toks.clear();
+    TokenizeMultiCharSeparator(toks, line, "|||");
+    UTIL_THROW_IF2(toks.size() < 3, "Wrong format");
+    //cerr << "line=" << line << endl;
+    //cerr << "system.isPb=" << system.isPb << endl;
+
+    if (system.isPb) {
+      PhraseImpl *source = PhraseImpl::CreateFromString(tmpSourcePool, vocab, system,
+                           toks[0]);
+      //cerr << "created soure" << endl;
+      TargetPhraseImpl *target = TargetPhraseImpl::CreateFromString(systemPool, *this, system,
+                                 toks[1]);
+      //cerr << "created target" << endl;
+      target->GetScores().CreateFromString(toks[2], *this, system, true);
+      //cerr << "created scores:" << *target << endl;
+
+      if (toks.size() >= 4) {
+        //cerr << "alignstr=" << toks[3] << endl;
+        target->SetAlignmentInfo(toks[3]);
+      }
+
+      // properties
+      if (toks.size() == 7) {
+        //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1);
+        //strcpy(target->properties, toks[6].c_str());
+      }
+
+      system.featureFunctions.EvaluateInIsolation(systemPool, system, *source,
+          *target);
+      //cerr << "EvaluateInIsolation:" << *target << endl;
+      m_rootPb->AddRule(m_input, *source, target);
+
+      //cerr << "target=" << target->Debug(system) << endl;
+    }
+    else {
+      // the alignment field is read unconditionally below, so it must exist
+      UTIL_THROW_IF2(toks.size() < 4, "Wrong format: SCFG rule without alignment field");
+
+      SCFG::PhraseImpl *source = SCFG::PhraseImpl::CreateFromString(tmpSourcePool, vocab, system,
+                                 toks[0]);
+      //cerr << "created source:" << *source << endl;
+      SCFG::TargetPhraseImpl *target = SCFG::TargetPhraseImpl::CreateFromString(systemPool, *this,
+                                       system, toks[1]);
+
+      //cerr << "created target " << *target << " source=" << *source << endl;
+
+      target->GetScores().CreateFromString(toks[2], *this, system, true);
+      //cerr << "created scores:" << *target << endl;
+
+      //vector<SCORE> scores = Tokenize<SCORE>(toks[2]);
+      //target->sortScore = (scores.size() >= 3) ? TransformScore(scores[2]) : 0;
+
+      target->SetAlignmentInfo(toks[3]);
+
+      // properties
+      if (toks.size() == 7) {
+        //target->properties = (char*) system.systemPool.Allocate(toks[6].size() + 1);
+        //strcpy(target->properties, toks[6].c_str());
+      }
+
+      system.featureFunctions.EvaluateInIsolation(systemPool, system, *source,
+          *target);
+      //cerr << "EvaluateInIsolation:" << *target << endl;
+      m_rootSCFG->AddRule(m_input, *source, target);
+    }
+  }
+
+  // The nodes still point at source phrases in tmpSourcePool, so finalise
+  // the trie before that pool goes out of scope.
+  if (system.isPb) {
+    m_rootPb->SortAndPrune(m_tableLimit, systemPool, system);
+    //cerr << "root=" << &m_rootPb << endl;
+  }
+  else {
+    m_rootSCFG->SortAndPrune(m_tableLimit, systemPool, system);
+    //cerr << "root=" << &m_rootPb << endl;
+  }
+  /*
+  BOOST_FOREACH(const PtMem::Node<Word>::Children::value_type &valPair, m_rootPb.GetChildren()) {
+    const Word &word = valPair.first;
+    cerr << word << " ";
+  }
+  cerr << endl;
+  */
+}
+
+// Phrase-based lookup: walks the phrase-based trie along inputPath.subPhrase
+// and returns what Find() yields - the node's target phrases, or NULL when
+// there are none. 'mgr' and 'pool' are not used.
+// NOTE(review): m_rootPb is only allocated by Load() when system.isPb is set
+// and is dereferenced here without a check.
+TargetPhrases* PhraseTableMemory::Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const
+{
+ const SubPhrase<Moses2::Word> &phrase = inputPath.subPhrase;
+ TargetPhrases *tps = m_rootPb->Find(m_input, phrase);
+ return tps;
+}
+
+// Seeds 'path' with an active chart entry that points at the SCFG trie root.
+// The entry is placement-constructed in 'pool' and registered under this
+// table's index (GetPtInd()). 'mgr' is not used.
+void PhraseTableMemory::InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const
+{
+ size_t ptInd = GetPtInd();
+ ActiveChartEntryMem *chartEntry = new (pool.Allocate<ActiveChartEntryMem>()) ActiveChartEntryMem(pool, *m_rootSCFG);
+ path.AddActiveChartEntry(ptInd, chartEntry);
+ //cerr << "InitActiveChart=" << path << endl;
+}
+
+// SCFG lookup for one input path.
+// Does nothing when the path covers more than maxChartSpan words. Otherwise
+// it first tries to extend the prefix path's chart entries with the last
+// word of the path (terminal case), then walks the chain of prefix paths and
+// calls LookupNT() for each, passing the span that starts right after that
+// prefix and ends at the path's end position (non-terminal case).
+// Throws through UTIL_THROW_IF2 when the path has no prefix path.
+void PhraseTableMemory::Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+ if (path.range.GetNumWordsCovered() > maxChartSpan) {
+ return;
+ }
+
+ size_t endPos = path.range.GetEndPos();
+
+ const SCFG::InputPath *prevPath = static_cast<const SCFG::InputPath*>(path.prefixPath);
+ UTIL_THROW_IF2(prevPath == NULL, "prefixPath == NULL");
+
+ // TERMINAL
+ const SCFG::Word &lastWord = path.subPhrase.Back();
+
+ // path fetched from the matrix with (endPos, 1); its range is passed on as
+ // the range of the terminal
+ const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1);
+
+ //cerr << "BEFORE LookupGivenWord=" << *prevPath << endl;
+ LookupGivenWord(pool, mgr, *prevPath, lastWord, NULL, subPhrasePath.range, path);
+ //cerr << "AFTER LookupGivenWord=" << *prevPath << endl;
+
+ // NON-TERMINAL
+ //const SCFG::InputPath *prefixPath = static_cast<const SCFG::InputPath*>(path.prefixPath);
+ while (prevPath) {
+ const Range &prevRange = prevPath->range;
+ //cerr << "prevRange=" << prevRange << endl;
+
+ size_t startPos = prevRange.GetEndPos() + 1;
+ size_t ntSize = endPos - startPos + 1;
+ // shadows the outer subPhrasePath on purpose: this one is the NT span
+ const SCFG::InputPath &subPhrasePath = *mgr.GetInputPaths().GetMatrix().GetValue(startPos, ntSize);
+
+ LookupNT(pool, mgr, subPhrasePath.range, *prevPath, stacks, path);
+
+ prevPath = static_cast<const SCFG::InputPath*>(prevPath->prefixPath);
+ }
+}
+
+// Tries to extend one active chart entry by 'wordSought'.
+// If the trie node behind 'prevEntry' has a child for the word, a new entry
+// for that child is created in 'pool', the word is bound to 'subPhraseRange'
+// (with 'hypos', which the terminal caller passes as NULL) and the entry is
+// added to 'outPath'. Any target phrases stored at the child are added to
+// 'outPath' as well. Nothing happens when no child matches.
+void PhraseTableMemory::LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ // entries handled by this table are expected to be ActiveChartEntryMem
+ const ActiveChartEntryMem &prevEntryCast = static_cast<const ActiveChartEntryMem&>(prevEntry);
+
+ const SCFGNODE &prevNode = prevEntryCast.node;
+ // NOTE(review): this tests the address of a reference, which compilers may
+ // assume is never NULL - confirm whether the check is still wanted.
+ UTIL_THROW_IF2(&prevNode == NULL, "node == NULL");
+
+ size_t ptInd = GetPtInd();
+ const SCFGNODE *nextNode = prevNode.Find(m_input, wordSought);
+
+ /*
+ if (outPath.range.GetStartPos() == 1 || outPath.range.GetStartPos() == 2) {
+ cerr << "range=" << outPath.range
+ << " prevEntry=" << prevEntry.GetSymbolBind().Debug(mgr.system)
+ << " wordSought=" << wordSought.Debug(mgr.system)
+ << " nextNode=" << nextNode
+ << endl;
+ }
+ */
+ if (nextNode) {
+ // new entries
+ ActiveChartEntryMem *chartEntry = new (pool.Allocate<ActiveChartEntryMem>()) ActiveChartEntryMem(pool, *nextNode, prevEntry);
+
+ chartEntry->AddSymbolBindElement(subPhraseRange, wordSought, hypos, *this);
+ //cerr << "AFTER Add=" << symbolBind << endl;
+
+ outPath.AddActiveChartEntry(ptInd, chartEntry);
+
+ const SCFG::TargetPhrases *tps = nextNode->GetTargetPhrases();
+ if (tps) {
+ // there are some rules
+ /*
+ cerr << "outPath=" << outPath.range
+ << " bind=" << chartEntry->GetSymbolBind().Debug(mgr.system)
+ << " pt=" << GetPtInd()
+ << " tps=" << tps->Debug(mgr.system) << endl;
+ */
+ outPath.AddTargetPhrasesToPath(pool, mgr.system, *this, *tps, chartEntry->GetSymbolBind());
+
+ }
+
+ //cerr << "AFTER outPath=" << outPath << endl;
+ }
+}
+
+}
+
diff --git a/moses2/TranslationModel/Memory/PhraseTableMemory.h b/moses2/TranslationModel/Memory/PhraseTableMemory.h
new file mode 100644
index 000000000..035c7c9c5
--- /dev/null
+++ b/moses2/TranslationModel/Memory/PhraseTableMemory.h
@@ -0,0 +1,85 @@
+/*
+ * PhraseTableMemory.h
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+#pragma once
+
+#include "../PhraseTable.h"
+#include "../../legacy/Util2.h"
+#include "../../SCFG/InputPath.h"
+#include "Node.h"
+#include "../../PhraseBased/PhraseImpl.h"
+#include "../../PhraseBased/TargetPhraseImpl.h"
+#include "../../PhraseBased/TargetPhrases.h"
+#include "../../SCFG/PhraseImpl.h"
+#include "../../SCFG/TargetPhraseImpl.h"
+#include "../../SCFG/TargetPhrases.h"
+
+namespace Moses2
+{
+
+// Phrase table held entirely in memory as a trie of PtMem::Node objects.
+// One trie type exists for phrase-based decoding (PBNODE) and one for SCFG
+// decoding (SCFGNODE); Load() allocates exactly one of the two roots
+// depending on system.isPb.
+class PhraseTableMemory: public PhraseTable
+{
+ typedef PtMem::Node<Word, Phrase<Word>, TargetPhraseImpl, TargetPhrases> PBNODE;
+ typedef PtMem::Node<SCFG::Word, Phrase<SCFG::Word>, SCFG::TargetPhraseImpl, SCFG::TargetPhrases> SCFGNODE;
+
+//////////////////////////////////////
+ // Active chart entry that additionally remembers the SCFG trie node it has
+ // reached.
+ class ActiveChartEntryMem : public SCFG::ActiveChartEntry
+ {
+ typedef SCFG::ActiveChartEntry Parent;
+ public:
+ const PhraseTableMemory::SCFGNODE &node; // trie node reached by this entry
+
+ // Fresh entry at 'vnode'; the base class is constructed from 'pool'.
+ ActiveChartEntryMem(MemPool &pool, const PhraseTableMemory::SCFGNODE &vnode)
+ :Parent(pool)
+ ,node(vnode)
+ {}
+
+ // Entry at 'vnode' whose base part is copied from 'prevEntry'; 'pool' is
+ // not used by this overload.
+ ActiveChartEntryMem(
+ MemPool &pool,
+ const PhraseTableMemory::SCFGNODE &vnode,
+ const ActiveChartEntry &prevEntry)
+ :Parent(prevEntry)
+ ,node(vnode)
+ {}
+ };
+
+ //////////////////////////////////////
+public:
+ PhraseTableMemory(size_t startInd, const std::string &line);
+ virtual ~PhraseTableMemory();
+
+ // Reads the table file at m_path into the trie, then sorts and prunes it.
+ virtual void Load(System &system);
+ // Phrase-based lookup of the target phrases for inputPath.subPhrase.
+ virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const;
+
+ // SCFG: adds the initial chart entry (trie root) to 'path'.
+ virtual void InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const;
+
+ // SCFG: extends the chart entries of the prefix paths of 'path'; paths
+ // covering more than maxChartSpan words are skipped.
+ void Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+protected:
+ PBNODE *m_rootPb; // phrase-based trie root; NULL unless Load() ran with isPb
+ SCFGNODE *m_rootSCFG; // SCFG trie root; NULL unless Load() ran without isPb
+
+ // SCFG: extends a single chart entry by 'wordSought' if the trie allows it.
+ void LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+};
+
+}
+
diff --git a/moses2/TranslationModel/PhraseTable.cpp b/moses2/TranslationModel/PhraseTable.cpp
new file mode 100644
index 000000000..c790147bb
--- /dev/null
+++ b/moses2/TranslationModel/PhraseTable.cpp
@@ -0,0 +1,183 @@
+/*
+ * PhraseTable.cpp
+ *
+ * Created on: 23 Oct 2015
+ * Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include <queue>
+#include "PhraseTable.h"
+#include "../legacy/Util2.h"
+#include "../TypeDef.h"
+#include "../InputType.h"
+#include "../PhraseBased/Manager.h"
+#include "../PhraseBased/InputPath.h"
+#include "../SCFG/InputPath.h"
+#include "../SCFG/Manager.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+////////////////////////////////////////////////////////////////////////////
+/** Sets the defaults that SetParameter() may later override.
+ *
+ * decodeGraphBackoff and m_ptInd are given defined start values so that
+ * SatisfyBackoff() and GetPtInd() never read indeterminate memory if they
+ * are called before the owner assigns them. A backoff of 0 means
+ * "always look up".
+ * The input factor list defaults to the single factor 0.
+ */
+PhraseTable::PhraseTable(size_t startInd, const std::string &line) :
+  StatelessFeatureFunction(startInd, line)
+  , decodeGraphBackoff(0) // 0 = always lookup
+  , m_ptInd(0)
+  , m_tableLimit(20) // default
+  , m_maxCacheSize(DEFAULT_MAX_TRANS_OPT_CACHE_SIZE)
+{
+  m_input.push_back(0);
+}
+
+PhraseTable::~PhraseTable()
+{
+ // Intentionally empty: nothing is released here.
+}
+
+/** Applies one key/value setting.
+ *
+ * Keys handled here: "cache-size", "path", "input-factor", "output-factor"
+ * and "table-limit". The two factor keys take a comma-separated list. Any
+ * other key is passed on to StatelessFeatureFunction::SetParameter().
+ */
+void PhraseTable::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "cache-size") {
+    m_maxCacheSize = Scan<size_t>(value);
+    return;
+  }
+  if (key == "path") {
+    m_path = value;
+    return;
+  }
+  if (key == "input-factor") {
+    m_input = Tokenize<FactorType>(value, ",");
+    return;
+  }
+  if (key == "output-factor") {
+    m_output = Tokenize<FactorType>(value, ",");
+    return;
+  }
+  if (key == "table-limit") {
+    m_tableLimit = Scan<size_t>(value);
+    return;
+  }
+
+  // not one of ours
+  StatelessFeatureFunction::SetParameter(key, value);
+}
+
+/** Decides whether this table should be consulted for 'path'.
+ *
+ * - Never, when the XML policy is XmlExclusive and the path's range overlaps
+ *   XML markup of the input.
+ * - decodeGraphBackoff == 0: always.
+ * - decodeGraphBackoff == -1: only if the path has no rules yet.
+ * - otherwise: only if the path covers at most decodeGraphBackoff words and
+ *   has no rules yet.
+ */
+bool PhraseTable::SatisfyBackoff(const Manager &mgr, const InputPath &path) const
+{
+  const InputType &sentence = mgr.GetInput();
+  const bool exclusiveXml = (mgr.system.options.input.xml_policy == XmlExclusive);
+  if (exclusiveXml
+      && sentence.XmlOverlap(path.range.GetStartPos(), path.range.GetEndPos())) {
+    return false;
+  }
+
+  //cerr << GetName() << "=" << GetPtInd() << "=" << decodeGraphBackoff << endl;
+  if (decodeGraphBackoff == 0) {
+    // always lookup
+    return true;
+  }
+
+  if (decodeGraphBackoff != -1
+      && path.range.GetNumWordsCovered() > decodeGraphBackoff) {
+    // span is longer than the backoff limit
+    return false;
+  }
+
+  // lookup only if there's no existing rules
+  return !path.GetNumRules();
+}
+
+// Phrase-based driver: for every input path that passes SatisfyBackoff(),
+// calls the per-path Lookup() with the manager's pool and hands the result
+// (possibly NULL) to the path via AddTargetPhrases().
+void PhraseTable::Lookup(const Manager &mgr, InputPathsBase &inputPaths) const
+{
+ BOOST_FOREACH(InputPathBase *pathBase, inputPaths){
+ // the elements are treated as phrase-based InputPath objects
+ InputPath *path = static_cast<InputPath*>(pathBase);
+ //cerr << "path=" << path->range << " ";
+
+ if (SatisfyBackoff(mgr, *path)) {
+ TargetPhrases *tpsPtr = Lookup(mgr, mgr.GetPool(), *path);
+ /*
+ cerr << "tpsPtr=" << tpsPtr << " ";
+ if (tps.get()) {
+ cerr << tps.get()->GetSize();
+ }
+ cerr << endl;
+ */
+
+ path->AddTargetPhrases(*this, tpsPtr);
+ }
+ }
+
+}
+
+// Default per-path lookup: always throws "Not implemented". Subclasses that
+// support phrase-based lookup override it.
+TargetPhrases *PhraseTable::Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// Phrase-based overload: intentionally a no-op, 'scores' and
+// 'estimatedScore' are left untouched.
+void PhraseTable::EvaluateInIsolation(MemPool &pool, const System &system,
+ const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const
+{
+}
+
+// SCFG overload: intentionally a no-op, 'scores' and 'estimatedScore' are
+// left untouched.
+void PhraseTable::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
+ const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const
+{
+
+}
+
+// scfg
+// Non-terminal lookup in which the previous path is the one fetched from the
+// input-path matrix with (start position of 'path', 0) and the non-terminal
+// spans the whole range of 'path'.
+// NOTE(review): the pointer returned by GetValue() is dereferenced without a
+// NULL check - confirm the matrix always holds this entry.
+void PhraseTable::LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+ //cerr << "BEFORE LookupUnary" << path.Debug(mgr.system) << endl;
+ size_t startPos = path.range.GetStartPos();
+ const SCFG::InputPath *prevPath = mgr.GetInputPaths().GetMatrix().GetValue(startPos, 0);
+ LookupNT(pool, mgr, path.range, *prevPath, stacks, path);
+ //cerr << "AFTER LookupUnary" << path.Debug(mgr.system) << endl;
+}
+
+// Non-terminal extension step.
+// Takes the stack for the span that starts right after 'prevPath' and ends
+// at the end of 'outPath'. For every (non-terminal word, hypothesis
+// collection) pair in that stack it calls LookupGivenWord() with the word
+// and the collection's sorted and pruned hypotheses.
+void PhraseTable::LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const
+{
+ size_t endPos = outPath.range.GetEndPos();
+
+ const Range &prevRange = prevPath.range;
+
+ // span covered by the non-terminal: [end of prevPath + 1, endPos]
+ size_t startPos = prevRange.GetEndPos() + 1;
+ size_t ntSize = endPos - startPos + 1;
+
+ const SCFG::Stack &ntStack = stacks.GetStack(startPos, ntSize);
+ const SCFG::Stack::Coll &stackColl = ntStack.GetColl();
+
+ BOOST_FOREACH (const SCFG::Stack::Coll::value_type &valPair, stackColl) {
+ const SCFG::Word &ntSought = valPair.first;
+ const Moses2::HypothesisColl *hypos = valPair.second;
+ const Moses2::Hypotheses &sortedHypos = hypos->GetSortedAndPrunedHypos(mgr, mgr.arcLists);
+ //cerr << "ntSought=" << ntSought << ntSought.isNonTerminal << endl;
+ LookupGivenWord(pool, mgr, prevPath, ntSought, &sortedHypos, subPhraseRange, outPath);
+ }
+}
+
+// Calls LookupGivenNode() once for every active chart entry that 'prevPath'
+// holds for this table (index GetPtInd()), trying to extend each of them by
+// 'wordSought'.
+void PhraseTable::LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ size_t ptInd = GetPtInd();
+
+
+ BOOST_FOREACH(const SCFG::ActiveChartEntry *prevEntry, prevPath.GetActiveChart(ptInd).entries) {
+ //cerr << "BEFORE LookupGivenNode=" << prevPath << endl;
+ LookupGivenNode(pool, mgr, *prevEntry, wordSought, hypos, subPhraseRange, outPath);
+ //cerr << "AFTER LookupGivenNode=" << prevPath << endl;
+ }
+}
+
+}
+
diff --git a/moses2/TranslationModel/PhraseTable.h b/moses2/TranslationModel/PhraseTable.h
new file mode 100644
index 000000000..9237f5ba6
--- /dev/null
+++ b/moses2/TranslationModel/PhraseTable.h
@@ -0,0 +1,128 @@
+/*
+ * PhraseTable.h
+ *
+ * Created on: 23 Oct 2015
+ * Author: hieu
+ */
+#pragma once
+#include <string>
+#include <boost/unordered_map.hpp>
+#include "../Word.h"
+#include "../HypothesisColl.h"
+#include "../FF/StatelessFeatureFunction.h"
+#include "../legacy/Util2.h"
+
+namespace Moses2
+{
+
+class System;
+class InputPathsBase;
+class InputPath;
+class Manager;
+class TargetPhrases;
+class Range;
+
+namespace SCFG
+{
+class InputPath;
+class Stacks;
+class Manager;
+class ActiveChartEntry;
+}
+
+////////////////////////////////////////////////////////////////////////
+// Base class of all phrase tables. It is a stateless feature function that
+// also offers phrase-based lookup (the Lookup() overloads taking a Manager)
+// and SCFG chart lookup (InitActiveChart / Lookup / LookupUnary and the
+// protected helpers).
+class PhraseTable: public StatelessFeatureFunction
+{
+public:
+ // Read by SatisfyBackoff(): 0 = always look up; -1 = look up only when the
+ // path has no rules yet; any other value = look up only for paths covering
+ // at most that many words that have no rules yet.
+ int decodeGraphBackoff;
+
+ PhraseTable(size_t startInd, const std::string &line);
+ virtual ~PhraseTable();
+
+ // Handles cache-size, path, input-factor, output-factor and table-limit;
+ // other keys go to StatelessFeatureFunction.
+ virtual void SetParameter(const std::string& key, const std::string& value);
+ // Phrase-based: looks up every input path that passes SatisfyBackoff().
+ virtual void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const;
+ // Phrase-based: lookup for a single path; the base version throws
+ // "Not implemented".
+ virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const;
+
+ void SetPtInd(size_t ind)
+ { m_ptInd = ind; }
+
+ size_t GetPtInd() const
+ { return m_ptInd; }
+
+ // True when this table should be consulted for 'path'.
+ bool SatisfyBackoff(const Manager &mgr, const InputPath &path) const;
+
+ // Both EvaluateInIsolation overloads are no-ops in this class.
+ virtual void
+ EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
+ const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const;
+
+ virtual void
+ EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
+ const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const;
+
+ // scfg
+ virtual void InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const = 0;
+
+ virtual void Lookup(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const = 0;
+
+ // Non-terminal lookup where the non-terminal spans the whole of 'path'.
+ virtual void LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+protected:
+ std::string m_path; // value of the "path" parameter
+ size_t m_ptInd; // in the order that it is listed in [feature], NOT order of [mapping]
+ size_t m_tableLimit; // value of "table-limit"; the constructor sets 20
+ std::vector<FactorType> m_input, m_output; // "input-factor" / "output-factor" lists
+
+ // cache
+ size_t m_maxCacheSize; // 0 = no caching
+
+ // Target phrases paired with a clock_t timestamp; not referenced anywhere
+ // else in this class declaration.
+ struct CacheCollEntry2
+ {
+ TargetPhrases *tpsPtr;
+ clock_t clock;
+ };
+
+ // scfg
+ // Extends the chart entries of 'prevPath' by every non-terminal found in
+ // the stack for the span between 'prevPath' and the end of 'outPath'.
+ virtual void LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const;
+
+ // Calls LookupGivenNode() for each of this table's chart entries in
+ // 'prevPath'.
+ virtual void LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ // Table-specific step: extend one chart entry by 'wordSought'.
+ virtual void LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const = 0;
+
+};
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/ProbingPT.cpp b/moses2/TranslationModel/ProbingPT/ProbingPT.cpp
new file mode 100644
index 000000000..1f22f45be
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/ProbingPT.cpp
@@ -0,0 +1,756 @@
+/*
+ * ProbingPT.cpp
+ *
+ * Created on: 3 Nov 2015
+ * Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "ProbingPT.h"
+#include "querying.hh"
+#include "probing_hash_utils.hh"
+#include "util/exception.hh"
+#include "../../System.h"
+#include "../../Scores.h"
+#include "../../Phrase.h"
+#include "../../legacy/InputFileStream.h"
+#include "../../legacy/FactorCollection.h"
+#include "../../legacy/Util2.h"
+#include "../../FF/FeatureFunctions.h"
+#include "../../PhraseBased/PhraseImpl.h"
+#include "../../PhraseBased/TargetPhraseImpl.h"
+#include "../../PhraseBased/Manager.h"
+#include "../../PhraseBased/TargetPhrases.h"
+#include "../../SCFG/InputPath.h"
+#include "../../SCFG/Manager.h"
+#include "../../SCFG/TargetPhraseImpl.h"
+#include "../../SCFG/PhraseImpl.h"
+
+using namespace std;
+
+namespace Moses2
+{
+// Builds a chart entry as a continuation of prevEntry: the parent state and
+// the running source-side key are both taken over from it.
+// The pool argument is not used by this constructor.
+ProbingPT::ActiveChartEntryProbing::ActiveChartEntryProbing(
+  MemPool &pool, const ActiveChartEntryProbing &prevEntry)
+  : Parent(prevEntry), m_key(prevEntry.GetKey())
+{
+}
+
+// Binds one more source symbol to this entry. The running key is advanced to
+// cover the new word first (it depends on how many symbols are bound so far),
+// then the base class records the binding.
+// Throws if the word has no id in the phrase table's source vocabulary.
+void ProbingPT::ActiveChartEntryProbing::AddSymbolBindElement(
+  const Range &range,
+  const SCFG::Word &word,
+  const Moses2::Hypotheses *hypos,
+  const Moses2::PhraseTable &pt)
+{
+  const std::pair<bool, uint64_t> extended =
+    GetKey(word, static_cast<const ProbingPT&>(pt));
+  UTIL_THROW_IF2(!extended.first, "Word should have been in source vocab");
+
+  m_key = extended.second;
+  Parent::AddSymbolBindElement(range, word, hypos, pt);
+}
+
+// Computes the key this entry would have after appending nextWord, without
+// modifying the entry.
+//   first  - false when the word has no id in the table
+//   second - the current key when first is false; otherwise the current key
+//            plus the word's probing id shifted left by the number of
+//            symbols already bound
+std::pair<bool, uint64_t> ProbingPT::ActiveChartEntryProbing::GetKey(const SCFG::Word &nextWord, const ProbingPT &pt) const
+{
+  const uint64_t wordId = pt.GetSourceProbingId(nextWord);
+  if (wordId == pt.GetUnk()) {
+    return std::pair<bool, uint64_t>(false, m_key);
+  }
+
+  const size_t numBound = m_symbolBind.coll.size();
+  return std::pair<bool, uint64_t>(true, m_key + (wordId << numBound));
+}
+
+////////////////////////////////////////////////////////////////////////////
+// Parses the feature line. The query engine is only created in Load(), so
+// the pointer starts out NULL: that keeps the destructor well-defined when
+// the object is destroyed before Load() ran (deleting NULL is a no-op).
+// m_unkId gets its real value in Load(); it is zeroed here so it is never
+// read uninitialised.
+ProbingPT::ProbingPT(size_t startInd, const std::string &line)
+  :PhraseTable(startInd, line)
+  ,load_method(util::POPULATE_OR_READ)
+  ,m_unkId(0)
+  ,m_engine(NULL)
+{
+  ReadParameters();
+}
+
+// Releases the query engine created by Load(), if there is one.
+ProbingPT::~ProbingPT()
+{
+  delete m_engine;
+}
+
+// Opens the binary table under m_path and builds the mappings between the
+// table's own vocabulary ids and the system's factors:
+//   m_sourceVocab: factor id  -> probing id (m_unkId where there is none)
+//   m_targetVocab: probing id -> (is non-terminal, factor)
+// Afterwards the alignments and the optional cache are set up.
+// Throws if a line of TargetVocab.dat does not have exactly two
+// tab-separated fields.
+void ProbingPT::Load(System &system)
+{
+  m_engine = new QueryEngine(m_path.c_str(), load_method);
+
+  // sentinel stored for factors that have no id in the table
+  m_unkId = 456456546456;
+
+  FactorCollection &vocab = system.GetVocab();
+
+  // source vocab
+  typedef std::map<uint64_t, std::string> SourceVocab;
+  const SourceVocab &sourceVocab = m_engine->getSourceVocab();
+  for (SourceVocab::const_iterator it = sourceVocab.begin();
+       it != sourceVocab.end(); ++it) {
+    const uint64_t probingId = it->first;
+    string wordStr = it->second;
+
+    bool isNT;
+    ReformatWord(system, wordStr, isNT);
+
+    const size_t factorId = vocab.AddFactor(wordStr, system, isNT)->GetId();
+    if (m_sourceVocab.size() <= factorId) {
+      // grow on demand; new slots are marked as unknown
+      m_sourceVocab.resize(factorId + 1, m_unkId);
+    }
+    m_sourceVocab[factorId] = probingId;
+  }
+
+  // target vocab: one "word<TAB>id" entry per line
+  InputFileStream targetVocabStrme(m_path + "/TargetVocab.dat");
+  string line;
+  while (getline(targetVocabStrme, line)) {
+    vector<string> toks = Tokenize(line, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n");
+
+    string &wordStr = toks[0];
+    bool isNT;
+    ReformatWord(system, wordStr, isNT);
+
+    const Factor *factor = vocab.AddFactor(wordStr, system, isNT);
+    const uint32_t probingId = Scan<uint32_t>(toks[1]);
+
+    if (m_targetVocab.size() <= probingId) {
+      m_targetVocab.resize(probingId + 1);
+    }
+    m_targetVocab[probingId] = std::pair<bool, const Factor*>(isNT, factor);
+  }
+
+  // alignments
+  CreateAlignmentMap(system, m_path + "/Alignments.dat");
+
+  // cache
+  CreateCache(system);
+}
+
+// Handles the "load" key, which selects how the binary table is brought into
+// memory (lazy, populate_or_lazy, populate_or_read / populate, read,
+// parallel_read). Every other key is forwarded to PhraseTable.
+// Throws if the value given for "load" is not one of the names above.
+void ProbingPT::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key != "load") {
+    PhraseTable::SetParameter(key, value);
+    return;
+  }
+
+  if (value == "lazy") {
+    load_method = util::LAZY;
+  }
+  else if (value == "populate_or_lazy") {
+    load_method = util::POPULATE_OR_LAZY;
+  }
+  else if (value == "populate_or_read" || value == "populate") {
+    load_method = util::POPULATE_OR_READ;
+  }
+  else if (value == "read") {
+    load_method = util::READ;
+  }
+  else if (value == "parallel_read") {
+    load_method = util::PARALLEL_READ;
+  }
+  else {
+    // separator added so the offending value is readable in the message
+    UTIL_THROW2("load method not supported: " << value);
+  }
+}
+
+// Converts every alignment stored in the query engine into an AlignmentInfo
+// and keeps it in m_aligns under the same index.
+// Each stored alignment is a flat list of position pairs, so its length has
+// to be even; an odd length is reported instead of reading past the end of
+// the list. The path argument is not used: the data comes from the engine
+// that Load() has already opened.
+void ProbingPT::CreateAlignmentMap(System &system, const std::string path)
+{
+  const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments();
+  m_aligns.resize(probingAlignColl.size(), NULL);
+
+  for (size_t i = 0; i < probingAlignColl.size(); ++i) {
+    const std::vector<unsigned char> &probingAligns = probingAlignColl[i];
+    UTIL_THROW_IF2(probingAligns.size() % 2 != 0,
+                   "Alignment " << i << " has an odd number of positions");
+
+    AlignmentInfo::CollType aligns;
+    for (size_t j = 0; j + 1 < probingAligns.size(); j += 2) {
+      size_t startPos = probingAligns[j];
+      size_t endPos = probingAligns[j+1];
+      //cerr << "startPos=" << startPos << " " << endPos << endl;
+      aligns.insert(std::pair<size_t,size_t>(startPos, endPos));
+    }
+
+    const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns);
+    m_aligns[i] = align;
+    //cerr << "align=" << align->Debug(system) << endl;
+  }
+}
+
+// Phrase-based lookup over all input paths: every path that passes the
+// backoff check gets the result of the single-path lookup attached to it
+// (which may be NULL when the table has nothing for that path).
+void ProbingPT::Lookup(const Manager &mgr, InputPathsBase &inputPaths) const
+{
+  BOOST_FOREACH(InputPathBase *pathBase, inputPaths) {
+    InputPath &path = *static_cast<InputPath*>(pathBase);
+    if (!SatisfyBackoff(mgr, path)) {
+      continue;
+    }
+
+    TargetPhrases *tps = Lookup(mgr, mgr.GetPool(), path);
+    path.AddTargetPhrases(*this, tps);
+  }
+}
+
+// Returns the target phrases for the source phrase of one input path.
+// The result is NULL when the phrase contains a word the table does not know
+// or when the table has no entry for its key. Entries that were preloaded
+// into the cache are returned as they are; anything else is built in 'pool'.
+TargetPhrases* ProbingPT::Lookup(const Manager &mgr, MemPool &pool,
+    InputPath &inputPath) const
+{
+  const Phrase<Moses2::Word> &source = inputPath.subPhrase;
+
+  // hash of the source phrase; first == false means an unknown word
+  const std::pair<bool, uint64_t> key = GetKey(source);
+  if (!key.first) {
+    return NULL;
+  }
+
+  // preloaded cache first
+  const CachePb::const_iterator cached = m_cachePb.find(key.second);
+  if (cached != m_cachePb.end()) {
+    return cached->second;
+  }
+
+  // otherwise query the table
+  return CreateTargetPhrases(pool, mgr.system, source, key.second);
+}
+
+// Computes the table key of a whole source phrase.
+//   first  - false when a word of the phrase has no id in the table (there
+//            can be no translation then), or when the phrase is empty
+//   second - the key computed by the query engine; 0 when first is false
+// The word ids are collected in a std::vector rather than a variable-length
+// array, which is a compiler extension and not standard C++.
+std::pair<bool, uint64_t> ProbingPT::GetKey(const Phrase<Moses2::Word> &sourcePhrase) const
+{
+  std::pair<bool, uint64_t> ret(false, 0);
+
+  const size_t sourceSize = sourcePhrase.GetSize();
+  assert(sourceSize);
+  if (sourceSize == 0) {
+    // release builds: nothing to hash, and &probingSource[0] would be invalid
+    return ret;
+  }
+
+  std::vector<uint64_t> probingSource(sourceSize);
+  GetSourceProbingIds(sourcePhrase, ret.first, &probingSource[0]);
+  if (ret.first) {
+    ret.second = m_engine->getKey(&probingSource[0], sourceSize);
+  }
+
+  return ret;
+}
+
+// Queries the table for 'key' and decodes every target phrase stored under
+// it into a new collection allocated from 'pool'. Each phrase is scored in
+// isolation; the collection is then sorted, pruned to m_tableLimit and passed
+// to the after-pruning evaluation.
+// Returns NULL when the key is not in the table.
+TargetPhrases *ProbingPT::CreateTargetPhrases(MemPool &pool,
+    const System &system, const Phrase<Moses2::Word> &sourcePhrase, uint64_t key) const
+{
+  // 1st = found, 2nd = offset of the collection in the target data
+  const std::pair<bool, uint64_t> found = m_engine->query(key);
+  if (!found.first) {
+    return NULL;
+  }
+
+  // the collection starts with its number of target phrases
+  const char *cursor = m_engine->memTPS + found.second;
+  const uint64_t numTP = *(const uint64_t*) cursor;
+  cursor += sizeof(uint64_t);
+
+  TargetPhrases *tps = new (pool.Allocate<TargetPhrases>()) TargetPhrases(pool, numTP);
+
+  const FeatureFunctions &ffs = system.featureFunctions;
+  for (size_t i = 0; i < numTP; ++i) {
+    // decoding advances the cursor to the next record
+    TargetPhraseImpl *tp = CreateTargetPhrase(pool, system, cursor);
+    assert(tp);
+    ffs.EvaluateInIsolation(pool, system, sourcePhrase, *tp);
+    tps->AddTargetPhrase(*tp);
+  }
+
+  tps->SortAndPrune(m_tableLimit);
+  ffs.EvaluateAfterTablePruning(pool, *tps, sourcePhrase);
+
+  return tps;
+}
+
+TargetPhraseImpl *ProbingPT::CreateTargetPhrase( // decodes one phrase-based target phrase record starting at 'offset'
+ MemPool &pool, // the target phrase (and, for non-log tables, its extra scores) is allocated here
+ const System &system,
+ const char *&offset) const // in/out: advanced past the header, the scores and the word ids that were read
+{
+ TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset; // record starts with its header
+ size_t numRealWords = tpInfo->numWords / m_output.size(); // numWords counts stored ids: one id per output factor per word
+
+ TargetPhraseImpl *tp =
+ new (pool.Allocate<TargetPhraseImpl>()) TargetPhraseImpl(pool, *this,
+ system, numRealWords);
+
+ offset += sizeof(TargetPhraseInfo);
+
+ // scores: num_scores table scores followed by num_lex_scores extra scores
+ SCORE *scores = (SCORE*) offset;
+
+ size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores;
+
+ if (m_engine->logProb) { // scores are used exactly as stored
+ // set pt score for rule
+ tp->GetScores().PlusEquals(system, *this, scores);
+
+ // save scores for other FF, eg. lex RO. Just give the offset
+ if (m_engine->num_lex_scores) {
+ tp->scoreProperties = scores + m_engine->num_scores; // points into the record itself, nothing is copied
+ }
+ }
+ else { // stored scores are transformed with FloorScore(TransformScore(x)) before use
+ // log score 1st
+ SCORE logScores[totalNumScores]; // NOTE(review): runtime-sized array is a compiler extension, not standard C++
+ for (size_t i = 0; i < totalNumScores; ++i) {
+ logScores[i] = FloorScore(TransformScore(scores[i]));
+ }
+
+ // set pt score for rule
+ tp->GetScores().PlusEquals(system, *this, logScores);
+
+ // save scores for other FF, eg. lex RO.
+ tp->scoreProperties = pool.Allocate<SCORE>(m_engine->num_lex_scores); // copied, because logScores is a local
+ for (size_t i = 0; i < m_engine->num_lex_scores; ++i) {
+ tp->scoreProperties[i] = logScores[i + m_engine->num_scores];
+ }
+ }
+
+ offset += sizeof(SCORE) * totalNumScores;
+
+ // words: for every word, one uint32_t id per output factor
+ for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) {
+ for (size_t i = 0; i < m_output.size(); ++i) {
+ FactorType factorType = m_output[i];
+
+ uint32_t *probingId = (uint32_t*) offset;
+
+ const std::pair<bool, const Factor *> *factorPair = GetTargetFactor(*probingId); // NULL if the id is outside m_targetVocab
+ assert(factorPair); // NOTE(review): checked in debug builds only; the pointer is dereferenced below either way
+ assert(!factorPair->first); // id must not be flagged as a non-terminal here
+
+ Word &word = (*tp)[targetPos];
+ word[factorType] = factorPair->second;
+
+ offset += sizeof(uint32_t);
+ }
+ }
+
+ // align: the header holds an index into m_aligns
+ uint32_t alignTerm = tpInfo->alignTerm;
+ //cerr << "alignTerm=" << alignTerm << endl;
+ UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd");
+ tp->Parent::SetAlignTerm(*m_aligns[alignTerm]);
+
+ // properties TODO
+
+ return tp;
+}
+
+// Writes the probing id of every word of sourcePhrase into probingSource,
+// which must have room for sourcePhrase.GetSize() entries.
+// 'ok' is set to false as soon as a word has no id in the table (the
+// remaining entries are then left untouched), and to true otherwise.
+void ProbingPT::GetSourceProbingIds(const Phrase<Moses2::Word> &sourcePhrase,
+    bool &ok, uint64_t probingSource[]) const
+{
+  ok = false;
+
+  const size_t numWords = sourcePhrase.GetSize();
+  for (size_t pos = 0; pos < numWords; ++pos) {
+    const uint64_t id = GetSourceProbingId(sourcePhrase[pos]);
+    if (id == m_unkId) {
+      return;
+    }
+    probingSource[pos] = id;
+  }
+
+  ok = true;
+}
+
+// Returns the probing id of a source word: the sum of the ids of its input
+// factors, or m_unkId when any of those factors has no id in the table.
+// m_sourceVocab is padded with m_unkId for factors the table does not
+// contain, so an entry has to be checked before it is added; otherwise a
+// word with several input factors would produce a sum that is neither a
+// valid id nor m_unkId.
+uint64_t ProbingPT::GetSourceProbingId(const Word &word) const
+{
+  uint64_t ret = 0;
+
+  for (size_t i = 0; i < m_input.size(); ++i) {
+    FactorType factorType = m_input[i];
+    const Factor *factor = word[factorType];
+
+    size_t factorId = factor->GetId();
+    if (factorId >= m_sourceVocab.size()) {
+      // factor was created after the table's vocabulary was loaded
+      return m_unkId;
+    }
+
+    const uint64_t probingId = m_sourceVocab[factorId];
+    if (probingId == m_unkId) {
+      // padding slot: the table has no id for this factor
+      return m_unkId;
+    }
+    ret += probingId;
+  }
+
+  return ret;
+}
+
+// Preloads up to m_maxCacheSize entries listed in "<m_path>/cache" into
+// m_cachePb (phrase-based) or m_cacheSCFG, so that lookups of those keys do
+// not have to decode the table again. Does nothing when m_maxCacheSize is 0.
+// After a first line that is skipped, every line must hold three
+// tab-separated fields; the 2nd is the key and the 3rd the source phrase.
+// A malformed line is reported with an exception, as Load() does for the
+// target vocabulary, instead of being caught by an assert only and then
+// indexed out of range in release builds.
+void ProbingPT::CreateCache(System &system)
+{
+  if (m_maxCacheSize == 0) {
+    return;
+  }
+
+  string filePath = m_path + "/cache";
+  InputFileStream strme(filePath);
+
+  // the first line is read and ignored (it appears to hold a total count)
+  string line;
+  getline(strme, line);
+  //float totalCount = Scan<float>(line);
+
+  // cached target phrases live as long as the system; the source phrases are
+  // only needed while they are being scored
+  MemPool &pool = system.GetSystemPool();
+  FactorCollection &vocab = system.GetVocab();
+
+  MemPool tmpSourcePool;
+
+  size_t lineCount = 0;
+  while (getline(strme, line) && lineCount < m_maxCacheSize) {
+    vector<string> toks = Tokenize(line, "\t");
+    UTIL_THROW_IF2(toks.size() != 3, string("Incorrect cache format:") + line + "\n");
+    uint64_t key = Scan<uint64_t>(toks[1]);
+    //cerr << "line=" << line << endl;
+
+    if (system.isPb) {
+      PhraseImpl *sourcePhrase = PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, toks[2]);
+
+      TargetPhrases *tps = CreateTargetPhrases(pool, system, *sourcePhrase, key);
+      assert(tps);
+
+      m_cachePb[key] = tps;
+    }
+    else {
+      // SCFG
+      SCFG::PhraseImpl *sourcePhrase = SCFG::PhraseImpl::CreateFromString(tmpSourcePool, vocab, system, toks[2], false);
+      //cerr << "sourcePhrase=" << sourcePhrase->Debug(system) << endl;
+
+      std::pair<bool, SCFG::TargetPhrases*> tpsPair = CreateTargetPhrasesSCFG(pool, system, *sourcePhrase, key);
+      assert(tpsPair.first && tpsPair.second);
+
+      m_cacheSCFG[key] = tpsPair.second;
+    }
+    ++lineCount;
+  }
+
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// SCFG
+///////////////////////////////////////////////////////////////////////////////
+
+// Normalises a vocabulary entry of the table.
+// For phrase-based systems the word is left alone and isNT is false.
+// Otherwise a word of the form "[X]" or "[X][Y]" is a non-terminal: isNT is
+// set and wordStr is reduced to the label inside the last pair of brackets
+// ("X" and "Y" respectively).
+// An empty word is never a non-terminal; it is rejected up front because
+// wordStr[wordStr.size() - 1] is not a valid access on an empty string.
+void ProbingPT::ReformatWord(System &system, std::string &wordStr, bool &isNT)
+{
+  isNT = false;
+  if (system.isPb || wordStr.empty()) {
+    return;
+  }
+
+  isNT = (wordStr[0] == '[' && wordStr[wordStr.size() - 1] == ']');
+  if (!isNT) {
+    return;
+  }
+
+  // label starts after "][" if there is one, else after the opening '['
+  size_t startPos = wordStr.find("][");
+  startPos = (startPos == string::npos) ? 1 : startPos + 2;
+
+  // drop the closing ']' as well
+  wordStr = wordStr.substr(startPos, wordStr.size() - startPos - 1);
+}
+
+// Gives 'path' its initial chart entry for this phrase table: an entry with
+// nothing bound yet, allocated from 'pool' and registered under this
+// table's index.
+void ProbingPT::InitActiveChart(
+  MemPool &pool,
+  const SCFG::Manager &mgr,
+  SCFG::InputPath &path) const
+{
+  const size_t ptInd = GetPtInd();
+
+  void *mem = pool.Allocate<ActiveChartEntryProbing>();
+  ActiveChartEntryProbing *rootEntry = new (mem) ActiveChartEntryProbing(pool);
+
+  path.AddActiveChartEntry(ptInd, rootEntry);
+}
+
+// SCFG lookup for one input path. Paths covering more than maxChartSpan
+// words are skipped. Otherwise the path is reached from its prefix paths in
+// two ways:
+//   - terminal: the direct prefix extended by the last word of the path
+//   - non-terminal: every prefix in the chain, extended by a non-terminal
+//     covering the words between the end of that prefix and the end of path
+// Throws if the path has no prefix path.
+void ProbingPT::Lookup(MemPool &pool,
+    const SCFG::Manager &mgr,
+    size_t maxChartSpan,
+    const SCFG::Stacks &stacks,
+    SCFG::InputPath &path) const
+{
+  if (path.range.GetNumWordsCovered() > maxChartSpan) {
+    return;
+  }
+
+  const size_t endPos = path.range.GetEndPos();
+
+  const SCFG::InputPath *prevPath = static_cast<const SCFG::InputPath*>(path.prefixPath);
+  UTIL_THROW_IF2(prevPath == NULL, "prefixPath == NULL");
+
+  // TERMINAL: the single-word path at the end position
+  const SCFG::Word &lastWord = path.subPhrase.Back();
+  const SCFG::InputPath &lastWordPath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1);
+  LookupGivenWord(pool, mgr, *prevPath, lastWord, NULL, lastWordPath.range, path);
+
+  // NON-TERMINAL: walk the whole chain of prefixes
+  for (const SCFG::InputPath *prefix = prevPath; prefix != NULL;
+       prefix = static_cast<const SCFG::InputPath*>(prefix->prefixPath)) {
+    const size_t startPos = prefix->range.GetEndPos() + 1;
+    const size_t ntSize = endPos - startPos + 1;
+    const SCFG::InputPath &ntPath = *mgr.GetInputPaths().GetMatrix().GetValue(startPos, ntSize);
+
+    LookupNT(pool, mgr, ntPath.range, *prefix, stacks, path);
+  }
+}
+
+// Tries to extend prevEntry with wordSought. When the extended key is known
+// (in the preloaded cache or in the table), a new chart entry is added to
+// outPath, followed by the target phrases stored under that key, if any.
+// When the word or the key is unknown, outPath is left unchanged.
+//
+// A table miss (false, NULL) and a key that only exists as a prefix of
+// longer rules (true, NULL) are both regular results of
+// CreateTargetPhrasesSCFG, so they are handled here rather than asserted on.
+void ProbingPT::LookupGivenNode(
+  MemPool &pool,
+  const SCFG::Manager &mgr,
+  const SCFG::ActiveChartEntry &prevEntry,
+  const SCFG::Word &wordSought,
+  const Moses2::Hypotheses *hypos,
+  const Moses2::Range &subPhraseRange,
+  SCFG::InputPath &outPath) const
+{
+  const ActiveChartEntryProbing &prevEntryCast = static_cast<const ActiveChartEntryProbing&>(prevEntry);
+
+  const std::pair<bool, uint64_t> key = prevEntryCast.GetKey(wordSought, *this);
+  if (!key.first) {
+    // should only occasionally happen when looking up unary rules
+    return;
+  }
+
+  const Phrase<SCFG::Word> &sourcePhrase = outPath.subPhrase;
+
+  // found: the key exists at all; tps: its rules, NULL if it has none
+  bool found;
+  SCFG::TargetPhrases *tps;
+
+  const CacheSCFG::const_iterator iter = m_cacheSCFG.find(key.second);
+  if (iter != m_cacheSCFG.end()) {
+    found = true;
+    tps = iter->second;
+  }
+  else {
+    // not in cache. Lookup
+    const std::pair<bool, SCFG::TargetPhrases*> tpsPair =
+      CreateTargetPhrasesSCFG(pool, mgr.system, sourcePhrase, key.second);
+    found = tpsPair.first;
+    tps = tpsPair.second;
+  }
+
+  if (!found) {
+    return;
+  }
+
+  // new entry, so that longer rules can continue from here
+  ActiveChartEntryProbing *chartEntry = new (pool.Allocate<ActiveChartEntryProbing>()) ActiveChartEntryProbing(pool, prevEntryCast);
+  chartEntry->AddSymbolBindElement(subPhraseRange, wordSought, hypos, *this);
+
+  size_t ptInd = GetPtInd();
+  outPath.AddActiveChartEntry(ptInd, chartEntry);
+
+  if (tps) {
+    // there are some rules
+    outPath.AddTargetPhrasesToPath(pool, mgr.system, *this, *tps, chartEntry->GetSymbolBind());
+  }
+}
+
+SCFG::TargetPhraseImpl *ProbingPT::CreateTargetPhraseSCFG( // decodes one SCFG rule record starting at 'offset'
+ MemPool &pool, // the rule (and, for non-log tables, its extra scores) is allocated here
+ const System &system,
+ const char *&offset) const // in/out: advanced past the header, the scores and all stored ids
+{
+ TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset; // record starts with its header
+ SCFG::TargetPhraseImpl *tp =
+ new (pool.Allocate<SCFG::TargetPhraseImpl>()) SCFG::TargetPhraseImpl(pool, *this,
+ system, tpInfo->numWords - 1); // the last stored id is the lhs, not a word of the phrase
+
+ offset += sizeof(TargetPhraseInfo);
+
+ // scores: num_scores table scores followed by num_lex_scores extra scores
+ SCORE *scores = (SCORE*) offset;
+
+ size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores;
+
+ if (m_engine->logProb) { // scores are used exactly as stored
+ // set pt score for rule
+ tp->GetScores().PlusEquals(system, *this, scores);
+
+ // save scores for other FF, eg. lex RO. Just give the offset
+ if (m_engine->num_lex_scores) {
+ tp->scoreProperties = scores + m_engine->num_scores; // points into the record itself, nothing is copied
+ }
+ }
+ else { // stored scores are transformed with FloorScore(TransformScore(x)) before use
+ // log score 1st
+ SCORE logScores[totalNumScores]; // NOTE(review): runtime-sized array is a compiler extension, not standard C++
+ for (size_t i = 0; i < totalNumScores; ++i) {
+ logScores[i] = FloorScore(TransformScore(scores[i]));
+ }
+
+ // set pt score for rule
+ tp->GetScores().PlusEquals(system, *this, logScores);
+
+ // save scores for other FF, eg. lex RO.
+ tp->scoreProperties = pool.Allocate<SCORE>(m_engine->num_lex_scores); // copied, because logScores is a local
+ for (size_t i = 0; i < m_engine->num_lex_scores; ++i) {
+ tp->scoreProperties[i] = logScores[i + m_engine->num_scores];
+ }
+ }
+
+ offset += sizeof(SCORE) * totalNumScores;
+
+ // words: one uint32_t id per word; only factor 0 is filled in
+ for (size_t i = 0; i < tpInfo->numWords - 1; ++i) {
+ uint32_t *probingId = (uint32_t*) offset;
+
+ const std::pair<bool, const Factor *> *factorPair = GetTargetFactor(*probingId); // NULL if the id is outside m_targetVocab
+ assert(factorPair); // NOTE(review): checked in debug builds only; the pointer is dereferenced below either way
+
+ SCFG::Word &word = (*tp)[i];
+ word[0] = factorPair->second;
+ word.isNonTerminal = factorPair->first;
+
+ offset += sizeof(uint32_t);
+ }
+
+ // lhs: the id stored after the words
+ uint32_t *probingId = (uint32_t*) offset;
+
+ const std::pair<bool, const Factor *> *factorPair = GetTargetFactor(*probingId);
+ assert(factorPair);
+ assert(factorPair->first); // the lhs has to be a non-terminal
+
+ tp->lhs[0] = factorPair->second;
+ tp->lhs.isNonTerminal = factorPair->first;
+
+ offset += sizeof(uint32_t);
+
+ // align: the header holds two indices into m_aligns, terminals first
+ uint32_t alignTerm = tpInfo->alignTerm;
+ //cerr << "alignTerm=" << alignTerm << endl;
+ UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd");
+ tp->Parent::SetAlignTerm(*m_aligns[alignTerm]);
+
+ uint32_t alignNonTerm = tpInfo->alignNonTerm;
+ //cerr << "alignNonTerm=" << alignNonTerm << endl;
+ UTIL_THROW_IF2(alignNonTerm >= m_aligns.size(), "Unknown alignInd");
+ tp->SetAlignNonTerm(*m_aligns[alignNonTerm]);
+
+ // properties TODO
+
+ return tp;
+}
+
+// Queries the table for 'key' and decodes the rules stored under it.
+//   first  - true when the key is in the table at all
+//   second - the rules of that key, allocated from 'pool', scored, sorted and
+//            pruned to m_tableLimit; NULL when the key is unknown or when it
+//            has no rules of its own (the table reports NONE as the offset)
+std::pair<bool, SCFG::TargetPhrases*> ProbingPT::CreateTargetPhrasesSCFG(MemPool &pool, const System &system,
+    const Phrase<SCFG::Word> &sourcePhrase, uint64_t key) const
+{
+  std::pair<bool, SCFG::TargetPhrases*> ret(false, NULL);
+
+  // 1st = found, 2nd = target file offset
+  const std::pair<bool, uint64_t> query_result = m_engine->query(key);
+  //cerr << "query_result=" << query_result.first << endl;
+
+  if (!query_result.first) {
+    return ret;
+  }
+  ret.first = true;
+
+  if (query_result.second == NONE) {
+    // key is known, but there are no rules to decode
+    return ret;
+  }
+
+  // there are some rules
+  const FeatureFunctions &ffs = system.featureFunctions;
+
+  // the collection starts with its number of rules
+  const char *offset = m_engine->memTPS + query_result.second;
+  uint64_t *numTP = (uint64_t*) offset;
+  //cerr << "numTP=" << *numTP << endl;
+
+  SCFG::TargetPhrases *tps = new (pool.Allocate<SCFG::TargetPhrases>()) SCFG::TargetPhrases(pool, *numTP);
+  ret.second = tps;
+
+  offset += sizeof(uint64_t);
+  for (size_t i = 0; i < *numTP; ++i) {
+    // decoding advances the offset to the next record
+    SCFG::TargetPhraseImpl *tp = CreateTargetPhraseSCFG(pool, system, offset);
+    assert(tp);
+    //cerr << "tp=" << tp->Debug(mgr.system) << endl;
+
+    ffs.EvaluateInIsolation(pool, system, sourcePhrase, *tp);
+
+    tps->AddTargetPhrase(*tp);
+  }
+
+  tps->SortAndPrune(m_tableLimit);
+  ffs.EvaluateAfterTablePruning(pool, *tps, sourcePhrase);
+  //cerr << "tps=" << tps->GetSize() << endl;
+
+  return ret;
+}
+
+} // namespace
+
diff --git a/moses2/TranslationModel/ProbingPT/ProbingPT.h b/moses2/TranslationModel/ProbingPT/ProbingPT.h
new file mode 100644
index 000000000..c5fbefd6f
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/ProbingPT.h
@@ -0,0 +1,159 @@
+/*
+ * ProbingPT.h
+ *
+ * Created on: 3 Nov 2015
+ * Author: hieu
+ */
+
+#pragma once
+
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <boost/thread/tss.hpp>
+#include <boost/bimap.hpp>
+#include <deque>
+#include "../PhraseTable.h"
+#include "../../Vector.h"
+#include "../../Phrase.h"
+#include "../../SCFG/ActiveChart.h"
+#include "util/mmap.hh"
+
+namespace Moses2
+{
+class AlignmentInfo;
+class QueryEngine;
+class target_text;
+class MemPool;
+class System;
+class RecycleData;
+
+namespace SCFG
+{
+class TargetPhraseImpl;
+class TargetPhrases;
+}
+
+class ProbingPT: public Moses2::PhraseTable // phrase table whose lookups go through a QueryEngine over a table stored under m_path
+{
+ //////////////////////////////////////
+ class ActiveChartEntryProbing : public SCFG::ActiveChartEntry // chart entry that also carries the running key of the symbols bound so far
+ {
+ typedef SCFG::ActiveChartEntry Parent;
+ public:
+
+ ActiveChartEntryProbing(MemPool &pool) // entry with nothing bound; key starts at 0
+ :Parent(pool)
+ ,m_key(0)
+ {}
+
+ ActiveChartEntryProbing( // continuation of prevEntry: takes over its state and key
+ MemPool &pool,
+ const ActiveChartEntryProbing &prevEntry);
+
+ uint64_t GetKey() const // key of the symbols bound so far
+ { return m_key; }
+
+ std::pair<bool, uint64_t> GetKey(const SCFG::Word &nextWord, const ProbingPT &pt) const; // key after appending nextWord; 1st = false if the word is unknown to pt
+
+ virtual void AddSymbolBindElement( // updates m_key for 'word', then records the binding in the base class
+ const Range &range,
+ const SCFG::Word &word,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::PhraseTable &pt);
+
+ protected:
+ uint64_t m_key;
+ };
+ //////////////////////////////////////
+
+public:
+ ProbingPT(size_t startInd, const std::string &line);
+ virtual ~ProbingPT(); // deletes the query engine
+ void Load(System &system); // creates the query engine and fills the vocabulary, alignment and cache members
+
+ virtual void SetParameter(const std::string& key, const std::string& value); // handles "load"; other keys go to PhraseTable
+ void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const; // phrase-based lookup for all input paths
+
+ uint64_t GetUnk() const // id that stands for "word not in the table"
+ { return m_unkId; }
+
+ // SCFG
+ void InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const;
+
+ virtual void Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+
+protected:
+ std::vector<uint64_t> m_sourceVocab; // factor id -> pt id (m_unkId where the table has no id)
+ std::vector< std::pair<bool, const Factor*> > m_targetVocab; // pt id -> (is non-terminal, factor*)
+ std::vector<const AlignmentInfo*> m_aligns; // alignment index stored in a record -> alignment
+ util::LoadMethod load_method; // set through the "load" parameter
+
+ uint64_t m_unkId; // assigned in Load()
+ QueryEngine *m_engine; // created in Load(), deleted in the destructor
+
+ void CreateAlignmentMap(System &system, const std::string path);
+
+ TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const; // NULL if there is nothing for the path
+ TargetPhrases *CreateTargetPhrases(MemPool &pool, const System &system,
+ const Phrase<Moses2::Word> &sourcePhrase, uint64_t key) const; // NULL if the key is not in the table
+ TargetPhraseImpl *CreateTargetPhrase(MemPool &pool, const System &system,
+ const char *&offset) const; // offset is advanced past the record that was read
+
+ inline const std::pair<bool, const Factor*> *GetTargetFactor(uint32_t probingId) const // NULL if probingId is outside m_targetVocab
+ {
+ if (probingId >= m_targetVocab.size()) {
+ return NULL;
+ }
+ return &m_targetVocab[probingId];
+ }
+
+ std::pair<bool, uint64_t> GetKey(const Phrase<Moses2::Word> &sourcePhrase) const; // 1st = all words known, 2nd = key of the phrase
+
+ void GetSourceProbingIds(const Phrase<Moses2::Word> &sourcePhrase, bool &ok,
+ uint64_t probingSource[]) const; // probingSource needs one slot per word of the phrase
+
+ uint64_t GetSourceProbingId(const Word &word) const; // m_unkId if the word is not in the table
+
+ // caching: key -> target phrases, filled once by CreateCache()
+ typedef boost::unordered_map<uint64_t, TargetPhrases*> CachePb;
+ CachePb m_cachePb;
+
+ typedef boost::unordered_map<uint64_t, SCFG::TargetPhrases*> CacheSCFG;
+ CacheSCFG m_cacheSCFG;
+
+ void CreateCache(System &system);
+
+ void ReformatWord(System &system, std::string &wordStr, bool &isNT); // strips non-terminal brackets from wordStr and reports them in isNT
+
+ // SCFG
+ void LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ std::pair<bool, SCFG::TargetPhrases*> CreateTargetPhrasesSCFG(MemPool &pool, const System &system,
+ const Phrase<SCFG::Word> &sourcePhrase, uint64_t key) const;
+ // return value: 1st = the key is in the table, 2nd = its rules, or NULL when the key has no rules of its own
+
+ SCFG::TargetPhraseImpl *CreateTargetPhraseSCFG(
+ MemPool &pool,
+ const System &system,
+ const char *&offset) const; // offset is advanced past the record that was read
+
+
+};
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/StoreTarget.cpp b/moses2/TranslationModel/ProbingPT/StoreTarget.cpp
new file mode 100644
index 000000000..326aaea5f
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/StoreTarget.cpp
@@ -0,0 +1,266 @@
+/*
+ * StoreTarget.cpp
+ *
+ * Created on: 19 Jan 2016
+ * Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "StoreTarget.h"
+#include "line_splitter.hh"
+#include "probing_hash_utils.hh"
+#include "../../legacy/OutputFileStream.h"
+#include "../../legacy/Util2.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Starts a new target store under 'basepath': TargetColl.dat is created
+// (or truncated) for binary output and the vocabulary is bound to
+// TargetVocab.dat. Throws a string literal if the file cannot be opened.
+StoreTarget::StoreTarget(const std::string &basepath)
+  : m_basePath(basepath)
+  , m_vocab(basepath + "/TargetVocab.dat")
+{
+  const std::string collPath = basepath + "/TargetColl.dat";
+  const std::ios::openmode mode =
+    std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc;
+
+  m_fileTargetColl.open(collPath.c_str(), mode);
+  if (!m_fileTargetColl.is_open()) {
+    throw "can't create file ";
+  }
+}
+
+// Closes the collection file and writes out the vocabulary. All appended
+// rules are expected to have been flushed with Save() by now.
+StoreTarget::~StoreTarget()
+{
+  assert(m_coll.empty());
+  m_fileTargetColl.close();
+
+  // vocab
+  m_vocab.Save();
+}
+
+// Writes all rules collected by Append() as one collection: the number of
+// rules as a uint64_t, followed by each rule. The collected rules are
+// released afterwards.
+// Returns the file position at which the collection starts.
+uint64_t StoreTarget::Save()
+{
+  const uint64_t startOffset = m_fileTargetColl.tellp();
+
+  // header: number of rules that follow
+  const uint64_t numTP = m_coll.size();
+  m_fileTargetColl.write((const char*) &numTP, sizeof(uint64_t));
+
+  for (std::vector<target_text*>::const_iterator it = m_coll.begin();
+       it != m_coll.end(); ++it) {
+    Save(**it);
+  }
+
+  // the rules are owned by m_coll: free them and start over
+  RemoveAllInColl(m_coll);
+  m_coll.clear();
+
+  return startOffset;
+}
+
+// Writes one rule to the collection file: a TargetPhraseInfo header, then
+// every score as a float, then every target vocabulary id as a uint32_t.
+// The rule's properties are not written yet.
+void StoreTarget::Save(const target_text &rule)
+{
+  // header; the alignments are stored as ids, terminals first
+  TargetPhraseInfo header;
+  header.alignTerm = GetAlignId(rule.word_align_term);
+  header.alignNonTerm = GetAlignId(rule.word_align_non_term);
+  header.numWords = rule.target_phrase.size();
+  header.propLength = rule.property.size();
+  m_fileTargetColl.write((const char*) &header, sizeof(TargetPhraseInfo));
+
+  // scores
+  const size_t numScores = rule.prob.size();
+  for (size_t s = 0; s < numScores; ++s) {
+    const float score = rule.prob[s];
+    m_fileTargetColl.write((const char*) &score, sizeof(score));
+  }
+
+  // target phrase
+  const size_t numIds = rule.target_phrase.size();
+  for (size_t w = 0; w < numIds; ++w) {
+    const uint32_t vocabId = rule.target_phrase[w];
+    m_fileTargetColl.write((const char*) &vocabId, sizeof(vocabId));
+  }
+
+  // prop TODO
+}
+
+// Writes every known alignment to "<base path>/Alignments.dat", one per
+// line: the alignment id, a tab, then the positions of the alignment, each
+// followed by a space.
+void StoreTarget::SaveAlignment()
+{
+  const std::string path = m_basePath + "/Alignments.dat";
+  Moses2::OutputFileStream file(path);
+
+  for (Alignments::const_iterator it = m_aligns.begin();
+       it != m_aligns.end(); ++it) {
+    file << it->second << "\t";
+
+    const std::vector<size_t> &positions = it->first;
+    for (size_t i = 0; i < positions.size(); ++i) {
+      file << positions[i] << " ";
+    }
+    file << endl;
+  }
+}
+
+void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) // parses one phrase-table line into a target_text and queues it in m_coll
+{
+ target_text *rule = new target_text; // stored in m_coll at the end of this function
+ //cerr << "line.target_phrase=" << line.target_phrase << endl;
+
+ // target_phrase: space-separated words, each word a '|'-separated list of factors
+ vector<bool> nonTerms; // scfg only: one flag per target word, true for non-terminals
+ util::TokenIter<util::SingleCharacter> it;
+ it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
+ util::SingleCharacter(' '));
+ while (it) {
+ StringPiece word = *it;
+ //cerr << "word=" << word << endl;
+
+ bool nonTerm = false;
+ if (scfg) {
+ // not really sure how to handle factored SCFG and NT
+ if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { // a word of the form [...] is a non-terminal
+ //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl;
+ nonTerm = true;
+ }
+ nonTerms.push_back(nonTerm);
+ }
+
+ util::TokenIter<util::SingleCharacter> itFactor;
+ itFactor = util::TokenIter<util::SingleCharacter>(word,
+ util::SingleCharacter('|'));
+ while (itFactor) {
+ StringPiece factor = *itFactor;
+
+ string factorStr = factor.as_string();
+ uint32_t vocabId = m_vocab.GetVocabId(factorStr);
+
+ rule->target_phrase.push_back(vocabId); // one id per factor, so a factored word contributes several ids
+
+ itFactor++;
+ }
+
+ it++;
+ }
+
+ // probs: space-separated scores
+ it = util::TokenIter<util::SingleCharacter>(line.prob,
+ util::SingleCharacter(' '));
+ while (it) {
+ string tok = it->as_string();
+ float prob = Scan<float>(tok);
+
+ if (log_prob) { // store the floored log instead of the raw value
+ prob = FloorScore(log(prob));
+ if (prob == 0.0f) prob = 0.0000000001; // an exact 0 is never stored; NOTE(review): reason not visible here
+ }
+
+ rule->prob.push_back(prob);
+ it++;
+ }
+
+ /*
+ cerr << "nonTerms=";
+ for (size_t i = 0; i < nonTerms.size(); ++i) {
+ cerr << nonTerms[i] << " ";
+ }
+ cerr << endl;
+ */
+
+ // alignment: space-separated "source-target" pairs, split into terminal and non-terminal lists
+ it = util::TokenIter<util::SingleCharacter>(line.word_align,
+ util::SingleCharacter(' '));
+ while (it) {
+ string tokPair = Trim(it->as_string());
+ if (tokPair.empty()) { // stops at the first empty token
+ break;
+ }
+
+ vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
+ assert(alignPair.size() == 2);
+
+ bool nonTerm = false;
+ size_t sourcePos = alignPair[0];
+ size_t targetPos = alignPair[1];
+ if (scfg) {
+ nonTerm = nonTerms[targetPos]; // NOTE(review): targetPos is not checked against nonTerms.size()
+ }
+
+ //cerr << targetPos << "=" << nonTerm << endl;
+
+ if (nonTerm) {
+ rule->word_align_non_term.push_back(sourcePos);
+ rule->word_align_non_term.push_back(targetPos);
+ //cerr << (int) rule->word_all1.back() << " ";
+ }
+ else {
+ rule->word_align_term.push_back(sourcePos);
+ rule->word_align_term.push_back(targetPos);
+ }
+
+ it++;
+ }
+
+ // extra scores: a {{LexRO ...}} property is appended to the scores
+ string prop = line.property.as_string();
+ AppendLexRO(prop, rule->prob, log_prob);
+
+ //cerr << "line.property=" << line.property << endl;
+ //cerr << "prop=" << prop << endl;
+
+ // properties: not stored yet
+ /*
+ for (size_t i = 0; i < prop.size(); ++i) {
+ rule->property.push_back(prop[i]);
+ }
+ */
+ m_coll.push_back(rule);
+}
+
+// Returns the id of an alignment, registering it first if it has not been
+// seen before. Ids are handed out in order of first appearance, starting
+// at 0.
+uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
+{
+  const Alignments::const_iterator known = m_aligns.find(align);
+  if (known != m_aligns.end()) {
+    return known->second;
+  }
+
+  // size is taken before the insertion, so the first alignment gets 0
+  const uint32_t newId = m_aligns.size();
+  m_aligns[align] = newId;
+  return newId;
+}
+
+// Moves the scores of a "{{LexRO ...}}" property from the property string to
+// the end of the score vector.
+//   prop      - in/out: the LexRO property is cut out of it
+//   retvector - in/out: the LexRO scores are appended to it
+//   log_prob  - when set, the floored log of each score is stored; a result
+//               of exactly 0 is replaced by 0.0000000001
+// Nothing happens when prop has no LexRO property. When the closing "}}" is
+// missing, the property is taken to run to the end of the string; find()
+// returns npos in that case and must not be used in position arithmetic.
+void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
+    bool log_prob) const
+{
+  const string openTag = "{{LexRO ";
+  const string closeTag = "}}";
+
+  const size_t startPos = prop.find(openTag);
+  if (startPos == string::npos) {
+    return;
+  }
+
+  const size_t valuesPos = startPos + openTag.size();
+  const size_t endPos = prop.find(closeTag, valuesPos);
+  const bool closed = (endPos != string::npos);
+
+  const size_t valuesEnd = closed ? endPos : prop.size();
+  const string lexProb = prop.substr(valuesPos, valuesEnd - valuesPos);
+  //cerr << "lexProb=" << lexProb << endl;
+
+  // append lex probs to pt probs
+  vector<float> scores = Tokenize<float>(lexProb);
+
+  if (log_prob) {
+    for (size_t i = 0; i < scores.size(); ++i) {
+      scores[i] = FloorScore(log(scores[i]));
+      if (scores[i] == 0.0f) scores[i] = 0.0000000001;
+    }
+  }
+
+  retvector.insert(retvector.end(), scores.begin(), scores.end());
+
+  // exclude LexRO property from property column
+  const size_t restPos = closed ? endPos + closeTag.size() : prop.size();
+  prop = prop.substr(0, startPos) + prop.substr(restPos);
+}
+
+} /* namespace Moses2 */
diff --git a/moses2/TranslationModel/ProbingPT/StoreTarget.h b/moses2/TranslationModel/ProbingPT/StoreTarget.h
new file mode 100644
index 000000000..6fc3b1f66
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/StoreTarget.h
@@ -0,0 +1,51 @@
+/*
+ * StoreTarget.h
+ *
+ * Created on: 19 Jan 2016
+ * Author: hieu
+ */
+#pragma once
+#include <string>
+#include <fstream>
+#include <vector>
+#include <inttypes.h>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "StoreVocab.h"
+
+namespace Moses2
+{
+
+// Forward declarations. Both types are defined with the 'struct' keyword in
+// line_splitter.hh, so the same class-key is used here; mixing 'class' and
+// 'struct' for one type triggers mismatched-tag diagnostics on some compilers.
+struct line_text;
+struct target_text;
+
+// Accumulates the target side of phrase-table lines while a ProbingPT is
+// being built; createProbingPT (storing.cpp) drives it through Append(),
+// Save() and SaveAlignment().
+class StoreTarget
+{
+public:
+ StoreTarget(const std::string &basepath);
+ virtual ~StoreTarget();
+
+ // Returns a 64-bit value which createProbingPT stores as the hash-table
+ // entry value of the source phrase whose lines were appended beforehand.
+ uint64_t Save();
+ void SaveAlignment();
+
+ // Processes one split phrase-table line; the resulting target_text is
+ // queued in m_coll.
+ void Append(const line_text &line, bool log_prob, bool scfg);
+protected:
+ std::string m_basePath;
+ std::fstream m_fileTargetColl;
+ StoreVocab<uint32_t> m_vocab;
+
+ // Alignment vector -> id, ids being handed out sequentially by GetAlignId().
+ typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
+ Alignments m_aligns;
+
+ // Rules queued by Append().
+ // NOTE(review): held as raw pointers; presumably released by Save() - confirm.
+ std::vector<target_text*> m_coll;
+
+ // Looks up the id of an alignment, registering it if it is new.
+ uint32_t GetAlignId(const std::vector<size_t> &align);
+ void Save(const target_text &rule);
+
+ // Moves the scores of a "{{LexRO ...}}" property from prop into retvector.
+ void AppendLexRO(std::string &prop, std::vector<float> &retvector,
+ bool log_prob) const;
+
+};
+
+} /* namespace Moses2 */
+
diff --git a/moses2/TranslationModel/ProbingPT/StoreVocab.cpp b/moses2/TranslationModel/ProbingPT/StoreVocab.cpp
new file mode 100644
index 000000000..e0b5b0b08
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/StoreVocab.cpp
@@ -0,0 +1,13 @@
+/*
+ * StoreVocab.cpp
+ *
+ * Created on: 15 Jun 2016
+ * Author: hieu
+ */
+#include <fstream>
+#include "StoreVocab.h"
+
+namespace Moses2
+{
+
+} /* namespace Moses2 */
diff --git a/moses2/TranslationModel/ProbingPT/StoreVocab.h b/moses2/TranslationModel/ProbingPT/StoreVocab.h
new file mode 100644
index 000000000..e9808707a
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/StoreVocab.h
@@ -0,0 +1,64 @@
+/*
+ * StoreVocab.h
+ *
+ * Created on: 15 Jun 2016
+ * Author: hieu
+ */
+#pragma once
+#include <string>
+#include <boost/unordered_map.hpp>
+#include "../../legacy/OutputFileStream.h"
+#include "../../legacy/Util2.h"
+
+namespace Moses2
+{
+
+// Vocabulary that maps a word to an id of type VOCABID and can be written to
+// a text file with one "word<TAB>id" pair per line.
+template<typename VOCABID>
+class StoreVocab
+{
+protected:
+  // file written by Save()
+  std::string m_path;
+
+  typedef boost::unordered_map<std::string, VOCABID> Coll;
+  Coll m_vocab;
+
+public:
+  StoreVocab(const std::string &path)
+    :m_path(path)
+  {}
+
+  virtual ~StoreVocab() {}
+
+  // Return the id of word. An unknown word is added with the id
+  // (number of stored words + 1).
+  VOCABID GetVocabId(const std::string &word)
+  {
+    typename Coll::const_iterator iter = m_vocab.find(word);
+    if (iter != m_vocab.end()) {
+      return iter->second;
+    }
+
+    const VOCABID ind = m_vocab.size() + 1;
+    m_vocab[word] = ind;
+    return ind;
+  }
+
+  // Store word under a caller-chosen id, replacing any id it had before.
+  void Insert(VOCABID id, const std::string &word)
+  {
+    m_vocab[word] = id;
+  }
+
+  // Write every (word, id) pair to m_path, in the map's iteration order.
+  void Save()
+  {
+    OutputFileStream strme(m_path);
+
+    typename Coll::const_iterator iter;
+    for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) {
+      // '\n' rather than std::endl: no need to flush the stream once per
+      // vocabulary entry, it is closed right after the loop.
+      strme << iter->first << "\t" << iter->second << '\n';
+    }
+
+    strme.Close();
+  }
+};
+
+} /* namespace Moses2 */
+
diff --git a/moses2/TranslationModel/ProbingPT/hash.cpp b/moses2/TranslationModel/ProbingPT/hash.cpp
new file mode 100644
index 000000000..aab5ee2b3
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/hash.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include "hash.hh"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Hash the bytes of text with util::MurmurHashNative and return the result.
+uint64_t getHash(StringPiece text)
+{
+ std::size_t len = text.size();
+ uint64_t key = util::MurmurHashNative(text.data(), len);
+ return key;
+}
+
+// Turn a space-separated phrase into one id per word. The id of a word is the
+// sum of getHash() over its '|'-separated factors.
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
+{
+  typedef util::TokenIter<util::SingleCharacter> CharSplitter;
+
+  std::vector<uint64_t> ids;
+
+  for (CharSplitter wordIt(textin, util::SingleCharacter(' ')); wordIt; ++wordIt) {
+    uint64_t wordId = 0;
+
+    for (CharSplitter factorIt(*wordIt, util::SingleCharacter('|')); factorIt; ++factorIt) {
+      wordId += getHash(*factorIt);
+    }
+
+    ids.push_back(wordId);
+  }
+
+  return ids;
+}
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/hash.hh b/moses2/TranslationModel/ProbingPT/hash.hh
new file mode 100644
index 000000000..78cc27999
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/hash.hh
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "util/string_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/string_piece.hh" //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+#include <vector>
+
+namespace Moses2
+{
+
+//Gets the MurmurHash of the given string
+uint64_t getHash(StringPiece text);
+
+//Gets one id per space-separated word of textin; a word's id is the sum of
+//the hashes of its '|'-separated factors
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
+
+}
diff --git a/moses2/TranslationModel/ProbingPT/line_splitter.cpp b/moses2/TranslationModel/ProbingPT/line_splitter.cpp
new file mode 100644
index 000000000..e4b5e2694
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/line_splitter.cpp
@@ -0,0 +1,103 @@
+#include "line_splitter.hh"
+
+namespace Moses2
+{
+
+// Split one phrase-table line on "|||" and return its trimmed columns.
+// Source phrase, target phrase and scores are always read; word alignment,
+// counts, sparse scores and properties are filled in only while further
+// columns exist, and stay empty otherwise.
+line_text splitLine(const StringPiece &textin, bool scfg)
+{
+  typedef util::TokenIter<util::MultiCharacter> ColumnIter;
+
+  // No SCFG-specific processing is performed at the moment.
+  (void) scfg;
+
+  line_text output;
+  ColumnIter column(textin, util::MultiCharacter("|||"));
+
+  // mandatory columns
+  output.source_phrase = Trim(*column);
+  ++column;
+  output.target_phrase = Trim(*column);
+  ++column;
+  output.prob = Trim(*column);
+
+  // optional columns, in file order; stop at the first one that is missing
+  StringPiece *const optionalColumns[] = {
+    &output.word_align,
+    &output.counts,
+    &output.sparse_score,
+    &output.property
+  };
+  const size_t numOptional = sizeof(optionalColumns) / sizeof(optionalColumns[0]);
+
+  for (size_t i = 0; i < numOptional; ++i) {
+    ++column;
+    if (column == ColumnIter::end()) {
+      break;
+    }
+    *optionalColumns[i] = Trim(*column);
+  }
+
+  return output;
+}
+
+// Convert the leading decimal digits of tok to a number, never reading beyond
+// the end of the piece, and return it truncated to a single byte.
+static unsigned char parseAlignPos(const StringPiece &tok)
+{
+  unsigned int value = 0;
+  const char *cur = tok.data();
+  const char *const end = cur + tok.size();
+  for (; cur != end && *cur >= '0' && *cur <= '9'; ++cur) {
+    value = value * 10 + static_cast<unsigned int>(*cur - '0');
+  }
+  return static_cast<unsigned char>(value);
+}
+
+// Parse a word-alignment column such as "0-0 1-2" into a flat byte vector:
+// entries 0 and 1 are the first pair, 2 and 3 the second, and so on.
+// Returns an empty vector when textin is empty.
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
+{
+  const char delim[] = " ";
+  const char delim2[] = "-";
+  std::vector<unsigned char> output;
+
+  //Case with no word alignments.
+  if (textin.size() == 0) {
+    return output;
+  }
+
+  //Split on space
+  for (util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
+       it; ++it) {
+    //Split on dash (-)
+    util::TokenIter<util::MultiCharacter> itInner(*it,
+        util::MultiCharacter(delim2));
+
+    //Use unsigned char instead of int to save space, as word alignments are
+    //all very small numbers that fit in a single byte.
+    //The pieces are not NUL-terminated, so they are parsed with an explicit
+    //length instead of atoi(), which could run past the end of the token.
+    output.push_back(parseAlignPos(*itInner));
+    ++itInner;
+    output.push_back(parseAlignPos(*itInner));
+  }
+
+  return output;
+}
+
+// Placeholder: the body is empty, so output is left unchanged.
+void reformatSCFG(line_text &output)
+{
+
+}
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/line_splitter.hh b/moses2/TranslationModel/ProbingPT/line_splitter.hh
new file mode 100644
index 000000000..3b086b44a
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/line_splitter.hh
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/file_piece.hh"
+#include <vector>
+#include <cstdlib> //atof
+#include "util/string_piece.hh" //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+#include <vector>
+
+namespace Moses2
+{
+
+//Struct for holding one phrase-table line split into its "|||"-separated
+//columns (filled in by splitLine). The StringPiece members are non-owning
+//views, so they are only usable while the text they were split from is alive.
+struct line_text
+{
+ StringPiece source_phrase;
+ StringPiece target_phrase;
+ StringPiece prob;
+ // The four columns below are optional; splitLine leaves them empty when the
+ // line has no further columns.
+ StringPiece word_align;
+ StringPiece counts;
+ StringPiece sparse_score;
+ StringPiece property;
+ std::string property_to_be_binarized;
+};
+
+//Struct for holding the processed target side of one phrase-table line
+struct target_text
+{
+ std::vector<unsigned int> target_phrase;
+ // Scores; StoreTarget::AppendLexRO appends the LexRO scores here as well.
+ std::vector<float> prob;
+ // Alignment points stored flat: source position followed by target position.
+ std::vector<size_t> word_align_term;
+ std::vector<size_t> word_align_non_term;
+ std::vector<char> counts;
+ std::vector<char> sparse_score;
+ std::vector<char> property;
+
+ // Disabled helper; it still refers to a member called word_all1, which this
+ // struct does not have.
+ /*
+ void Reset()
+ {
+ target_phrase.clear();
+ prob.clear();
+ word_all1.clear();
+ counts.clear();
+ sparse_score.clear();
+ property.clear();
+ }
+ */
+};
+
+//Splits one phrase-table line into its columns.
+//Open question: would it be better to fill a caller-supplied line_text
+//through a pointer instead of returning one by value?
+line_text splitLine(const StringPiece &textin, bool scfg);
+//Currently an empty placeholder.
+void reformatSCFG(line_text &output);
+
+//Parses a "src-tgt src-tgt ..." alignment string into a flat byte vector.
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/probing_hash_utils.cpp b/moses2/TranslationModel/ProbingPT/probing_hash_utils.cpp
new file mode 100644
index 000000000..96c317b65
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/probing_hash_utils.cpp
@@ -0,0 +1,40 @@
+#include <iostream>
+#include "probing_hash_utils.hh"
+#include "util/file.hh"
+
+namespace Moses2
+{
+
+//Open filename, load the whole file into memory with the requested load
+//method and return the start of the loaded data. The descriptor is handed to
+//file and the loaded region to memory.
+char * readTable(const char * filename, util::LoadMethod load_method, util::scoped_fd &file, util::scoped_memory &memory)
+{
+  file.reset(util::OpenReadOrThrow(filename));
+
+  // load everything, starting at offset 0
+  const uint64_t fileSize = util::SizeFile(file.get());
+  MapRead(load_method, file.get(), 0, fileSize, memory);
+
+  char *start = (char*) memory.get();
+  return start;
+}
+
+// Write the size bytes starting at mem to filename as a binary file.
+// Throws util::Exception when the file cannot be opened or written, instead
+// of silently leaving a missing or truncated table behind.
+void serialize_table(char *mem, size_t size, const std::string &filename)
+{
+  std::ofstream os(filename.c_str(), std::ios::binary);
+  UTIL_THROW_IF2(!os, "Cannot open " << filename << " for writing");
+
+  os.write((const char*) &mem[0], size);
+  os.close();
+
+  // close() flushes, so a failed write is visible in the stream state by now
+  UTIL_THROW_IF2(!os, "Failed to write " << size << " bytes to " << filename);
+}
+
+// Combine the word ids of a source phrase into a single hash-table key: the
+// sum (modulo 2^64) of every id shifted left by its position in the phrase.
+//   source_phrase - word ids of the phrase
+//   size          - number of ids in source_phrase
+// Returns 0 for an empty phrase.
+uint64_t getKey(const uint64_t source_phrase[], size_t size)
+{
+  //TOO SLOW
+  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+  uint64_t key = 0;
+  for (size_t i = 0; i < size; i++) {
+    // Shifting a 64-bit value by 64 or more is undefined behaviour, so the
+    // shift count wraps around at 64. Keys of phrases with up to 64 words
+    // are exactly what they were before.
+    key += (source_phrase[i] << (i % 64));
+  }
+  return key;
+}
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses2/TranslationModel/ProbingPT/probing_hash_utils.hh
new file mode 100644
index 000000000..368147807
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/probing_hash_utils.hh
@@ -0,0 +1,55 @@
+#pragma once
+
+#include "util/probing_hash_table.hh"
+
+#include <sys/mman.h>
+#include <boost/functional/hash.hpp>
+#include <fcntl.h>
+#include <fstream>
+
+namespace Moses2
+{
+
+#define API_VERSION 15
+
+//Hash table entry: a 64-bit key together with a 64-bit value.
+//GetKey()/SetKey() give the probing hash table (see the Table typedef) access
+//to the key.
+struct Entry
+{
+ typedef uint64_t Key;
+ Key key;
+
+ Key GetKey() const
+ {
+ return key;
+ }
+
+ void SetKey(Key to)
+ {
+ key = to;
+ }
+
+ // Payload stored with the key; Node::Write stores NONE here.
+ uint64_t value;
+};
+
+//Largest uint64_t value; Node::Write (storing.cpp) uses it as the value of
+//entries it inserts for trie nodes that are not marked done.
+#define NONE std::numeric_limits<uint64_t>::max()
+
+//Define table
+typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
+
+//Writes size bytes starting at mem to filename (binary)
+void serialize_table(char *mem, size_t size, const std::string &filename);
+
+//Loads filename into memory and returns a pointer to the start of the data
+char * readTable(const char * filename, util::LoadMethod load_method, util::scoped_fd &file, util::scoped_memory &memory);
+
+//Key of a source phrase: the sum of its word ids, each shifted left by its position
+uint64_t getKey(const uint64_t source_phrase[], size_t size);
+
+// Fixed-size record describing one target phrase.
+// NOTE(review): presumably the per-phrase header stored in TargetColl.dat;
+// the code that reads and writes it is not part of this file - confirm there.
+struct TargetPhraseInfo
+{
+ // NOTE(review): look like alignment ids as handed out by
+ // StoreTarget::GetAlignId - confirm.
+ uint32_t alignTerm;
+ uint32_t alignNonTerm;
+ uint16_t numWords;
+ uint16_t propLength;
+ uint16_t filler;
+};
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/querying.cpp b/moses2/TranslationModel/ProbingPT/querying.cpp
new file mode 100644
index 000000000..9ea2d8cb6
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/querying.cpp
@@ -0,0 +1,180 @@
+#include "querying.hh"
+#include "util/exception.hh"
+#include "../../legacy/Util2.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Load a binarised ProbingPT from the directory filepath.
+//   filepath    - directory holding config, probing_hash.dat, source_vocabids,
+//                 Alignments.dat and TargetColl.dat
+//   load_method - how the two binary tables are brought into memory
+// Throws util::Exception when a required file is missing or a config line is
+// malformed; terminates the process when the API version or a required config
+// key is missing or does not match.
+QueryEngine::QueryEngine(const char * filepath, util::LoadMethod load_method)
+{
+  //Create filepaths
+  const std::string basepath(filepath);
+  const std::string path_to_config = basepath + "/config";
+  const std::string path_to_hashtable = basepath + "/probing_hash.dat";
+  const std::string path_to_source_vocabid = basepath + "/source_vocabids";
+  const std::string alignPath = basepath + "/Alignments.dat";
+
+  file_exits(basepath);
+
+  ///Source phrase vocabids
+  read_map(source_vocabids, path_to_source_vocabid.c_str());
+
+  // alignments
+  read_alignments(alignPath);
+
+  // target phrase
+  const std::string targetCollPath = basepath + "/TargetColl.dat";
+  memTPS = readTable(targetCollPath.c_str(), load_method, fileTPS_, memoryTPS_);
+
+  //Read config file: one "key<TAB>value" pair per line
+  boost::unordered_map<std::string, std::string> keyValue;
+
+  std::ifstream config(path_to_config.c_str());
+  std::string line;
+  while (getline(config, line)) {
+    std::vector<std::string> toks = Moses2::Tokenize(line, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
+    keyValue[ toks[0] ] = toks[1];
+  }
+  config.close();
+
+  bool found;
+  //Check API version:
+  int version = 0;
+  found = Get(keyValue, "API_VERSION", version);
+  if (!found) {
+    std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
+    // Stop here like every other missing key does; carrying on would load a
+    // table whose format is unknown.
+    exit(EXIT_FAILURE);
+  }
+  else if (version != API_VERSION) {
+    std::cerr << "The ProbingPT API has changed. " << version << "!="
+              << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  //Get tablesize. Read as a 64-bit value: the binariser writes an unsigned
+  //long, which does not fit into an int for very large tables.
+  uint64_t tablesize = 0;
+  found = Get(keyValue, "uniq_entries", tablesize);
+  if (!found) {
+    std::cerr << "uniq_entries not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  //Number of scores
+  found = Get(keyValue, "num_scores", num_scores);
+  if (!found) {
+    std::cerr << "num_scores not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  //How may scores from lex reordering models
+  found = Get(keyValue, "num_lex_scores", num_lex_scores);
+  if (!found) {
+    std::cerr << "num_lex_scores not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // have the scores been log() and FloorScore()?
+  found = Get(keyValue, "log_prob", logProb);
+  if (!found) {
+    std::cerr << "logProb not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  //Read hashtable
+  table_filesize = Table::Size(tablesize, 1.2);
+  mem = readTable(path_to_hashtable.c_str(), load_method, file_, memory_);
+  Table table_init(mem, table_filesize);
+  table = table_init;
+
+  std::cerr << "Initialized successfully! " << std::endl;
+}
+
+// Nothing is released by hand here; the explicit munmap below is disabled.
+QueryEngine::~QueryEngine()
+{
+ //Clear mmap content from memory.
+ //munmap(mem, table_filesize);
+
+}
+
+// Key of a source phrase given as an array of size word ids; forwards to the
+// free function Moses2::getKey.
+uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
+{
+ //TOO SLOW
+ //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+ return Moses2::getKey(source_phrase, size);
+}
+
+// Look key up in the hash table. first tells whether the key was found;
+// second is the value of the matching entry, or 0 when there is none.
+std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
+{
+  const Entry *entry;
+  std::pair<bool, uint64_t> result(table.Find(key, entry), 0);
+
+  if (result.first) {
+    result.second = entry->value;
+  }
+  return result;
+}
+
+// Read the alignment file. Every line holds an alignment id followed by the
+// positions belonging to it, separated by tabs or spaces; the positions are
+// appended to alignColl[id], which is grown on demand.
+void QueryEngine::read_alignments(const std::string &alignPath)
+{
+  std::ifstream strm(alignPath.c_str());
+
+  for (std::string line; getline(strm, line); ) {
+    const std::vector<std::string> toks = Moses2::Tokenize(line, "\t ");
+    UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
+
+    const uint32_t alignInd = Scan<uint32_t>(toks[0]);
+    if (alignColl.size() <= alignInd) {
+      alignColl.resize(alignInd + 1);
+    }
+
+    // everything after the id is a position
+    Alignments &aligns = alignColl[alignInd];
+    std::vector<std::string>::const_iterator tok = toks.begin() + 1;
+    for (; tok != toks.end(); ++tok) {
+      const size_t pos = Scan<size_t>(*tok);
+      aligns.push_back(pos);
+    }
+  }
+}
+
+// Check that every file of a binarised table is present in basePath; throws
+// util::Exception naming the first one that is missing.
+void QueryEngine::file_exits(const std::string &basePath)
+{
+  // checked in this order
+  static const char *const required[] = {
+    "Alignments.dat",
+    "TargetColl.dat",
+    "TargetVocab.dat",
+    "cache",
+    "config",
+    "probing_hash.dat",
+    "source_vocabids"
+  };
+  const size_t numRequired = sizeof(required) / sizeof(required[0]);
+
+  for (size_t i = 0; i < numRequired; ++i) {
+    const std::string path = basePath + "/" + required[i];
+    if (!FileExists(path)) {
+      UTIL_THROW2("Require file does not exist in: " << path);
+    }
+  }
+}
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/querying.hh b/moses2/TranslationModel/ProbingPT/querying.hh
new file mode 100644
index 000000000..dcdd2a75a
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/querying.hh
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <boost/unordered_map.hpp>
+#include <sys/stat.h> //For finding size of file
+#include "vocabid.hh"
+#include <algorithm> //toLower
+#include <deque>
+#include "probing_hash_utils.hh"
+#include "hash.hh" //Includes line splitter
+#include "line_splitter.hh"
+#include "../../legacy/Util2.h"
+
+namespace Moses2
+{
+
+// Read-only access to a binarised ProbingPT directory: the constructor loads
+// the config, the source vocabulary, the alignments, the target phrase
+// collection and the probing hash table; query() then looks up keys.
+class QueryEngine
+{
+ // Source vocabulary as read from the source_vocabids file (id -> word).
+ std::map<uint64_t, std::string> source_vocabids;
+
+ // One list of positions per alignment id, filled by read_alignments().
+ typedef std::vector<unsigned char> Alignments;
+ std::vector<Alignments> alignColl;
+
+ Table table;
+ char *mem; //Memory for the table, necessary so that we can correctly destroy the object
+
+ size_t table_filesize;
+ // NOTE(review): not set or read by any member function defined in querying.cpp.
+ bool is_reordering;
+
+ // File descriptor and memory of probing_hash.dat.
+ util::scoped_fd file_;
+ util::scoped_memory memory_;
+
+ // target phrases
+ // NOTE(review): 'file' is not used by any member function defined in querying.cpp.
+ boost::iostreams::mapped_file_source file;
+
+ // File descriptor and memory of TargetColl.dat.
+ util::scoped_fd fileTPS_;
+ util::scoped_memory memoryTPS_;
+
+ void read_alignments(const std::string &alignPath);
+ // Throws if one of the required table files is missing from basePath.
+ void file_exits(const std::string &basePath);
+
+public:
+ // Values of the config keys num_scores, num_lex_scores and log_prob.
+ int num_scores;
+ int num_lex_scores;
+ bool logProb;
+ // Start of the loaded TargetColl.dat data.
+ const char *memTPS;
+
+ QueryEngine(const char *, util::LoadMethod load_method);
+ ~QueryEngine();
+
+ // Looks key up in the hash table: (found, value of the entry).
+ std::pair<bool, uint64_t> query(uint64_t key);
+
+ const std::map<uint64_t, std::string> &getSourceVocab() const
+ { return source_vocabids; }
+
+ const std::vector<Alignments> &getAlignments() const
+ { return alignColl; }
+
+ // Key of a source phrase given as an array of size word ids.
+ uint64_t getKey(uint64_t source_phrase[], size_t size) const;
+
+ // Fetch the config value stored under 'sought' and convert it to T with
+ // Scan<T>. Returns false, leaving 'found' untouched, when the key is absent.
+ template<typename T>
+ inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const
+ {
+ boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
+ if (iter == keyValue.end()) {
+ return false;
+ }
+
+ const std::string &foundStr = iter->second;
+ found = Scan<T>(foundStr);
+ return true;
+ }
+
+};
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/storing.cpp b/moses2/TranslationModel/ProbingPT/storing.cpp
new file mode 100644
index 000000000..75cdcc038
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/storing.cpp
@@ -0,0 +1,303 @@
+#include <sys/stat.h>
+#include <boost/foreach.hpp>
+#include "line_splitter.hh"
+#include "storing.hh"
+#include "StoreTarget.h"
+#include "StoreVocab.h"
+#include "../../legacy/Util2.h"
+#include "../../legacy/InputFileStream.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+///////////////////////////////////////////////////////////////////////
+// Walk (and extend) the trie along sourcePhrase, starting at word pos.
+// When a word needs a child that does not exist yet, all current children are
+// first written to table and dropped, then the new child is created with the
+// key of this node plus the word id shifted left by pos. The node reached at
+// the end of the phrase is marked done.
+void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
+{
+  if (pos >= sourcePhrase.size()) {
+    // this node was written previously 'cos it has rules
+    done = true;
+    return;
+  }
+
+  const uint64_t vocabId = sourcePhrase[pos];
+  Children::iterator found = m_children.find(vocabId);
+
+  Node *child;
+  if (found != m_children.end()) {
+    child = &found->second;
+  }
+  else {
+    // New node. Write other children then discard them
+    for (Children::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+      it->second.Write(table);
+    }
+    m_children.clear();
+
+    // create new node
+    child = &m_children[vocabId];
+    assert(!child->done);
+    child->key = key + (vocabId << pos);
+  }
+
+  child->Add(table, sourcePhrase, pos + 1);
+}
+
+// Write the subtree rooted at this node: children first, then the node
+// itself, as an entry with value NONE, unless it is marked done.
+void Node::Write(Table &table)
+{
+  for (Children::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    it->second.Write(table);
+  }
+
+  if (done) {
+    return;
+  }
+
+  // save
+  Entry sourceEntry;
+  sourceEntry.value = NONE;
+  sourceEntry.key = key;
+
+  //Put into table
+  table.Insert(sourceEntry);
+}
+
+///////////////////////////////////////////////////////////////////////
+// Insert the hash-table entry of a finished source phrase.
+//   source      - the source phrase text
+//   targetInd   - value returned by StoreTarget::Save() for its target phrases
+//   addPrefixes - when true the phrase is first added to the prefix trie
+static void InsertSourceEntry(Table &sourceEntries, Node &sourcePhrases,
+                              const std::string &source, uint64_t targetInd,
+                              bool addPrefixes)
+{
+  //The key is the sum of hashes of individual words bitshifted by their position in the phrase.
+  //Probably not entirerly correct, but fast and seems to work fine in practise.
+  const std::vector<uint64_t> vocabid_source = getVocabIDs(source);
+  if (addPrefixes) {
+    // storing prefixes?
+    sourcePhrases.Add(sourceEntries, vocabid_source);
+  }
+
+  Entry sourceEntry;
+  sourceEntry.value = targetInd;
+  sourceEntry.key = getKey(vocabid_source);
+
+  //Put into table
+  sourceEntries.Insert(sourceEntry);
+}
+
+// Binarise a text phrase table into the ProbingPT directory basepath.
+//   phrasetable_path - input phrase table; lines with the same source phrase
+//                      are expected to be adjacent
+//   basepath         - output directory, created if missing
+//   num_scores, num_lex_scores, log_prob - recorded in the config file;
+//                      log_prob is also passed on to StoreTarget::Append
+//   max_cache_size   - 0 disables the source-phrase cache, a positive value
+//                      caps its size, a negative value leaves it unbounded
+//   scfg             - when true, prefixes of source phrases are stored too
+void createProbingPT(const std::string &phrasetable_path,
+                     const std::string &basepath, int num_scores, int num_lex_scores,
+                     bool log_prob, int max_cache_size, bool scfg)
+{
+  typedef std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> Cache;
+
+  std::cerr << "Starting..." << std::endl;
+
+  //Get basepath and create directory if missing
+  mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+
+  StoreTarget storeTarget(basepath);
+
+  //Get uniq lines:
+  unsigned long uniq_entries = countUniqueSource(phrasetable_path);
+
+  //Source phrase vocabids
+  StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
+
+  //Read the file
+  util::FilePiece filein(phrasetable_path.c_str());
+
+  //Init the probing hash table. The zero-filled buffer is owned by a vector
+  //so that it is released even when an exception leaves this function.
+  size_t size = Table::Size(uniq_entries, 1.2);
+  std::vector<char> mem(size);
+  Table sourceEntries(mem.data(), size);
+
+  Cache cache;
+  float totalSourceCount = 0;
+
+  //Keep track of the size of each group of target phrases
+  size_t line_num = 0;
+
+  //Read everything and processs
+  std::string prevSource;
+
+  Node sourcePhrases;
+  sourcePhrases.done = true;
+  sourcePhrases.key = 0;
+
+  while (true) {
+    try {
+      //Process line read
+      line_text line;
+      line = splitLine(filein.ReadLine(), scfg);
+      //cerr << "line=" << line.source_phrase << endl;
+
+      ++line_num;
+      if (line_num % 1000000 == 0) {
+        std::cerr << line_num << " " << std::flush;
+      }
+
+      //Add source phrases to vocabularyIDs
+      add_to_map(sourceVocab, line.source_phrase);
+
+      if (prevSource.empty()) {
+        // 1st line
+        prevSource = line.source_phrase.as_string();
+        storeTarget.Append(line, log_prob, scfg);
+      }
+      else if (prevSource == line.source_phrase) {
+        //If we still have the same line, just append to it:
+        storeTarget.Append(line, log_prob, scfg);
+      }
+      else {
+        //The source phrase changed: flush the previous one.
+
+        // save
+        uint64_t targetInd = storeTarget.Save();
+
+        // next line
+        storeTarget.Append(line, log_prob, scfg);
+
+        //Create an entry for the previous source phrase:
+        InsertSourceEntry(sourceEntries, sourcePhrases, prevSource, targetInd, scfg);
+
+        // update cache - CURRENT source phrase, not prev
+        if (max_cache_size) {
+          std::string countStr = line.counts.as_string();
+          countStr = Trim(countStr);
+          if (!countStr.empty()) {
+            std::vector<float> toks = Tokenize<float>(countStr);
+            //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
+
+            if (toks.size() >= 2) {
+              totalSourceCount += toks[1];
+
+              // compute key for CURRENT source
+              std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
+              uint64_t currKey = getKey(currVocabidSource);
+
+              CacheItem *item = new CacheItem(
+                Trim(line.source_phrase.as_string()),
+                currKey,
+                toks[1]);
+              cache.push(item);
+
+              if (max_cache_size > 0
+                  && cache.size() > static_cast<size_t>(max_cache_size)) {
+                // The queue only holds pointers, so the evicted item has to
+                // be freed here; pop() alone would leak it.
+                delete cache.top();
+                cache.pop();
+              }
+            }
+          }
+        }
+
+        //Set prevLine
+        prevSource = line.source_phrase.as_string();
+      }
+
+    }
+    catch (const util::EndOfFileException &) {
+      std::cerr
+          << "Reading phrase table finished, writing remaining files to disk."
+          << std::endl;
+
+      //After the final entry is constructed we need to add it to the phrase_table
+      //Create an entry for the previous source phrase:
+      uint64_t targetInd = storeTarget.Save();
+
+      // NOTE(review): unlike the in-loop flush, the last source phrase is not
+      // added to the prefix trie even when scfg is set - confirm this is
+      // intended. Behaviour kept as it was.
+      InsertSourceEntry(sourceEntries, sourcePhrases, prevSource, targetInd, false);
+
+      break;
+    }
+  }
+
+  sourcePhrases.Write(sourceEntries);
+
+  storeTarget.SaveAlignment();
+
+  serialize_table(mem.data(), size, (basepath + "/probing_hash.dat"));
+
+  sourceVocab.Save();
+
+  serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
+
+  //Write configfile
+  std::ofstream configfile;
+  configfile.open((basepath + "/config").c_str());
+  configfile << "API_VERSION\t" << API_VERSION << '\n';
+  configfile << "uniq_entries\t" << uniq_entries << '\n';
+  configfile << "num_scores\t" << num_scores << '\n';
+  configfile << "num_lex_scores\t" << num_lex_scores << '\n';
+  configfile << "log_prob\t" << log_prob << '\n';
+  configfile.close();
+}
+
+// Count how many times the first "|||"-separated column (the source phrase)
+// changes from one line of the file at path to the next.
+size_t countUniqueSource(const std::string &path)
+{
+  InputFileStream strme(path);
+
+  size_t numUnique = 0;
+  std::string prevSource;
+
+  for (std::string line; std::getline(strme, line); ) {
+    const std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
+    assert(toks.size() != 0);
+
+    const std::string &source = toks[0];
+    if (source != prevSource) {
+      prevSource = source;
+      ++numUnique;
+    }
+  }
+
+  return numUnique;
+}
+
+// Write the cache to path and free its items. The first line holds
+// totalSourceCount; it is followed by one "count<TAB>key<TAB>source" line per
+// item, in the reverse of the order in which the queue yields them. The queue
+// is empty afterwards.
+void serialize_cache(
+  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+  const std::string &path, float totalSourceCount)
+{
+  // drain the queue, remembering the pop order
+  std::vector<const CacheItem*> popOrder;
+  popOrder.reserve(cache.size());
+  while (!cache.empty()) {
+    popOrder.push_back(cache.top());
+    cache.pop();
+  }
+
+  std::ofstream os(path.c_str());
+
+  os << totalSourceCount << std::endl;
+  for (size_t remaining = popOrder.size(); remaining > 0; --remaining) {
+    const CacheItem *item = popOrder[remaining - 1];
+    os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
+    delete item;
+  }
+
+  os.close();
+}
+
+// Convenience overload: key of a source phrase given as a vector of word ids;
+// forwards to Moses2::getKey(const uint64_t[], size_t).
+uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
+{
+ return Moses2::getKey(vocabid_source.data(), vocabid_source.size());
+}
+
+// Return a copy of the first endPos + 1 ids of vocabid_source.
+// endPos must be a valid index into vocabid_source.
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
+{
+  assert(endPos < vocabid_source.size());
+
+  const size_t prefixLength = endPos + 1;
+  return std::vector<uint64_t>(vocabid_source.begin(),
+                               vocabid_source.begin() + prefixLength);
+}
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/storing.hh b/moses2/TranslationModel/ProbingPT/storing.hh
new file mode 100644
index 000000000..10d7050d3
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/storing.hh
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <cstdio>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <queue>
+#include <sys/stat.h> //mkdir
+
+#include "hash.hh" //Includes line_splitter
+#include "probing_hash_utils.hh"
+
+#include "util/file_piece.hh"
+#include "util/file.hh"
+#include "vocabid.hh"
+
+namespace Moses2
+{
+typedef std::vector<uint64_t> SourcePhrase;
+
+
+// Node of the prefix trie that createProbingPT uses to add entries for the
+// prefixes of source phrases to the hash table.
+class Node
+{
+  // children, indexed by the id of the next word
+  typedef boost::unordered_map<uint64_t, Node> Children;
+  Children m_children;
+
+public:
+  // hash-table key of the phrase that ends at this node
+  uint64_t key;
+  // true when Write() must not insert an entry for this node
+  bool done;
+
+  // key is initialised too, so that a freshly created node never carries an
+  // indeterminate value.
+  Node()
+    :key(0)
+    ,done(false)
+  {}
+
+  // Walks sourcePhrase from word pos on, creating missing nodes (children made
+  // obsolete by a new branch are written to table and dropped); the node at
+  // the end of the phrase is marked done.
+  void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
+  // Writes the children, then this node with value NONE unless it is done.
+  void Write(Table &table);
+};
+
+
+// Binarises the text phrase table phrasetable_path into the directory basepath.
+void createProbingPT(const std::string &phrasetable_path,
+ const std::string &basepath, int num_scores, int num_lex_scores,
+ bool log_prob, int max_cache_size, bool scfg);
+// Hash-table key of a source phrase given as a vector of word ids.
+uint64_t getKey(const std::vector<uint64_t> &source_phrase);
+
+// Copy of the first endPos + 1 ids of vocabid_source.
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
+
+// Render the elements of vec as text, each one followed by a single space.
+template<typename T>
+std::string Debug(const std::vector<T> &vec)
+{
+  std::stringstream joined;
+  typename std::vector<T>::const_iterator it = vec.begin();
+  for (; it != vec.end(); ++it) {
+    joined << *it << " ";
+  }
+  return joined.str();
+}
+
+size_t countUniqueSource(const std::string &path);
+
+// One candidate for the source-phrase cache: the phrase text, its hash-table
+// key and the count it was seen with.
+class CacheItem
+{
+public:
+ std::string source;
+ uint64_t sourceKey;
+ float count;
+ CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
+ :source(vSource)
+ ,sourceKey(vSourceKey)
+ ,count(vCount)
+ {
+ }
+
+ // Reversed on purpose: an item compares "less" when its count is HIGHER,
+ // which makes the std::priority_queue used for the cache keep the item with
+ // the lowest count on top.
+ bool operator<(const CacheItem &other) const
+ {
+ return count > other.count;
+ }
+};
+
+// Comparator for containers of CacheItem pointers: compares the pointed-to
+// items with CacheItem::operator< rather than the pointer values.
+class CacheItemOrderer
+{
+public:
+ bool operator()(const CacheItem* a, const CacheItem* b) const
+ {
+ return (*a) < (*b);
+ }
+};
+
+void serialize_cache(
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+ const std::string &path, float totalSourceCount);
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/vocabid.cpp b/moses2/TranslationModel/ProbingPT/vocabid.cpp
new file mode 100644
index 000000000..696373ee5
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/vocabid.cpp
@@ -0,0 +1,59 @@
+#include <boost/foreach.hpp>
+#include "vocabid.hh"
+#include "StoreVocab.h"
+#include "../../legacy/Util2.h"
+
+namespace Moses2
+{
+
+// Register every '|'-separated factor of every space-separated word of
+// textin in sourceVocab, using the hash of the factor as its id.
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+                const StringPiece &textin)
+{
+  typedef util::TokenIter<util::SingleCharacter> CharSplitter;
+
+  for (CharSplitter wordIt(textin, util::SingleCharacter(' ')); wordIt; ++wordIt) {
+    for (CharSplitter factorIt(*wordIt, util::SingleCharacter('|')); factorIt; ++factorIt) {
+      const StringPiece &factor = *factorIt;
+      sourceVocab.Insert(getHash(factor), factor.as_string());
+    }
+  }
+}
+
+// Write karta to filename as text, one "id<TAB>word" line per entry, in
+// ascending id order.
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+                   const std::string &filename)
+{
+  typedef std::map<uint64_t, std::string>::const_iterator EntryIter;
+
+  std::ofstream os(filename.c_str());
+
+  for (EntryIter entry = karta.begin(); entry != karta.end(); ++entry) {
+    const uint64_t id = entry->first;
+    const std::string &word = entry->second;
+    os << id << '\t' << word << std::endl;
+  }
+
+  os.close();
+}
+
+// Read a vocabulary file made of "word<TAB>id" lines into karta (id -> word).
+// Throws util::Exception on a line that does not consist of exactly two
+// tab-separated fields.
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
+{
+  std::ifstream is(filename);
+
+  std::string line;
+  while (getline(is, line)) {
+    const std::vector<std::string> toks = Tokenize(line, "\t");
+    // Checked at run time rather than with assert(): with NDEBUG defined a
+    // malformed line would otherwise lead to an out-of-range read of toks[1].
+    UTIL_THROW_IF2(toks.size() != 2,
+                   "Wrong vocabulary format in " << filename << ":" << line);
+
+    const uint64_t ind = Scan<uint64_t>(toks[1]);
+    karta[ind] = toks[0];
+  }
+
+  //Close the stream after we are done.
+  is.close();
+}
+
+}
+
diff --git a/moses2/TranslationModel/ProbingPT/vocabid.hh b/moses2/TranslationModel/ProbingPT/vocabid.hh
new file mode 100644
index 000000000..55d99d453
--- /dev/null
+++ b/moses2/TranslationModel/ProbingPT/vocabid.hh
@@ -0,0 +1,29 @@
+//Serialization
+#include <boost/serialization/serialization.hpp>
+#include <boost/serialization/map.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include <map> //Container
+#include "hash.hh" //Hash of elements
+
+#include "util/string_piece.hh" //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+
+namespace Moses2
+{
+// Defined in StoreVocab.h; only declared here.
+template<typename VOCABID>
+class StoreVocab;
+
+// Registers every factor of every word of textin in sourceVocab, keyed by its hash.
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+ const StringPiece &textin);
+
+// Writes karta to filename as "id<TAB>word" lines.
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+ const std::string &filename);
+
+// Reads "word<TAB>id" lines from filename into karta (id -> word).
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
+
+}
diff --git a/moses2/TranslationModel/Transliteration.cpp b/moses2/TranslationModel/Transliteration.cpp
new file mode 100644
index 000000000..f92348ee9
--- /dev/null
+++ b/moses2/TranslationModel/Transliteration.cpp
@@ -0,0 +1,229 @@
+/*
+ * Transliteration.cpp
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "Transliteration.h"
+#include "../System.h"
+#include "../Scores.h"
+#include "../InputType.h"
+#include "../PhraseBased/Manager.h"
+#include "../PhraseBased/TargetPhraseImpl.h"
+#include "../PhraseBased/InputPath.h"
+#include "../PhraseBased/TargetPhrases.h"
+#include "../PhraseBased/Sentence.h"
+#include "../SCFG/InputPath.h"
+#include "../SCFG/TargetPhraseImpl.h"
+#include "../SCFG/Manager.h"
+#include "../SCFG/Sentence.h"
+#include "../SCFG/ActiveChart.h"
+#include "util/tempfile.hh"
+#include "../legacy/Util2.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Build the table from its configuration line and check that every
+// transliteration-specific setting was supplied.
+// Raises (through UTIL_THROW_IF2) with "Must specify all arguments" when any
+// of m_mosesDir, m_scriptDir, m_externalDir, m_inputLang or m_outputLang is
+// still empty after the parameters were read.
+// NOTE(review): m_filePath is not part of this check.
+Transliteration::Transliteration(size_t startInd, const std::string &line) :
+ PhraseTable(startInd, line)
+{
+ // presumably feeds each key=value of the line to SetParameter() — TODO confirm
+ ReadParameters();
+ UTIL_THROW_IF2(m_mosesDir.empty() ||
+ m_scriptDir.empty() ||
+ m_externalDir.empty() ||
+ m_inputLang.empty() ||
+ m_outputLang.empty(), "Must specify all arguments");
+}
+
+// Destructor: the body is empty, nothing is released explicitly here.
+Transliteration::~Transliteration()
+{
+ // intentionally empty
+}
+
+/**
+ * Store one configuration setting.
+ *
+ * The keys "moses-dir", "script-dir", "external-dir", "input-lang" and
+ * "output-lang" are kept in the matching member; every other key is passed
+ * on to PhraseTable::SetParameter().
+ *
+ * @param key   name of the setting.
+ * @param value its value, stored verbatim.
+ */
+void
+Transliteration::
+SetParameter(const std::string& key, const std::string& value)
+{
+  // Pick the member that backs this key, if it is one of ours.
+  std::string *slot = NULL;
+  if (key == "moses-dir") {
+    slot = &m_mosesDir;
+  } else if (key == "script-dir") {
+    slot = &m_scriptDir;
+  } else if (key == "external-dir") {
+    slot = &m_externalDir;
+  } else if (key == "input-lang") {
+    slot = &m_inputLang;
+  } else if (key == "output-lang") {
+    slot = &m_outputLang;
+  }
+
+  if (slot == NULL) {
+    PhraseTable::SetParameter(key, value);
+    return;
+  }
+  *slot = value;
+}
+
+/**
+ * Run the single-path lookup for every input path that passes
+ * SatisfyBackoff() and attach the result to that path.
+ *
+ * @param mgr        decoding manager; also supplies the memory pool.
+ * @param inputPaths paths to visit; each element is used as an InputPath.
+ */
+void Transliteration::Lookup(const Manager &mgr,
+    InputPathsBase &inputPaths) const
+{
+  BOOST_FOREACH(InputPathBase *basePath, inputPaths) {
+    InputPath &current = *static_cast<InputPath*>(basePath);
+    if (!SatisfyBackoff(mgr, current)) {
+      continue;
+    }
+
+    TargetPhrases *found = Lookup(mgr, mgr.GetPool(), current);
+    current.AddTargetPhrases(*this, found);
+  }
+}
+
+/**
+ * Transliterate one source phrase by running the external
+ * prepare-transliteration-phrase-table.pl script.
+ *
+ * The phrase is written to a temporary file, the script is run with
+ * system(), and CreateTargetPhrases() turns the script's output directory
+ * into target phrases.
+ *
+ * @param mgr       decoding manager; supplies the System and feature functions.
+ * @param pool      pool from which the returned collection is allocated.
+ * @param inputPath path whose subPhrase is transliterated.
+ * @return the pool-allocated collection. It is NOT attached to inputPath
+ *         here: Lookup(mgr, inputPaths) attaches whatever this function
+ *         returns, which is also how UnknownWordPenalty::Lookup behaves.
+ *         Raises through UTIL_THROW_IF2 when the script exits non-zero.
+ */
+TargetPhrases *Transliteration::Lookup(const Manager &mgr, MemPool &pool,
+    InputPath &inputPath) const
+{
+  const SubPhrase<Moses2::Word> &sourcePhrase = inputPath.subPhrase;
+
+  // TRANSLITERATE
+  const util::temp_file inFile;
+  const util::temp_dir outDir;
+
+  ofstream inStream(inFile.path().c_str());
+  inStream << sourcePhrase.Debug(mgr.system) << endl;
+  inStream.close();
+
+  // NOTE(review): the arguments are concatenated unquoted into a shell
+  // command; a path containing spaces or shell metacharacters will break it.
+  string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
+               " --transliteration-model-dir " + m_filePath +
+               " --moses-src-dir " + m_mosesDir +
+               " --external-bin-dir " + m_externalDir +
+               " --input-extension " + m_inputLang +
+               " --output-extension " + m_outputLang +
+               " --oov-file " + inFile.path() +
+               " --out-dir " + outDir.path();
+
+  int ret = system(cmd.c_str());
+  UTIL_THROW_IF2(ret != 0, "Transliteration script error");
+
+  vector<TargetPhraseImpl*> targetPhrases
+  = CreateTargetPhrases(mgr, pool, sourcePhrase, outDir.path());
+
+  // The collection used to be created with a hard-coded size of 1 although
+  // the script may produce several candidates. Assuming the second
+  // constructor argument is a capacity (TODO confirm), size it to the real
+  // count, keeping 1 as the minimum so the empty case is unchanged.
+  const size_t capacity = targetPhrases.empty() ? 1 : targetPhrases.size();
+  TargetPhrases *tps =
+    new (pool.Allocate<TargetPhrases>()) TargetPhrases(pool, capacity);
+
+  vector<TargetPhraseImpl*>::const_iterator iter;
+  for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
+    TargetPhraseImpl *tp = *iter;
+    tps->AddTargetPhrase(*tp);
+  }
+  mgr.system.featureFunctions.EvaluateAfterTablePruning(pool, *tps, sourcePhrase);
+
+  // The function is declared to return the collection and its caller uses
+  // the result; falling off the end without a return is undefined behaviour.
+  return tps;
+}
+
+/**
+ * Turn the transliteration script's result file into target phrases.
+ *
+ * Reads <outDir>/out.txt line by line. Every line must be "word\tscore";
+ * anything else raises through UTIL_THROW_IF2. Each line yields a one-word
+ * target phrase carrying that score, which is then scored by the other
+ * feature functions via EvaluateInIsolation().
+ *
+ * @param mgr          decoding manager; supplies the System.
+ * @param pool         pool from which the phrases are allocated.
+ * @param sourcePhrase source side handed to the feature functions.
+ * @param outDir       directory that holds out.txt.
+ * @return the phrases in file order; empty when the file has no lines or
+ *         cannot be opened.
+ */
+std::vector<TargetPhraseImpl*> Transliteration::CreateTargetPhrases(
+    const Manager &mgr,
+    MemPool &pool,
+    const SubPhrase<Moses2::Word> &sourcePhrase,
+    const std::string &outDir) const
+{
+  std::vector<TargetPhraseImpl*> phrases;
+
+  const string resultFile = outDir + "/out.txt";
+  ifstream results(resultFile.c_str());
+
+  for (string row; getline(results, row);) {
+    vector<string> toks = Moses2::Tokenize(row, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");
+
+    TargetPhraseImpl *candidate =
+      new (pool.Allocate<TargetPhraseImpl>()) TargetPhraseImpl(pool, *this, mgr.system, 1);
+    Moses2::Word &onlyWord = (*candidate)[0];
+    onlyWord.CreateFromString(mgr.system.GetVocab(), mgr.system, toks[0]);
+
+    const float score = Scan<float>(toks[1]);
+    candidate->GetScores().PlusEquals(mgr.system, *this, score);
+
+    // let every other feature function score the freshly built rule
+    mgr.system.featureFunctions.EvaluateInIsolation(pool, mgr.system, sourcePhrase, *candidate);
+
+    phrases.push_back(candidate);
+  }
+
+  results.close();
+  return phrases;
+}
+
+
+// Not supported by this table: always raises "Not implemented" through
+// UTIL_THROW2. None of the arguments are read.
+void Transliteration::EvaluateInIsolation(const System &system,
+ const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG ///////////////////////////////////////////////////////////////////////////////////////////
+// SCFG entry point that this table does not support: always raises
+// "Not implemented" through UTIL_THROW2.
+void Transliteration::InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG lookup is not supported by this table: always raises
+// "Not implemented" through UTIL_THROW2.
+void Transliteration::Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG unary lookup is not supported by this table: always raises
+// "Not implemented" through UTIL_THROW2.
+void Transliteration::LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG non-terminal lookup is not supported by this table: always raises
+// "Not implemented" through UTIL_THROW2.
+void Transliteration::LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG lookup for a given word is not supported by this table: always raises
+// "Not implemented" through UTIL_THROW2.
+void Transliteration::LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG lookup from a given active-chart entry is not supported by this
+// table: always raises "Not implemented" through UTIL_THROW2.
+void Transliteration::LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+}
+
diff --git a/moses2/TranslationModel/Transliteration.h b/moses2/TranslationModel/Transliteration.h
new file mode 100644
index 000000000..15f262ac8
--- /dev/null
+++ b/moses2/TranslationModel/Transliteration.h
@@ -0,0 +1,91 @@
+/*
+ * Transliteration.h
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+
+#pragma once
+
+#include "PhraseTable.h"
+
+namespace Moses2
+{
+class Sentence;
+class InputPaths;
+class Range;
+
+// Phrase table that produces translations by calling an external
+// transliteration script for a source phrase. Only the phrase-based lookups
+// are implemented; the SCFG entry points and EvaluateInIsolation() raise
+// "Not implemented".
+class Transliteration: public PhraseTable
+{
+public:
+ // Reads the parameters from the configuration line; raises when one of the
+ // directory / language settings is missing.
+ Transliteration(size_t startInd, const std::string &line);
+ virtual ~Transliteration();
+
+ // Phrase-based: look up every path that passes the backoff check.
+ void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const;
+ // Phrase-based: transliterate the phrase of a single path.
+ virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const;
+
+ // Always raises "Not implemented".
+ virtual void
+ EvaluateInIsolation(const System &system, const Phrase<Moses2::Word> &source,
+ const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const;
+
+ // SCFG entry points: all of them raise "Not implemented".
+ virtual void InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const;
+
+ void Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+ void LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+protected:
+ // SCFG helpers: all of them raise "Not implemented".
+ virtual void LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const;
+
+ virtual void LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ virtual void LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ // Handles moses-dir, script-dir, external-dir, input-lang and output-lang;
+ // other keys go to PhraseTable::SetParameter().
+ void SetParameter(const std::string& key, const std::string& value);
+
+protected:
+ // Passed to the script as --transliteration-model-dir.
+ // NOTE(review): nothing in Transliteration.cpp assigns this member —
+ // confirm where it is meant to be set.
+ std::string m_filePath;
+ // Values of moses-dir, script-dir, external-dir, input-lang, output-lang.
+ std::string m_mosesDir, m_scriptDir, m_externalDir, m_inputLang, m_outputLang;
+
+ // Builds one target phrase per line of <outDir>/out.txt.
+ std::vector<TargetPhraseImpl*> CreateTargetPhrases(
+ const Manager &mgr,
+ MemPool &pool,
+ const SubPhrase<Moses2::Word> &sourcePhrase,
+ const std::string &outDir) const;
+
+};
+
+}
+
diff --git a/moses2/TranslationModel/UnknownWordPenalty.cpp b/moses2/TranslationModel/UnknownWordPenalty.cpp
new file mode 100644
index 000000000..d786b2cff
--- /dev/null
+++ b/moses2/TranslationModel/UnknownWordPenalty.cpp
@@ -0,0 +1,285 @@
+/*
+ * UnknownWordPenalty.cpp
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "UnknownWordPenalty.h"
+#include "../System.h"
+#include "../Scores.h"
+#include "../InputType.h"
+#include "../PhraseBased/Manager.h"
+#include "../PhraseBased/TargetPhraseImpl.h"
+#include "../PhraseBased/InputPath.h"
+#include "../PhraseBased/TargetPhrases.h"
+#include "../PhraseBased/Sentence.h"
+#include "../SCFG/InputPath.h"
+#include "../SCFG/TargetPhraseImpl.h"
+#include "../SCFG/Manager.h"
+#include "../SCFG/Sentence.h"
+#include "../SCFG/ActiveChart.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Build the table from its configuration line. m_drop starts as false and
+// m_tuneable is cleared before the parameters are read, so a parameter
+// handled during ReadParameters() can still change either of them.
+UnknownWordPenalty::UnknownWordPenalty(size_t startInd, const std::string &line)
+:PhraseTable(startInd, line)
+,m_drop(false)
+{
+ m_tuneable = false;
+ // presumably feeds each key=value of the line to SetParameter() — TODO confirm
+ ReadParameters();
+}
+
+// Destructor: the body is empty, nothing is released explicitly here.
+UnknownWordPenalty::~UnknownWordPenalty()
+{
+ // intentionally empty
+}
+
+/**
+ * Store one configuration setting.
+ *
+ * "drop" is parsed as a bool into m_drop, "prefix" and "suffix" are stored
+ * verbatim, and every other key is passed on to PhraseTable::SetParameter().
+ *
+ * @param key   name of the setting.
+ * @param value its value as text.
+ */
+void UnknownWordPenalty::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "drop") {
+    m_drop = Scan<bool>(value);
+    return;
+  }
+  if (key == "prefix") {
+    m_prefix = value;
+    return;
+  }
+  if (key == "suffix") {
+    m_suffix = value;
+    return;
+  }
+
+  // not one of ours
+  PhraseTable::SetParameter(key, value);
+}
+
+/**
+ * Turn every XML option of the sentence into a one-entry target-phrase
+ * collection on the input path it covers.
+ *
+ * For each option the target phrase is built from the option's translation,
+ * receives the transformed option probability as this table's score when
+ * that probability is non-zero, is scored by the feature functions, and is
+ * attached to the path found at (startPos, phraseSize - 1) in the matrix.
+ *
+ * @param mgr        decoding manager; supplies the System.
+ * @param pool       pool used for every allocation made here.
+ * @param sentence   input sentence that carries the XML options.
+ * @param inputPaths paths of the sentence; the covered path is modified.
+ */
+void UnknownWordPenalty::ProcessXML(
+    const Manager &mgr,
+    MemPool &pool,
+    const Sentence &sentence,
+    InputPaths &inputPaths) const
+{
+  const Vector<const InputType::XMLOption*> &options = sentence.GetXMLOptions();
+
+  BOOST_FOREACH(const InputType::XMLOption *opt, options) {
+    TargetPhraseImpl *forced = TargetPhraseImpl::CreateFromString(pool, *this, mgr.system, opt->GetTranslation());
+
+    if (opt->prob) {
+      forced->GetScores().PlusEquals(mgr.system, *this, Moses2::TransformScore(opt->prob));
+    }
+
+    InputPath *covered = inputPaths.GetMatrix().GetValue(opt->startPos, opt->phraseSize - 1);
+    const SubPhrase<Moses2::Word> &src = covered->subPhrase;
+
+    mgr.system.featureFunctions.EvaluateInIsolation(pool, mgr.system, src, *forced);
+
+    // a collection holding just this one phrase
+    TargetPhrases *single = new (pool.Allocate<TargetPhrases>()) TargetPhrases(pool, 1);
+    single->AddTargetPhrase(*forced);
+    mgr.system.featureFunctions.EvaluateAfterTablePruning(pool, *single, src);
+
+    covered->AddTargetPhrases(*this, single);
+  }
+}
+
+/**
+ * Run the single-path lookup for every input path that passes
+ * SatisfyBackoff() and attach the result to that path.
+ *
+ * @param mgr        decoding manager; also supplies the memory pool.
+ * @param inputPaths paths to visit; each element is used as an InputPath.
+ */
+void UnknownWordPenalty::Lookup(const Manager &mgr,
+    InputPathsBase &inputPaths) const
+{
+  BOOST_FOREACH(InputPathBase *basePath, inputPaths) {
+    InputPath &current = *static_cast<InputPath*>(basePath);
+    if (!SatisfyBackoff(mgr, current)) {
+      continue;
+    }
+
+    TargetPhrases *found = Lookup(mgr, mgr.GetPool(), current);
+    current.AddTargetPhrases(*this, found);
+  }
+}
+
+/**
+ * Produce the fallback translation for a path no other table covers.
+ *
+ * Returns NULL as soon as any of the path's per-table collections is
+ * non-empty. Otherwise builds a single target phrase with score -100:
+ * empty when m_drop is set, else one word whose factor 0 is the first source
+ * word's factor 0, wrapped in m_prefix / m_suffix when either is configured.
+ *
+ * @param mgr       decoding manager; supplies the System and the pool used
+ *                  for feature-function evaluation.
+ * @param pool      pool from which the phrase and the collection are allocated.
+ * @param inputPath path to translate.
+ * @return a one-entry collection, or NULL when another table already has
+ *         translations for the path.
+ */
+TargetPhrases *UnknownWordPenalty::Lookup(const Manager &mgr, MemPool &pool,
+    InputPath &inputPath) const
+{
+  const System &sys = mgr.system;
+
+  // does any phrase table already translate this path?
+  const size_t tableCount = mgr.system.mappings.size();
+  const TargetPhrases **perTable = inputPath.targetPhrases;
+  for (size_t pt = 0; pt < tableCount; ++pt) {
+    const TargetPhrases *existing = perTable[pt];
+    if (existing && existing->GetSize()) {
+      return NULL;
+    }
+  }
+
+  const SubPhrase<Moses2::Word> &src = inputPath.subPhrase;
+  const Moses2::Word &firstWord = src[0];
+  const Factor *srcFactor = firstWord[0];
+
+  TargetPhrases *result = new (pool.Allocate<TargetPhrases>()) TargetPhrases(pool, 1);
+
+  // a dropped unknown word yields an empty target phrase
+  size_t length = m_drop ? 0 : 1;
+  TargetPhraseImpl *unk =
+    new (pool.Allocate<TargetPhraseImpl>()) TargetPhraseImpl(pool, *this,
+        sys, length);
+
+  if (!m_drop) {
+    Moses2::Word &out = (*unk)[0];
+
+    const bool decorated = !(m_prefix.empty() && m_suffix.empty());
+    if (decorated) {
+      // streaming an empty prefix or suffix adds nothing to the text
+      stringstream text;
+      text << m_prefix << srcFactor->GetString() << m_suffix;
+
+      FactorCollection &vocab = sys.GetVocab();
+      out[0] = vocab.AddFactor(text.str(), sys, false);
+    } else {
+      // copy the source factor through unchanged
+      out[0] = srcFactor;
+    }
+  }
+
+  unk->GetScores().PlusEquals(mgr.system, *this, -100);
+
+  MemPool &mgrPool = mgr.GetPool();
+  sys.featureFunctions.EvaluateInIsolation(mgrPool, sys, src, *unk);
+
+  result->AddTargetPhrase(*unk);
+  sys.featureFunctions.EvaluateAfterTablePruning(mgrPool, *result, src);
+
+  return result;
+}
+
+// Intentionally does nothing: the body is empty, so neither scores nor
+// estimatedScore is modified here.
+void UnknownWordPenalty::EvaluateInIsolation(const System &system,
+ const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const
+{
+
+}
+
+// SCFG ///////////////////////////////////////////////////////////////////////////////////////////
+// No-op: this table adds nothing to the active chart at initialisation.
+void UnknownWordPenalty::InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const
+{
+}
+
+/**
+ * SCFG: add a fallback rule for a single-word path that has no rules yet.
+ *
+ * Nothing is added when the path covers more than one word, already has
+ * rules, starts at position 0, or starts at the last position of the
+ * sentence (presumably the sentence-boundary markers — TODO confirm).
+ * Otherwise a one-word rule with left-hand side "[X]" and score -100 is
+ * created from the path's last word, together with an active chart entry
+ * that binds that word, and both are added to the path.
+ *
+ * @param pool         pool used for the rule, chart entry and collection.
+ * @param mgr          SCFG manager; supplies System, input and input paths.
+ * @param maxChartSpan not used here.
+ * @param stacks       not used here.
+ * @param path         path to extend.
+ */
+void UnknownWordPenalty::Lookup(MemPool &pool,
+    const SCFG::Manager &mgr,
+    size_t maxChartSpan,
+    const SCFG::Stacks &stacks,
+    SCFG::InputPath &path) const
+{
+  const System &sys = mgr.system;
+
+  // only 1-word phrases, and only when no other rule exists for the path
+  if (path.range.GetNumWordsCovered() > 1 || path.GetNumRules()) {
+    return;
+  }
+
+  // skip the first position of the sentence
+  const size_t startPos = path.range.GetStartPos();
+  if (startPos == 0) {
+    return;
+  }
+
+  // skip the last position of the sentence
+  const SCFG::Sentence &input = static_cast<const SCFG::Sentence&>(mgr.GetInput());
+  if (startPos + 1 == input.GetSize()) {
+    return;
+  }
+
+  // terminal
+  const SCFG::Word &terminal = path.subPhrase.Back();
+  const Factor *surface = terminal[0];
+
+  SCFG::TargetPhraseImpl *rule =
+    new (pool.Allocate<SCFG::TargetPhraseImpl>()) SCFG::TargetPhraseImpl(pool, *this, sys, 1);
+  SCFG::Word &ruleWord = (*rule)[0];
+  ruleWord.CreateFromString(sys.GetVocab(), sys, surface->GetString().as_string());
+
+  rule->lhs.CreateFromString(sys.GetVocab(), sys, "[X]");
+
+  const size_t endPos = path.range.GetEndPos();
+  const SCFG::InputPath &wordPath = *mgr.GetInputPaths().GetMatrix().GetValue(endPos, 1);
+
+  SCFG::ActiveChartEntry *entry =
+    new (pool.Allocate<SCFG::ActiveChartEntry>()) SCFG::ActiveChartEntry(pool);
+  entry->AddSymbolBindElement(wordPath.range, terminal, NULL, *this);
+  path.AddActiveChartEntry(GetPtInd(), entry);
+
+  rule->GetScores().PlusEquals(mgr.system, *this, -100);
+
+  MemPool &mgrPool = mgr.GetPool();
+  const SubPhrase<SCFG::Word> &src = path.subPhrase;
+  sys.featureFunctions.EvaluateInIsolation(mgrPool, sys, src, *rule);
+
+  SCFG::TargetPhrases *rules =
+    new (pool.Allocate<SCFG::TargetPhrases>()) SCFG::TargetPhrases(pool);
+  rules->AddTargetPhrase(*rule);
+
+  path.AddTargetPhrasesToPath(pool, mgr.system, *this, *rules, entry->GetSymbolBind());
+}
+
+// No-op: this table contributes no unary rules.
+void UnknownWordPenalty::LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+}
+
+// SCFG non-terminal lookup is not supported by this table: always raises
+// "Not implemented" through UTIL_THROW2.
+void UnknownWordPenalty::LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG lookup for a given word is not supported by this table: always raises
+// "Not implemented" through UTIL_THROW2.
+void UnknownWordPenalty::LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG lookup from a given active-chart entry is not supported by this
+// table: always raises "Not implemented" through UTIL_THROW2.
+void UnknownWordPenalty::LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+}
+
diff --git a/moses2/TranslationModel/UnknownWordPenalty.h b/moses2/TranslationModel/UnknownWordPenalty.h
new file mode 100644
index 000000000..52c235a36
--- /dev/null
+++ b/moses2/TranslationModel/UnknownWordPenalty.h
@@ -0,0 +1,89 @@
+/*
+ * UnknownWordPenalty.h
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+
+#pragma once
+
+#include "PhraseTable.h"
+
+namespace Moses2
+{
+class Sentence;
+class InputPaths;
+class Range;
+
+// Phrase table of last resort: supplies a translation with score -100 for
+// input that no other table covers, and turns XML options of the input
+// sentence into target phrases.
+class UnknownWordPenalty: public PhraseTable
+{
+public:
+ // Reads the parameters from the configuration line.
+ UnknownWordPenalty(size_t startInd, const std::string &line);
+ virtual ~UnknownWordPenalty();
+
+ // Handles "drop", "prefix" and "suffix"; other keys go to
+ // PhraseTable::SetParameter().
+ virtual void SetParameter(const std::string& key, const std::string& value);
+
+ // Phrase-based: look up every path that passes the backoff check.
+ void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const;
+ // Phrase-based: fallback translation for one path, or NULL when another
+ // table already has translations for it.
+ virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const;
+
+ // Attach one target phrase per XML option of the sentence to the path the
+ // option covers.
+ void ProcessXML(
+ const Manager &mgr,
+ MemPool &pool,
+ const Sentence &sentence,
+ InputPaths &inputPaths) const;
+
+ // Empty implementation.
+ virtual void
+ EvaluateInIsolation(const System &system, const Phrase<Moses2::Word> &source,
+ const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const;
+
+ // SCFG: empty implementation.
+ virtual void InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const;
+
+ // SCFG: add a fallback rule for a single-word path that has no rules.
+ void Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+ // SCFG: empty implementation.
+ void LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+protected:
+ // SCFG helpers: all of them raise "Not implemented".
+ virtual void LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const;
+
+ virtual void LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ virtual void LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+protected:
+ // When true the phrase-based fallback is an empty target phrase
+ // (the unknown word is dropped).
+ bool m_drop;
+ // Text put before / after the copied source word in the phrase-based
+ // fallback when either one is non-empty.
+ std::string m_prefix, m_suffix;
+};
+
+}
+