Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <hieuhoang@gmail.com> 2017-02-01 01:21:59 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2017-02-01 01:21:59 +0300
commit: a8a5b43f2dc32bd1b45006fd43989dc71e74ba0e (patch)
tree: e84a78fa005e29ec78076d6e525371240871122c /moses2/legacy
parent: 7206d592751ee9afeb1fa4753b7e19272e2585bc (diff)
move moses2 to root
Diffstat (limited to 'moses2/legacy')
-rw-r--r--moses2/legacy/Bitmap.cpp87
-rw-r--r--moses2/legacy/Bitmap.h244
-rw-r--r--moses2/legacy/Bitmaps.cpp74
-rw-r--r--moses2/legacy/Bitmaps.h40
-rw-r--r--moses2/legacy/Factor.cpp45
-rw-r--r--moses2/legacy/Factor.h104
-rw-r--r--moses2/legacy/FactorCollection.cpp111
-rw-r--r--moses2/legacy/FactorCollection.h130
-rw-r--r--moses2/legacy/InputFileStream.cpp60
-rw-r--r--moses2/legacy/InputFileStream.h46
-rw-r--r--moses2/legacy/Matrix.cpp34
-rw-r--r--moses2/legacy/Matrix.h106
-rw-r--r--moses2/legacy/OutputCollector.h165
-rw-r--r--moses2/legacy/OutputFileStream.cpp88
-rw-r--r--moses2/legacy/OutputFileStream.h81
-rw-r--r--moses2/legacy/Parameter.cpp1707
-rw-r--r--moses2/legacy/Parameter.h176
-rw-r--r--moses2/legacy/Range.cpp32
-rw-r--r--moses2/legacy/Range.h123
-rw-r--r--moses2/legacy/ThreadPool.cpp150
-rw-r--r--moses2/legacy/ThreadPool.h140
-rw-r--r--moses2/legacy/Timer.cpp104
-rw-r--r--moses2/legacy/Timer.h39
-rw-r--r--moses2/legacy/Util2.cpp29
-rw-r--r--moses2/legacy/Util2.h351
-rw-r--r--moses2/legacy/gzfilebuf.h101
26 files changed, 4367 insertions, 0 deletions
diff --git a/moses2/legacy/Bitmap.cpp b/moses2/legacy/Bitmap.cpp
new file mode 100644
index 000000000..a8dc7db4d
--- /dev/null
+++ b/moses2/legacy/Bitmap.cpp
@@ -0,0 +1,87 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <boost/functional/hash.hpp>
+#include "Bitmap.h"
+
+namespace Moses2
+{
+
+/** Build a bitmap whose storage is an Array constructed from `pool` and
+ *  `size`. The cached first-gap position and covered-word count are not set
+ *  here; one of the Init() overloads assigns them.
+ */
+Bitmap::Bitmap(MemPool &pool, size_t size)
+  : m_bitmap(pool, size)
+{}
+
+/** Fill the bitmap from a vector of coverage flags and rebuild the caches.
+ *
+ *  The initializer may be shorter or longer than the bitmap: positions past
+ *  the end of the initializer are cleared, and initializer entries past the
+ *  end of the bitmap are ignored (previously they were written out of
+ *  bounds).
+ *
+ *  \param initializer coverage flag for each leading position
+ */
+void Bitmap::Init(const std::vector<bool>& initializer)
+{
+  // Never write beyond the bitmap's own length.
+  const size_t numToCopy = std::min<size_t>(initializer.size(),
+                           m_bitmap.size());
+  for (size_t i = 0; i < numToCopy; ++i) {
+    m_bitmap[i] = initializer[i];
+  }
+
+  // Positions the initializer does not describe start out as not covered.
+  for (size_t i = numToCopy; i < m_bitmap.size(); ++i) {
+    m_bitmap[i] = false;
+  }
+
+  m_numWordsCovered = std::count(m_bitmap.begin(), m_bitmap.end(), true);
+
+  // Find the first gap, and cache it.
+  Array<char>::const_iterator first_gap = std::find(m_bitmap.begin(),
+      m_bitmap.end(), false);
+  m_firstGap = ((first_gap == m_bitmap.end()) ?
+                NOT_FOUND: first_gap - m_bitmap.begin());
+}
+
+/** Make this bitmap equal to `copy` with every position of `range`
+ *  additionally marked as covered.
+ *
+ *  Only the first m_bitmap.size() entries of `copy` are read, so `copy` is
+ *  assumed to be at least as long as this bitmap -- TODO confirm callers.
+ */
+void Bitmap::Init(const Bitmap &copy, const Range &range)
+{
+  // Duplicate the coverage flags one by one.
+  const size_t numBits = m_bitmap.size();
+  size_t idx = 0;
+  while (idx < numBits) {
+    m_bitmap[idx] = copy.m_bitmap[idx];
+    ++idx;
+  }
+
+  // Take over the cached statistics, then let the range update them.
+  m_numWordsCovered = copy.m_numWordsCovered;
+  m_firstGap = copy.m_firstGap;
+  SetValueNonOverlap(range);
+}
+
+// for unordered_set in stack
+/** Hash of the bitmap, delegated to the underlying array's hash().
+ *  The cached gap/count members do not take part.
+ */
+size_t Bitmap::hash() const
+{
+  return m_bitmap.hash();
+}
+
+/** Two bitmaps are equal when their underlying arrays compare equal. */
+bool Bitmap::operator==(const Bitmap& other) const
+{
+  const bool sameBits = (m_bitmap == other.m_bitmap);
+  return sameBits;
+}
+
+// friend
+/** Print the bitmap as a run of 0/1 digits, one per position, with no
+ *  separators and no trailing newline.
+ */
+std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap)
+{
+  const size_t numBits = bitmap.m_bitmap.size();
+  for (size_t pos = 0; pos != numBits; ++pos) {
+    const int digit = bitmap.GetValue(pos) ? 1 : 0;
+    out << digit;
+  }
+  return out;
+}
+
+}
+
diff --git a/moses2/legacy/Bitmap.h b/moses2/legacy/Bitmap.h
new file mode 100644
index 000000000..e6a0f7948
--- /dev/null
+++ b/moses2/legacy/Bitmap.h
@@ -0,0 +1,244 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+#include <iostream>
+#include <cstring>
+#include <cmath>
+#include <cstdlib>
+#include "Range.h"
+#include "../Array.h"
+
+namespace Moses2
+{
+class MemPool;
+
+typedef unsigned long WordsBitmapID;
+
+/** Vector of boolean to represent whether a word has been translated or not.
+ *
+ * Implemented using a vector of char, which is usually the same representation
+ * for the elements that a C array of bool would use. A vector of bool, or a
+ * Boost dynamic_bitset, could be much more efficient in theory. Unfortunately
+ * algorithms like std::find() are not optimized for vector<bool> on gcc or
+ * clang, and dynamic_bitset lacks all the optimized search operations we want.
+ * Only benchmarking will tell what works best. Perhaps dynamic_bitset could
+ * still be a dramatic improvement, if we flip the meaning of the bits around
+ * so we can use its find_first() and find_next() for the most common searches.
+ */
+class Bitmap
+{
+ friend std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap);
+private:
+ Array<char> m_bitmap; //! Ticks of words in sentence that have been done.
+ size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND.
+ size_t m_numWordsCovered; //! Cached count of positions currently set to true.
+
+ // Default construction and assignment are declared private; no definition
+ // is visible in this header.
+ Bitmap(); // not implemented
+ Bitmap& operator=(const Bitmap& other);
+
+ /** Update the first gap, when bits are flipped.
+ *
+ * Call after positions [startPos, endPos] (inclusive) have been written
+ * with \p value.
+ * - value == true: if the cached gap lies inside the range it has just been
+ * filled, so scan forward from endPos + 1 for the next unset position
+ * (NOT_FOUND when there is none). A gap outside the range is untouched.
+ * - value == false: the gap moves to startPos if that is earlier.
+ * NOTE(review): the comparison presumably relies on NOT_FOUND being
+ * larger than any valid index -- confirm against its definition.
+ */
+ void UpdateFirstGap(size_t startPos, size_t endPos, bool value)
+ {
+ if (value) {
+ //may remove gap
+ if (startPos <= m_firstGap && m_firstGap <= endPos) {
+ m_firstGap = NOT_FOUND;
+ for (size_t i = endPos + 1; i < m_bitmap.size(); ++i) {
+ if (!m_bitmap[i]) {
+ m_firstGap = i;
+ break;
+ }
+ }
+ }
+
+ }
+ else {
+ //setting positions to false, may add new gap
+ if (startPos < m_firstGap) {
+ m_firstGap = startPos;
+ }
+ }
+ }
+
+ //! Mark every position of the range (start and end inclusive) as covered.
+ //! The covered count is increased by range.GetNumWordsCovered() without
+ //! looking at the previous bit values, so the range is assumed not to
+ //! overlap already-covered positions (hence the name).
+ void
+ SetValueNonOverlap(Range const& range) {
+ size_t startPos = range.GetStartPos();
+ size_t endPos = range.GetEndPos();
+
+ for(size_t pos = startPos; pos <= endPos; pos++) {
+ m_bitmap[pos] = true;
+ }
+
+ m_numWordsCovered += range.GetNumWordsCovered();
+ UpdateFirstGap(startPos, endPos, true);
+ }
+
+ public:
+ //! Create Bitmap of length size. The contents and cached values are set
+ //! afterwards by one of the Init() overloads.
+ explicit Bitmap(MemPool &pool, size_t size);
+
+ //! Initialise from a vector of coverage flags.
+ void Init(const std::vector<bool>& initializer);
+ //! Initialise as a copy of another bitmap plus one newly covered range.
+ void Init(const Bitmap &copy, const Range &range);
+
+ //! Count of words translated.
+ size_t GetNumWordsCovered() const {
+ return m_numWordsCovered;
+ }
+
+ //! position of 1st word not yet translated, or NOT_FOUND if everything already translated
+ size_t GetFirstGapPos() const {
+ return m_firstGap;
+ }
+
+ //! position of last word not yet translated, or NOT_FOUND if everything already translated
+ size_t GetLastGapPos() const {
+ // Signed index so the loop can terminate once pos drops below zero.
+ for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
+ if (!m_bitmap[pos]) {
+ return pos;
+ }
+ }
+ // no starting pos
+ return NOT_FOUND;
+ }
+
+ //! position of last translated word, or NOT_FOUND if nothing is translated
+ size_t GetLastPos() const {
+ // Signed index so the loop can terminate once pos drops below zero.
+ for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
+ if (m_bitmap[pos]) {
+ return pos;
+ }
+ }
+ // no starting pos
+ return NOT_FOUND;
+ }
+
+ //! whether a word has been translated at a particular position
+ bool GetValue(size_t pos) const {
+ return bool(m_bitmap[pos]);
+ }
+ //! set value at a particular position, keeping the cached first gap and
+ //! covered count in step; writing the value already stored is a no-op
+ void SetValue( size_t pos, bool value ) {
+ bool origValue = m_bitmap[pos];
+ if (origValue == value) {
+ // do nothing
+ }
+ else {
+ m_bitmap[pos] = value;
+ UpdateFirstGap(pos, pos, value);
+ if (value) {
+ ++m_numWordsCovered;
+ }
+ else {
+ --m_numWordsCovered;
+ }
+ }
+ }
+
+ //! whether every word has been translated
+ bool IsComplete() const {
+ return GetSize() == GetNumWordsCovered();
+ }
+ //! whether the wordrange overlaps with any translated word in this bitmap
+ //! (positions are not bounds-checked here)
+ bool Overlap(const Range &compare) const {
+ for (size_t pos = compare.GetStartPos(); pos <= compare.GetEndPos(); pos++) {
+ if (m_bitmap[pos])
+ return true;
+ }
+ return false;
+ }
+ //! number of elements
+ size_t GetSize() const {
+ return m_bitmap.size();
+ }
+
+ //! Walk left from l while the position immediately to the left is not
+ //! covered; returns the leftmost index reached (0 at the latest).
+ inline size_t GetEdgeToTheLeftOf(size_t l) const {
+ if (l == 0) return l;
+ while (l && !m_bitmap[l-1]) {
+ --l;
+ }
+ return l;
+ }
+
+ //! Index just before the first covered position to the right of r, or the
+ //! last index of the bitmap when nothing to the right of r is covered.
+ inline size_t GetEdgeToTheRightOf(size_t r) const {
+ if (r+1 == m_bitmap.size()) return r;
+ return (
+ std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) -
+ m_bitmap.begin()
+ ) - 1;
+ }
+
+ //! Converts the bitmap into an integer ID. As computed below, the low part
+ //! is the bit pattern of the positions after the first gap up to the last
+ //! covered position, and the first-gap position is added multiplied by
+ //! 2^16. The asserts require a length below 65536 and at most 16 positions
+ //! between the first gap and the last covered word.
+ //! NOTE(review): assert is used here but <cassert> is not included
+ //! directly by this header.
+ WordsBitmapID GetID() const {
+ assert(m_bitmap.size() < (1<<16));
+
+ size_t start = GetFirstGapPos();
+ if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left
+
+ size_t end = GetLastPos();
+ if (end == NOT_FOUND) end = 0;// nothing translated yet
+
+ assert(end < start || end-start <= 16);
+ WordsBitmapID id = 0;
+ // Runs from the last covered position down to start + 1; start itself
+ // (the gap) is not part of the pattern.
+ for(size_t pos = end; pos > start; pos--) {
+ id = id*2 + (int) GetValue(pos);
+ }
+ return id + (1<<16) * start;
+ }
+
+ //! converts bitmap into an integer ID, with an additional span covered:
+ //! same computation as GetID(), with positions startPos..endPos (inclusive)
+ //! counted as covered without modifying the bitmap
+ WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const {
+ assert(m_bitmap.size() < (1<<16));
+
+ size_t start = GetFirstGapPos();
+ if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left
+
+ size_t end = GetLastPos();
+ if (end == NOT_FOUND) end = 0;// nothing translated yet
+
+ // The extra span may fill the first gap and/or extend the last position.
+ if (start == startPos) start = endPos+1;
+ if (end < endPos) end = endPos;
+
+ assert(end < start || end-start <= 16);
+ WordsBitmapID id = 0;
+ for(size_t pos = end; pos > start; pos--) {
+ id = id*2;
+ if (GetValue(pos) || (startPos<=pos && pos<=endPos))
+ id++;
+ }
+ return id + (1<<16) * start;
+ }
+
+ // for unordered_set in stack
+ size_t hash() const;
+ bool operator==(const Bitmap& other) const;
+ bool operator!=(const Bitmap& other) const {
+ return !(*this == other);
+ }
+
+ };
+
+ }
diff --git a/moses2/legacy/Bitmaps.cpp b/moses2/legacy/Bitmaps.cpp
new file mode 100644
index 000000000..879ad9d71
--- /dev/null
+++ b/moses2/legacy/Bitmaps.cpp
@@ -0,0 +1,74 @@
+#include <boost/foreach.hpp>
+#include "Bitmaps.h"
+#include "Util2.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+/** Remember the pool that later bitmap allocations are taken from.
+ *  m_initBitmap is not set here; Init() assigns it.
+ */
+Bitmaps::Bitmaps(MemPool &pool)
+  : m_pool(pool)
+{}
+
+/** Nothing is released explicitly; the destructor body is empty. */
+Bitmaps::~Bitmaps()
+{}
+
+/** Create the initial bitmap for an input of `inputSize` positions, fill it
+ *  from `initSourceCompleted`, and register it in the collection with an
+ *  empty successor map.
+ */
+void Bitmaps::Init(size_t inputSize,
+                   const std::vector<bool> &initSourceCompleted)
+{
+  // Placement-construct the bitmap in storage obtained from the pool.
+  Bitmap *storage = m_pool.Allocate<Bitmap>();
+  m_initBitmap = new (storage) Bitmap(m_pool, inputSize);
+  m_initBitmap->Init(initSourceCompleted);
+
+  // operator[] default-constructs the entry if it is not present yet.
+  m_coll[m_initBitmap];
+}
+
+/** Produce the bitmap equal to `bm` with `range` additionally covered.
+ *
+ *  A scratch bitmap (recycled if one is available, otherwise newly allocated
+ *  from the pool) is initialised and looked up in the collection. If an
+ *  equal bitmap is already stored, the scratch object goes back to the
+ *  recycler and the stored one is returned; otherwise the scratch object is
+ *  stored with an empty successor map and returned.
+ *  NOTE(review): a recycled bitmap keeps its own length -- presumably all
+ *  bitmaps handled by one Bitmaps object share the same size; confirm.
+ */
+const Bitmap &Bitmaps::GetNextBitmap(const Bitmap &bm, const Range &range)
+{
+  Bitmap *candidate;
+  if (!m_recycler.empty()) {
+    candidate = m_recycler.top();
+    m_recycler.pop();
+  }
+  else {
+    candidate = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, bm.GetSize());
+  }
+  candidate->Init(bm, range);
+
+  Coll::const_iterator existing = m_coll.find(candidate);
+  if (existing != m_coll.end()) {
+    // Already known: keep the scratch object for the next request.
+    m_recycler.push(candidate);
+    return *existing->first;
+  }
+
+  m_coll[candidate] = NextBitmaps();
+  return *candidate;
+}
+
+/** Return the bitmap reached from `bm` by covering `range`, caching the link.
+ *
+ *  `bm` must already be in the collection (asserted). The successor map is
+ *  keyed by the address of `range`, so the cache only hits when the same
+ *  Range object is passed again.
+ */
+const Bitmap &Bitmaps::GetBitmap(const Bitmap &bm, const Range &range)
+{
+  Coll::iterator entry = m_coll.find(&bm);
+  assert(entry != m_coll.end());
+
+  NextBitmaps &successors = entry->second;
+  NextBitmaps::const_iterator link = successors.find(&range);
+  if (link != successors.end()) {
+    // link exist
+    //std::cerr << "link exists" << endl;
+    return *link->second;
+  }
+
+  // not seen the link yet.
+  const Bitmap *created = &GetNextBitmap(bm, range);
+  successors[&range] = created;
+  return *created;
+}
+
+}
+
diff --git a/moses2/legacy/Bitmaps.h b/moses2/legacy/Bitmaps.h
new file mode 100644
index 000000000..d8207b59e
--- /dev/null
+++ b/moses2/legacy/Bitmaps.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <set>
+#include <stack>
+#include "Bitmap.h"
+#include "Util2.h"
+
+namespace Moses2
+{
+class MemPool;
+
+/** Collection of the Bitmap objects created for one input, plus a per-bitmap
+ * cache of which bitmap results from covering a given Range.
+ */
+class Bitmaps
+{
+ // Successor cache for one bitmap, keyed by Range pointer (i.e. by address).
+ typedef boost::unordered_map<const Range*, const Bitmap*> NextBitmaps;
+ // All known bitmaps. UnorderedComparer<Bitmap> serves as both hash and
+ // equality functor -- presumably it compares the pointed-to bitmaps rather
+ // than the pointers; confirm in Util2.h.
+ typedef boost::unordered_map<const Bitmap*, NextBitmaps,
+ UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
+ //typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
+ Coll m_coll;
+ Bitmap *m_initBitmap; // created by Init(); not set by the constructor
+
+ MemPool &m_pool; // source of storage for new Bitmap objects
+ std::stack<Bitmap*> m_recycler; // scratch bitmaps that turned out to be duplicates
+
+ // Build (or find) the bitmap equal to bm with range additionally covered.
+ const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
+public:
+ Bitmaps(MemPool &pool);
+ virtual ~Bitmaps();
+ // Create and register the initial bitmap of length inputSize.
+ void Init(size_t inputSize, const std::vector<bool> &initSourceCompleted);
+
+ // Bitmap created by Init(); dereferences m_initBitmap unconditionally.
+ const Bitmap &GetInitialBitmap() const
+ {
+ return *m_initBitmap;
+ }
+ // Cached lookup of the bitmap reached from bm by covering range.
+ const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
+};
+
+}
+
diff --git a/moses2/legacy/Factor.cpp b/moses2/legacy/Factor.cpp
new file mode 100644
index 000000000..be9bad2c1
--- /dev/null
+++ b/moses2/legacy/Factor.cpp
@@ -0,0 +1,45 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "Factor.h"
+
+#include <boost/functional/hash.hpp>
+
+using namespace std;
+
+namespace Moses2
+{
+
+// friend
+/** Stream a factor by writing its string form. */
+ostream& operator<<(ostream& out, const Factor& factor)
+{
+  const StringPiece text = factor.GetString();
+  out << text;
+  return out;
+}
+
+/** Hash a factor through boost::hash applied to its numeric id. */
+size_t hash_value(const Factor& f)
+{
+  const size_t id = f.GetId();
+  return boost::hash<size_t>()(id);
+}
+
+}
+
diff --git a/moses2/legacy/Factor.h b/moses2/legacy/Factor.h
new file mode 100644
index 000000000..99d53f4f0
--- /dev/null
+++ b/moses2/legacy/Factor.h
@@ -0,0 +1,104 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <vector>
+#include "util/string_piece.hh"
+
+namespace Moses2
+{
+
+struct FactorFriend;
+class FactorCollection;
+
+/** Represents a factor (word, POS, etc).
+ * A Factor has a contiguous identifier and string value.
+ */
+class Factor
+{
+  friend std::ostream& operator<<(std::ostream&, const Factor&);
+
+  // only these classes are allowed to instantiate this class
+  friend class FactorCollection;
+  friend struct FactorFriend;
+
+  // FactorCollection writes here.
+  // This is mutable so the pointer can be changed to pool-backed memory.
+  mutable StringPiece m_string;
+  size_t m_id;
+
+  //! private constructor. only the friends above (FactorCollection,
+  //! FactorFriend) are allowed to create Factor objects.
+  //! m_id is given a defined value so that copying a freshly constructed
+  //! Factor never reads an indeterminate id; FactorCollection overwrites it.
+  Factor() :
+    m_id(0)
+  {
+  }
+
+  // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
+  Factor(const Factor &factor) :
+    m_string(factor.m_string), m_id(factor.m_id)
+  {
+  }
+
+  // Not implemented. Shouldn't be called.
+  Factor &operator=(const Factor &factor);
+
+public:
+  //! original string representation of the factor
+  StringPiece GetString() const
+  {
+    return m_string;
+  }
+  //! contiguous ID
+  inline size_t GetId() const
+  {
+    return m_id;
+  }
+
+  /** transitive comparison between 2 factors, based on object address
+   * (not on string or id).
+   * -1 = less than
+   * +1 = more than
+   * 0 = same
+   */
+  inline int Compare(const Factor &compare) const
+  {
+    if (this < &compare) return -1;
+    if (this > &compare) return 1;
+    return 0;
+  }
+  //! address-based ordering used for adding objects into FactorCollection
+  inline bool operator<(const Factor &compare) const
+  {
+    return this < &compare;
+  }
+
+  // quick equality comparison by address. Not used
+  inline bool operator==(const Factor &compare) const
+  {
+    return this == &compare;
+  }
+};
+
+size_t hash_value(const Factor &f);
+
+}
+
diff --git a/moses2/legacy/FactorCollection.cpp b/moses2/legacy/FactorCollection.cpp
new file mode 100644
index 000000000..f8beb9b40
--- /dev/null
+++ b/moses2/legacy/FactorCollection.cpp
@@ -0,0 +1,111 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <boost/version.hpp>
+#ifdef WITH_THREADS
+#include <boost/thread/locks.hpp>
+#endif
+#include <ostream>
+#include <string>
+#include "FactorCollection.h"
+#include "util/pool.hh"
+#include "util/exception.hh"
+#include "../System.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+/** Return the unique Factor for `factorString`, creating it if necessary.
+ *
+ *  A new factor gets the next id from the terminal or non-terminal counter
+ *  and its string is copied into m_string_backing, so the returned Factor
+ *  does not reference the caller's buffer.
+ *
+ *  \param factorString  text of the factor
+ *  \param system        not used in this function body
+ *  \param isNonTerminal selects which id counter and which set are used
+ *  \return pointer to the stored Factor
+ *  \throws via UTIL_THROW_IF2 when the non-terminal counter reaches
+ *          moses_MaxNumNonterminals (the factor has been inserted by then)
+ */
+const Factor *FactorCollection::AddFactor(const StringPiece &factorString,
+    const System &system, bool isNonTerminal)
+{
+  FactorFriend to_ins;
+  to_ins.in.m_string = factorString;
+  // NOTE(review): non-terminals go into m_set and terminals into
+  // m_setNonTerminal. GetFactor uses the same mapping, so lookups agree, but
+  // the member names look swapped.
+  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
+  // If we're threaded, hope a read-only lock is sufficient.
+#ifdef WITH_THREADS
+  {
+    // read-lock scope. Hash and equality look at the string only, so the id
+    // does not need to be set for this lookup.
+    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+    Set::const_iterator i = set.find(to_ins);
+    if (i != set.end()) return &i->in;
+  }
+  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif // WITH_THREADS
+  // Take the id only now, while holding the exclusive lock. Reading the
+  // counter before locking let two threads that insert different strings
+  // pick up the same id.
+  to_ins.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
+  std::pair<Set::iterator, bool> ret(set.insert(to_ins));
+  if (ret.second) {
+    // Newly inserted: re-point the stored string at pool-backed memory.
+    ret.first->in.m_string.set(
+      memcpy(m_string_backing.Allocate(factorString.size()),
+             factorString.data(), factorString.size()), factorString.size());
+    if (isNonTerminal) {
+      m_factorIdNonTerminal++;
+      UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals,
+                     "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile");
+    }
+    else {
+      m_factorId++;
+    }
+  }
+
+  const Factor *factor = &ret.first->in;
+
+  return factor;
+}
+
+/** Look up an existing factor without creating one.
+ *
+ *  \param factorString  text of the factor
+ *  \param isNonTerminal selects which set is searched
+ *  \return the stored Factor, or NULL when the string is not present
+ */
+const Factor *FactorCollection::GetFactor(const StringPiece &factorString,
+    bool isNonTerminal)
+{
+  FactorFriend to_find;
+  to_find.in.m_string = factorString;
+  // The set's hash and equality functors use the string only, so the id is
+  // irrelevant for the lookup. It is set to a constant instead of reading
+  // the id counters, which AddFactor modifies under the exclusive lock.
+  to_find.in.m_id = 0;
+  // Same set selection as AddFactor.
+  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
+  {
+    // read-lock scope
+#ifdef WITH_THREADS
+    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif // WITH_THREADS
+    Set::const_iterator i = set.find(to_find);
+    if (i != set.end()) return &i->in;
+  }
+  return NULL;
+}
+
+/** Empty destructor body; members are destroyed by their own destructors. */
+FactorCollection::~FactorCollection()
+{}
+
+// friend
+/** Write every factor stored in m_set to the stream, back to back with no
+ *  separator. m_setNonTerminal is not printed. With threads enabled a shared
+ *  lock is held for the duration.
+ */
+ostream& operator<<(ostream& out, const FactorCollection& factorCollection)
+{
+#ifdef WITH_THREADS
+  boost::shared_lock<boost::shared_mutex> lock(factorCollection.m_accessLock);
+#endif
+  FactorCollection::Set::const_iterator cur = factorCollection.m_set.begin();
+  while (cur != factorCollection.m_set.end()) {
+    out << cur->in;
+    ++cur;
+  }
+  return out;
+}
+
+}
+
diff --git a/moses2/legacy/FactorCollection.h b/moses2/legacy/FactorCollection.h
new file mode 100644
index 000000000..0430e5cde
--- /dev/null
+++ b/moses2/legacy/FactorCollection.h
@@ -0,0 +1,130 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+// reserve space for non-terminal symbols (ensuring consecutive numbering, and allowing quick lookup by ID)
+#ifndef moses_MaxNumNonterminals
+#define moses_MaxNumNonterminals 10000
+#endif
+
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#endif
+
+#include "util/murmur_hash.hh"
+#include <boost/unordered_set.hpp>
+
+#include <functional>
+#include <string>
+
+#include "util/string_piece.hh"
+#include "util/pool.hh"
+#include "Factor.h"
+
+namespace Moses2
+{
+
+class System;
+
+/** We don't want Factor to be copyable by anybody. But we also want to store
+ * it in an STL container. The solution is that Factor's copy constructor is
+ * private and friended to FactorFriend. The STL containers can delegate
+ * copying, so friending the container isn't sufficient. STL containers see
+ * FactorFriend's public copy constructor and everybody else sees Factor's
+ * private copy constructor.
+ */
+// Thin wrapper that makes a Factor storable in containers: Factor befriends
+// this struct, so the implicitly generated copy constructor here may call
+// Factor's private one.
+struct FactorFriend
+{
+ Factor in; // the wrapped factor
+};
+
+/** collection of factors
+ *
+ * All Factors in moses are accessed and created by a FactorCollection.
+ * By enforcing this strict creation processes (ie, forbidding factors
+ * from being created on the stack, etc), their memory addresses can
+ * be used as keys to uniquely identify them.
+ * Only 1 FactorCollection object should be created.
+ */
+class FactorCollection
+{
+  friend std::ostream& operator<<(std::ostream&, const FactorCollection&);
+  friend class System;
+
+  // Hash functor: MurmurHash over the factor's string bytes (the id is not
+  // hashed). The std::unary_function base was dropped because it was removed
+  // in C++17 and nothing here uses its typedefs.
+  struct HashFactor
+  {
+    std::size_t operator()(const FactorFriend &factor) const
+    {
+      return util::MurmurHashNative(factor.in.m_string.data(),
+                                    factor.in.m_string.size());
+    }
+  };
+  // Equality functor: two entries are equal when their strings are equal.
+  // The std::binary_function base was dropped for the same reason.
+  struct EqualsFactor
+  {
+    bool operator()(const FactorFriend &left, const FactorFriend &right) const
+    {
+      return left.in.GetString() == right.in.GetString();
+    }
+  };
+  typedef boost::unordered_set<FactorFriend, HashFactor, EqualsFactor> Set;
+  // NOTE(review): in FactorCollection.cpp, AddFactor/GetFactor select m_set
+  // when isNonTerminal is true and m_setNonTerminal otherwise, so the names
+  // look swapped relative to their contents.
+  Set m_set;
+  Set m_setNonTerminal;
+
+  // Pool that owns the character data of stored factor strings.
+  util::Pool m_string_backing;
+
+#ifdef WITH_THREADS
+  //reader-writer lock
+  mutable boost::shared_mutex m_accessLock;
+#endif
+
+  size_t m_factorIdNonTerminal; /**< unique, contiguous ids, starting from 0, for each non-terminal factor */
+  size_t m_factorId; /**< unique, contiguous ids, starting from moses_MaxNumNonterminals, for each terminal factor */
+
+  //! private constructor; only friends (System) can create the collection
+  FactorCollection() :
+    m_factorIdNonTerminal(0), m_factorId(moses_MaxNumNonterminals)
+  {
+  }
+
+public:
+  ~FactorCollection();
+
+  /** returns the factor for factorString within the terminal or non-terminal
+   * group selected by isNonTerminal.
+   * If a factor already exists in the collection, return the existing factor; if not, create a new one
+   */
+  const Factor *AddFactor(const StringPiece &factorString, const System &system,
+                          bool isNonTerminal);
+
+  //! current value of the non-terminal id counter
+  size_t GetNumNonTerminals()
+  {
+    return m_factorIdNonTerminal;
+  }
+
+  //! look up an existing factor; returns NULL if it has not been added
+  const Factor *GetFactor(const StringPiece &factorString, bool isNonTerminal =
+                            false);
+
+};
+
+}
+
diff --git a/moses2/legacy/InputFileStream.cpp b/moses2/legacy/InputFileStream.cpp
new file mode 100644
index 000000000..a68ea53ef
--- /dev/null
+++ b/moses2/legacy/InputFileStream.cpp
@@ -0,0 +1,60 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "InputFileStream.h"
+#include "gzfilebuf.h"
+#include <iostream>
+
+using namespace std;
+
+namespace Moses2
+{
+
+/** Open `filePath` for reading.
+ *
+ *  Paths longer than three characters that end in ".gz" are read through a
+ *  gzfilebuf; everything else through a std::filebuf. If the plain file
+ *  cannot be opened, a message is written to stderr and the process exits
+ *  with status 1.
+ */
+InputFileStream::InputFileStream(const std::string &filePath) :
+  std::istream(NULL), m_streambuf(NULL)
+{
+  const std::string::size_type len = filePath.size();
+  const bool gzipped = (len > 3) && (filePath.compare(len - 3, 3, ".gz") == 0);
+
+  if (gzipped) {
+    m_streambuf = new gzfilebuf(filePath.c_str());
+  }
+  else {
+    std::filebuf *plain = new std::filebuf();
+    // open() returns a null pointer on failure.
+    if (plain->open(filePath.c_str(), std::ios::in) == NULL) {
+      cerr << "Can't read " << filePath.c_str() << endl;
+      exit(1);
+    }
+    m_streambuf = plain;
+  }
+
+  this->init(m_streambuf);
+}
+
+/** Destroy the stream buffer created by the constructor and clear the
+ *  pointer.
+ */
+InputFileStream::~InputFileStream()
+{
+  std::streambuf *owned = m_streambuf;
+  m_streambuf = NULL;
+  delete owned;
+}
+
+/** Intentionally a no-op: the buffer is released in the destructor. */
+void InputFileStream::Close()
+{
+  // nothing to do
+}
+
+}
+
diff --git a/moses2/legacy/InputFileStream.h b/moses2/legacy/InputFileStream.h
new file mode 100644
index 000000000..d8f78848c
--- /dev/null
+++ b/moses2/legacy/InputFileStream.h
@@ -0,0 +1,46 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+
+namespace Moses2
+{
+
+/** Used in place of std::istream, can read zipped files if it ends in .gz
+ */
+class InputFileStream: public std::istream
+{
+protected:
+ // Stream buffer allocated by the constructor and deleted in the destructor.
+ std::streambuf *m_streambuf;
+public:
+
+ // Opens filePath; see InputFileStream.cpp for the .gz handling and the
+ // behaviour when a plain file cannot be opened.
+ explicit InputFileStream(const std::string &filePath);
+ ~InputFileStream();
+
+ // Currently has an empty implementation in InputFileStream.cpp.
+ void Close();
+};
+
+}
+
diff --git a/moses2/legacy/Matrix.cpp b/moses2/legacy/Matrix.cpp
new file mode 100644
index 000000000..9d2abc8ab
--- /dev/null
+++ b/moses2/legacy/Matrix.cpp
@@ -0,0 +1,34 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <string>
+#include <iostream>
+#include "Matrix.h"
+#include "Util2.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+}
+
diff --git a/moses2/legacy/Matrix.h b/moses2/legacy/Matrix.h
new file mode 100644
index 000000000..6c498b53d
--- /dev/null
+++ b/moses2/legacy/Matrix.h
@@ -0,0 +1,106 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <iostream>
+#include "Util2.h"
+#include "../MemPool.h"
+
+namespace Moses2
+{
+/** Dense rows x cols matrix of T stored row-major in memory taken from a MemPool.
+ *
+ * The default constructor, copy constructor and destructor are declared but
+ * deliberately left without a definition.
+ */
+template<typename T>
+class Matrix
+{
+protected:
+  size_t m_rows, m_cols; /**< number of rows and columns */
+  T *m_array; /**< m_rows * m_cols elements, element (row, col) lives at row * m_cols + col */
+
+  Matrix(); // not implemented
+  Matrix(const Matrix &copy); // not implemented
+
+public:
+  /** Take storage for rows * cols elements from pool.
+   *  This constructor itself does not assign any element; call Init() or
+   *  InitTriangle() to give the cells a value.
+   */
+  Matrix(MemPool &pool, size_t rows, size_t cols) :
+    m_rows(rows), m_cols(cols), m_array(pool.Allocate<T>(rows * cols))
+  {
+  }
+
+  ~Matrix(); // not implemented
+
+  /** Assign val to every cell on or above the main diagonal (col >= row).
+   *  Only meaningful for square matrices, which is asserted.
+   */
+  void InitTriangle(const T &val)
+  {
+    assert(m_rows == m_cols);
+    for (size_t r = 0; r < m_rows; ++r) {
+      T *rowStart = m_array + r * m_cols;
+      for (size_t c = r; c < m_cols; ++c) {
+        rowStart[c] = val;
+      }
+    }
+  }
+
+  /** Assign val to every cell, row by row. */
+  void Init(const T &val)
+  {
+    for (size_t r = 0; r < m_rows; ++r) {
+      T *rowStart = m_array + r * m_cols;
+      for (size_t c = 0; c < m_cols; ++c) {
+        rowStart[c] = val;
+      }
+    }
+  }
+
+  /** Side length of a square matrix (asserts that rows == cols). */
+  size_t GetSize() const
+  {
+    assert(m_rows == m_cols);
+    return m_rows;
+  }
+
+  /** Number of rows. */
+  size_t GetRows() const
+  {
+    return m_rows;
+  }
+
+  /** Number of columns. */
+  size_t GetCols() const
+  {
+    return m_cols;
+  }
+
+  /** Read-only access to the cell at (row, col); indices are not range-checked. */
+  const T &GetValue(size_t row, size_t col) const
+  {
+    const size_t offset = row * m_cols + col;
+    return m_array[offset];
+  }
+
+  /** Mutable access to the cell at (row, col); indices are not range-checked. */
+  T &GetValue(size_t row, size_t col)
+  {
+    const size_t offset = row * m_cols + col;
+    return m_array[offset];
+  }
+
+  /** Overwrite the cell at (row, col) with value; indices are not range-checked. */
+  void SetValue(size_t row, size_t col, const T &value)
+  {
+    const size_t offset = row * m_cols + col;
+    m_array[offset] = value;
+  }
+};
+
+}
+
diff --git a/moses2/legacy/OutputCollector.h b/moses2/legacy/OutputCollector.h
new file mode 100644
index 000000000..5504d9add
--- /dev/null
+++ b/moses2/legacy/OutputCollector.h
@@ -0,0 +1,165 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#ifdef WITH_THREADS
+#include <boost/thread/mutex.hpp>
+#endif
+
+#ifdef BOOST_HAS_PTHREADS
+#include <pthread.h>
+#endif
+
+#include <iostream>
+#include <map>
+#include <ostream>
+#include <fstream>
+#include <string>
+#include "util/exception.hh"
+
+namespace Moses2
+{
+/**
+ * Makes sure output goes in the correct order when multi-threading.
+ *
+ * Write() is called with a sequence number (sourceId). Output whose number is
+ * the next one expected is written straight away; anything else is cached and
+ * written as soon as all lower numbers have been seen.
+ *
+ * Streams opened by the path-based constructor, or marked with
+ * HoldOutputStream() / HoldDebugStream(), are deleted by the destructor.
+ *
+ * NOTE(review): without WITH_THREADS the class has no non-copyable member, so
+ * copying an instance that holds a stream would delete that stream twice.
+ **/
+class OutputCollector
+{
+public:
+  /** Collect into caller-supplied streams; they are not owned by default. */
+  OutputCollector(std::ostream* outStream = &std::cout,
+                  std::ostream* debugStream = &std::cerr) :
+    m_nextOutput(0), m_outStream(outStream), m_debugStream(debugStream),
+    m_isHoldingOutputStream(false), m_isHoldingDebugStream(false)
+  {
+  }
+
+  /** Collect into streams selected by name.
+   *
+   * xout: "/dev/stderr" selects std::cerr; "", "/dev/stdout" and "-" select
+   *       std::cout; anything else is opened as a file.
+   * xerr: "/dev/stdout" selects std::cout; "" and "/dev/stderr" select
+   *       std::cerr; anything else is opened as a file.
+   *
+   * Throws (via UTIL_THROW_IF2) if a file cannot be opened. Every stream this
+   * constructor allocated is released before the exception leaves, because
+   * the destructor does not run for a constructor that throws.
+   */
+  OutputCollector(std::string xout, std::string xerr = "") :
+    m_nextOutput(0), m_outStream(&std::cout), m_debugStream(&std::cerr),
+    m_isHoldingOutputStream(false), m_isHoldingDebugStream(false)
+  {
+    // TO DO open magic streams instead of regular ofstreams! [UG]
+
+    if (xout == "/dev/stderr") {
+      m_outStream = &std::cerr;
+    }
+    else if (xout.size() && xout != "/dev/stdout" && xout != "-") {
+      std::ofstream *file = new std::ofstream(xout.c_str());
+      const bool opened = file->good();
+      if (!opened) delete file; // nothing else owns it yet
+      UTIL_THROW_IF2(!opened, "Failed to open output file " << xout);
+      m_outStream = file;
+      m_isHoldingOutputStream = true;
+    }
+
+    if (xerr == "/dev/stdout") {
+      m_debugStream = &std::cout;
+    }
+    else if (xerr.size() && xerr != "/dev/stderr") {
+      std::ofstream *file = new std::ofstream(xerr.c_str());
+      const bool opened = file->good();
+      if (!opened) {
+        delete file;
+        // Also give back an output file opened a few lines above.
+        if (m_isHoldingOutputStream) delete m_outStream;
+      }
+      UTIL_THROW_IF2(!opened, "Failed to open debug stream " << xerr);
+      m_debugStream = file;
+      m_isHoldingDebugStream = true;
+    }
+  }
+
+  /** Deletes the streams this object has been told it holds. */
+  ~OutputCollector()
+  {
+    if (m_isHoldingOutputStream) delete m_outStream;
+    if (m_isHoldingDebugStream) delete m_debugStream;
+  }
+
+  /** Mark the output stream as owned, so the destructor deletes it. */
+  void HoldOutputStream()
+  {
+    m_isHoldingOutputStream = true;
+  }
+
+  /** Mark the debug stream as owned, so the destructor deletes it. */
+  void HoldDebugStream()
+  {
+    m_isHoldingDebugStream = true;
+  }
+
+  /** True when the output stream is std::cout itself. */
+  bool OutputIsCout() const
+  {
+    return (m_outStream == &std::cout);
+  }
+
+  /**
+   * Write or cache the output, as appropriate.
+   *
+   * If sourceId is the next number expected, output and debug are written and
+   * flushed immediately, followed by any cached entries that are now in
+   * sequence. Otherwise both strings are stored under sourceId for later.
+   **/
+  void Write(int sourceId, const std::string& output, const std::string& debug =
+               "")
+  {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    if (sourceId == m_nextOutput) {
+      //This is the one we were expecting
+      *m_outStream << output << std::flush;
+      *m_debugStream << debug << std::flush;
+      ++m_nextOutput;
+      //see if there's any more
+      std::map<int, std::string>::iterator iter;
+      while ((iter = m_outputs.find(m_nextOutput)) != m_outputs.end()) {
+        *m_outStream << iter->second << std::flush;
+        ++m_nextOutput;
+        // Look the debug entry up before erasing iter: iter->first is the key.
+        std::map<int, std::string>::iterator debugIter = m_debugs.find(
+              iter->first);
+        m_outputs.erase(iter);
+        if (debugIter != m_debugs.end()) {
+          *m_debugStream << debugIter->second << std::flush;
+          m_debugs.erase(debugIter);
+        }
+      }
+    }
+    else {
+      //save for later
+      m_outputs[sourceId] = output;
+      m_debugs[sourceId] = debug;
+    }
+  }
+
+private:
+  std::map<int, std::string> m_outputs; // cached output, keyed by sourceId
+  std::map<int, std::string> m_debugs; // cached debug text, keyed by sourceId
+  int m_nextOutput; // sequence number that may be written next
+  std::ostream* m_outStream;
+  std::ostream* m_debugStream;
+  bool m_isHoldingOutputStream; // destructor deletes m_outStream when set
+  bool m_isHoldingDebugStream; // destructor deletes m_debugStream when set
+#ifdef WITH_THREADS
+  boost::mutex m_mutex; // serialises Write()
+#endif
+
+public:
+  /** Replace the output stream pointer. The previous stream is not deleted
+   *  here and the holding flag is left as it was.
+   */
+  void SetOutputStream(std::ostream* outStream)
+  {
+    m_outStream = outStream;
+  }
+
+};
+
+} // namespace Moses2
+
diff --git a/moses2/legacy/OutputFileStream.cpp b/moses2/legacy/OutputFileStream.cpp
new file mode 100644
index 000000000..ad46f3a0c
--- /dev/null
+++ b/moses2/legacy/OutputFileStream.cpp
@@ -0,0 +1,88 @@
+// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include "OutputFileStream.h"
+#include "gzfilebuf.h"
+
+using namespace std;
+using namespace boost::algorithm;
+
+namespace Moses2
+{
+// Build an unopened stream: no file attached and m_open cleared.
+OutputFileStream::OutputFileStream()
+  : boost::iostreams::filtering_ostream()
+  , m_outFile(NULL)
+  , m_open(false)
+{}
+
+// Build a stream and immediately try to open filePath. The bool returned by
+// Open() is discarded, so a failed open is not reported from here.
+OutputFileStream::OutputFileStream(const std::string &filePath)
+  : m_outFile(NULL)
+  , m_open(false)
+{
+  this->Open(filePath);
+}
+
+// Closing on destruction flushes the stream and releases the file, if any.
+OutputFileStream::~OutputFileStream()
+{
+  this->Close();
+}
+
+/** Open the stream on filePath.
+ *
+ * "-" attaches standard output. Any other path is opened as a binary file,
+ * with a gzip compressor placed in front of it when the name ends in ".gz".
+ * The stream must not already be open (asserted).
+ *
+ * Returns true on success. Returns false if the file cannot be opened; in
+ * that case nothing stays allocated and the stream remains unopened.
+ */
+bool OutputFileStream::Open(const std::string &filePath)
+{
+  assert(!m_open);
+  if (filePath == std::string("-")) {
+    // Write to standard output. Leave m_outFile null.
+    this->push(std::cout);
+  }
+  else {
+    m_outFile = new ofstream(filePath.c_str(),
+                             ios_base::out | ios_base::binary);
+    if (m_outFile->fail()) {
+      // Close() ignores unopened streams, so the file object has to be
+      // released here or it would never be deleted.
+      delete m_outFile;
+      m_outFile = NULL;
+      return false;
+    }
+
+    if (ends_with(filePath, ".gz")) {
+      this->push(boost::iostreams::gzip_compressor());
+    }
+    this->push(*m_outFile);
+  }
+
+  m_open = true;
+  return true;
+}
+
+/** Flush and close the stream; does nothing if it is not open.
+ *
+ * The whole filter chain is cleared, not just its last element, so neither a
+ * gzip compressor nor std::cout pushed by Open() is left behind. That is what
+ * allows Open() to be called again afterwards.
+ */
+void OutputFileStream::Close()
+{
+  if (!m_open) return;
+  this->flush();
+  // reset() closes and removes every filter and device in the chain, which
+  // must happen before the file object they write to is deleted.
+  this->reset();
+  if (m_outFile) {
+    m_outFile->close();
+    delete m_outFile;
+    m_outFile = NULL;
+  }
+  m_open = false;
+}
+
+}
+
diff --git a/moses2/legacy/OutputFileStream.h b/moses2/legacy/OutputFileStream.h
new file mode 100644
index 000000000..27c0b4539
--- /dev/null
+++ b/moses2/legacy/OutputFileStream.h
@@ -0,0 +1,81 @@
+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <boost/iostreams/filtering_stream.hpp>
+
+namespace Moses2
+{
+
+/** Version of std::ostream with transparent compression.
+ *
+ * Transparently compresses output when writing to a file whose name ends in
+ * ".gz". Or, writes to stdout instead of a file when given a filename
+ * consisting of just a dash ("-").
+ */
+class OutputFileStream: public boost::iostreams::filtering_ostream
+{
+private:
+ /** File that needs flushing & closing when we close this stream.
+ *
+ * Is NULL when no file is opened, e.g. when writing to standard output.
+ * Allocated with new in Open() and deleted in Close(), so this object owns it.
+ */
+ std::ofstream *m_outFile;
+
+ /// Is this stream open? Set by a successful Open(), cleared by Close().
+ bool m_open;
+
+public:
+ /** Create an unopened OutputFileStream.
+ *
+ * Until it's been opened, nothing can be done with this stream.
+ */
+ OutputFileStream();
+
+ /// Create an OutputFileStream and open it by calling Open(); the result of Open() is not checked.
+ OutputFileStream(const std::string &filePath);
+ virtual ~OutputFileStream();
+
+ // TODO: Can we please just always throw an exception when this fails?
+ /** Open stream.
+ *
+ * If filePath is "-" (just a dash), this opens the stream for writing to
+ * standard output. Otherwise, it opens the given file. If the filename
+ * has the ".gz" suffix, output will be transparently compressed.
+ *
+ * Call Close() to close the file.
+ *
+ * Returns whether opening the file was successful. It may also throw an
+ * exception on failure.
+ */
+ bool Open(const std::string &filePath);
+
+ /// Flush and close stream. After this, the stream can be opened again.
+ void Close();
+};
+
+}
+
diff --git a/moses2/legacy/Parameter.cpp b/moses2/legacy/Parameter.cpp
new file mode 100644
index 000000000..5cb88645e
--- /dev/null
+++ b/moses2/legacy/Parameter.cpp
@@ -0,0 +1,1707 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <ctime>
+#include <iostream>
+#include <iterator>
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+#include "Parameter.h"
+#include "InputFileStream.h"
+#include "../FF/FeatureRegistry.h"
+#include "util/string_stream.hh"
+#include "util/exception.hh"
+#include "util/random.hh"
+
+using namespace std;
+using namespace boost::algorithm;
+namespace po = boost::program_options;
+
+namespace Moses2
+{
+
+/** define allowed parameters */
+Parameter::Parameter()
+{
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // general options
+ po::options_description main_opts("Main Options");
+ AddParam(main_opts, "config", "f", "location of the configuration file");
+ AddParam(main_opts, "input-file", "i",
+ "location of the input file to be translated");
+
+ AddParam(main_opts, "verbose", "v", "verbosity level of the logging");
+ AddParam(main_opts, "show-weights", "print feature weights and exit");
+ //AddParam(main_opts, "time-out",
+ // "seconds after which is interrupted (-1=no time-out, default is -1)");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // factorization options
+ po::options_description factor_opts("General Factorization Options");
+ //AddParam(factor_opts, "factor-delimiter", "fd",
+ // "specify a different factor delimiter than the default");
+ // one should be able to specify different factor delimiters for intput and output
+ AddParam(factor_opts, "mapping", "description of decoding steps"); // whatever that means ...
+ AddParam(factor_opts, "placeholder-factor",
+ "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // general search options
+ po::options_description search_opts("Search Options");
+ string desc = "Which search algorithm to use.\n";
+ desc += "0=normal stack (default)\n";
+ desc += "1=cube pruning\n";
+ desc += "3=chart (with cube pruning)\n";
+ desc += "4=stack with batched lm requests\n";
+ desc += "5=chart (with incremental search)\n";
+ desc += "6=string-to-tree\n";
+ desc += "7=tree-to-string\n";
+ desc += "8=tree-to-string (SCFG-based)\n";
+ desc += "9=forest-to-string";
+ AddParam(search_opts, "search-algorithm", desc);
+ AddParam(search_opts, "beam-threshold", "b",
+ "threshold for threshold pruning");
+ //AddParam(search_opts, "early-discarding-threshold", "edt",
+ // "threshold for constructing hypotheses based on estimate cost");
+ AddParam(search_opts, "stack", "s",
+ "maximum stack size for histogram pruning. 0 = unlimited stack size");
+ //AddParam(search_opts, "stack-diversity", "sd",
+ // "minimum number of hypothesis of each coverage in stack (default 0)");
+
+ // feature weight-related options
+ //AddParam(search_opts, "weight-file", "wf",
+ // "feature weights file. Do *not* put weights for 'core' features in here - they go in moses.ini");
+ AddParam(search_opts, "weight",
+ "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated");
+
+ AddParam(search_opts, "feature-overwrite",
+ "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\"");
+
+ po::options_description tune_opts("Options used in tuning.");
+ AddParam(tune_opts, "weight-overwrite",
+ "special parameter for mert. All on 1 line. Overrides weights specified in 'weights' argument");
+ AddParam(tune_opts, "feature-add",
+ "Add a feature function on the command line. Used by mira to add BLEU feature");
+ AddParam(tune_opts, "weight-add",
+ "Add weight for FF if it doesn't exist, i.e weights here are added 1st, and can be override by the ini file or on the command line. Used to specify initial weights for FF that was also specified on the copmmand line");
+
+ // phrase table limitations:
+ //AddParam(search_opts, "max-partial-trans-opt",
+ // "maximum number of partial translation options per input span (during mapping steps)");
+ //AddParam(search_opts, "max-trans-opt-per-coverage",
+ // "maximum number of translation options per input span (after applying mapping steps)");
+ AddParam(search_opts, "max-phrase-length",
+ "maximum phrase length (default 20)");
+ //AddParam(search_opts, "translation-option-threshold", "tot",
+ // "threshold for translation options relative to best for input phrase");
+
+ // miscellaneous search options
+ //AddParam(search_opts, "disable-discarding", "dd",
+ // "disable hypothesis discarding"); // ??? memory management? UG
+ //AddParam(search_opts, "phrase-drop-allowed", "da",
+ // "if present, allow dropping of source words"); //da = drop any (word); see -du for comparison
+ AddParam(search_opts, "threads", "th",
+ "number of threads to use in decoding (defaults to single-threaded)");
+
+ // distortion options
+ po::options_description disto_opts("Distortion options");
+ AddParam(disto_opts, "distortion-limit", "dl",
+ "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
+ AddParam(disto_opts, "monotone-at-punctuation", "mp",
+ "do not reorder over punctuation");
+ //AddParam(disto_opts, "early-distortion-cost", "edc",
+ // "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
+ //AddParam(disto_opts, "distortion",
+ // "configurations for each factorized/lexicalized reordering model."); // zombie parameter?
+
+ // cube pruning
+ po::options_description cube_opts("Cube pruning options.");
+ AddParam(cube_opts, "cube-pruning-pop-limit", "cbp",
+ "How many hypotheses should be popped for each stack. (default = 1000)");
+ AddParam(cube_opts, "cube-pruning-diversity", "cbd",
+ "How many hypotheses should be created for each coverage. (default = 0)");
+ AddParam(cube_opts, "cube-pruning-lazy-scoring", "cbls",
+ "Don't fully score a hypothesis until it is popped");
+ //AddParam(cube_opts, "cube-pruning-deterministic-search", "cbds",
+ // "Break ties deterministically during search");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // minimum bayes risk decoding
+ po::options_description mbr_opts(
+ "Minimum Bayes Risk (MBR), Lattice MBR, and Consensus decoding");
+
+ //AddParam(mbr_opts, "minimum-bayes-risk", "mbr",
+ // "use miminum Bayes risk to determine best translation");
+ //AddParam(mbr_opts, "mbr-size",
+ // "number of translation candidates considered in MBR decoding (default 200)");
+ //AddParam(mbr_opts, "mbr-scale",
+ // "scaling factor to convert log linear score probability in MBR decoding (default 1.0)");
+
+ //AddParam(mbr_opts, "lminimum-bayes-risk", "lmbr",
+ // "use lattice miminum Bayes risk to determine best translation");
+ //AddParam(mbr_opts, "consensus-decoding", "con",
+ // "use consensus decoding (De Nero et. al. 2009)");
+
+ po::options_description lmbr_opts("Options specific to Lattic MBR");
+ //AddParam(lmbr_opts, "lmbr-p", "unigram precision value for lattice mbr");
+ //AddParam(lmbr_opts, "lmbr-r", "ngram precision decay value for lattice mbr");
+ //AddParam(lmbr_opts, "lmbr-thetas", "theta(s) for lattice mbr calculation");
+ //AddParam(mbr_opts, "lmbr-map-weight",
+ // "weight given to map solution when doing lattice MBR (default 0)");
+ //AddParam(mbr_opts, "lmbr-pruning-factor",
+ // "average number of nodes/word wanted in pruned lattice");
+ //AddParam(mbr_opts, "lattice-hypo-set",
+ // "to use lattice as hypo set during lattice MBR");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // OOV handling options
+ po::options_description oov_opts("OOV Handling Options");
+ AddParam(oov_opts, "drop-unknown", "du",
+ "drop unknown words instead of copying them");
+ AddParam(oov_opts, "mark-unknown", "mu", "mark unknown words in output");
+ AddParam(oov_opts, "unknown-word-prefix",
+ "prefix to unknwon word when marked (default: 'UNK')");
+ AddParam(oov_opts, "unknown-word-suffix",
+ "suffix to unknwon word when marked (default: '')");
+ //AddParam(oov_opts, "lmodel-oov-feature",
+ // "add language model oov feature, one per model");
+ //AddParam(oov_opts, "output-unknowns",
+ // "Output the unknown (OOV) words to the given file, one line per sentence");
+ //AddParam(oov_opts, "always-create-direct-transopt",
+ // "Always create a translation that translates the source word ad-verbatim");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // input options
+ po::options_description input_opts("Input Format Options");
+ AddParam(input_opts, "input-factors", "list of factors in the input");
+ AddParam(input_opts, "inputtype",
+ "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
+ AddParam(input_opts, "xml-input", "xi",
+ "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'constraint', 'ignore'");
+ //AddParam(input_opts, "xml-brackets", "xb",
+ // "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode");
+ //AddParam(input_opts, "start-translation-id", "Id of 1st input. Default = 0");
+ //AddParam(input_opts, "alternate-weight-setting", "aws",
+ // "alternate set of weights to used per xml specification");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // output options
+ po::options_description output_opts("Output Options");
+ //AddParam(output_opts, "report-all-factors",
+ // "report all factors in output, not just first");
+ AddParam(output_opts, "output-factors", "list if factors in the output");
+ //AddParam(output_opts, "print-id",
+ // "prefix translations with id. Default if false");
+ //AddParam(output_opts, "print-passthrough",
+ // "output the sgml tag <passthrough> without any computation on that. Default is false");
+ //AddParam(output_opts, "print-passthrough-in-n-best",
+ // "output the sgml tag <passthrough> without any computation on that in each entry of the n-best-list. Default is false");
+ //AddParam(output_opts, "print-all-derivations",
+ // "to print all derivations in search graph");
+ AddParam(output_opts, "translation-details", "T",
+ "for each best hypothesis, report translation details to the given file");
+
+ AddParam(output_opts, "output-hypo-score",
+ "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
+ //AddParam(output_opts, "output-word-graph", "owg",
+ // "Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos");
+ //AddParam(output_opts, "tree-translation-details", "Ttree",
+ // "for each hypothesis, report translation details with tree fragment info to given file");
+ //AddParam(output_opts, "print-alignment-info",
+ // "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false");
+ //AddParam(output_opts, "alignment-output-file",
+ // "print output word alignments into given file");
+ //AddParam(output_opts, "sort-word-alignment",
+ // "Sort word alignments for more consistent display. 0=no sort (default), 1=target order");
+ AddParam(output_opts, "report-segmentation", "t",
+ "report phrase segmentation in the output");
+ AddParam(output_opts, "report-segmentation-enriched", "tt",
+ "report phrase segmentation in the output with additional information");
+
+ // translation-all-details was introduced in the context of DIMwid: Decoder Inspection for Moses (using Widgets)
+ // see here: https://ufal.mff.cuni.cz/pbml/100/art-kurtz-seemann-braune-maletti.pdf
+ //AddParam(output_opts, "translation-all-details", "Tall",
+ // "for all hypotheses, report translation details to the given file");
+
+ po::options_description osg_opts("Options for outputting search graphs");
+ //AddParam(osg_opts, "output-search-graph", "osg",
+ // "Output connected hypotheses of search into specified filename");
+ //AddParam(osg_opts, "output-search-graph-extended", "osgx",
+ // "Output connected hypotheses of search into specified filename, in extended format");
+ //AddParam(osg_opts, "unpruned-search-graph", "usg",
+ // "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
+ //AddParam(osg_opts, "output-search-graph-slf", "slf",
+ // "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed by a directory name, which must exist");
+ //AddParam(output_opts, "include-lhs-in-search-graph", "lhssg",
+ // "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
+#ifdef HAVE_PROTOBUF
+ //AddParam(osg_opts,"output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
+#endif
+ //AddParam(osg_opts, "output-search-graph-hypergraph",
+ // "DEPRECATED! Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // nbest-options
+ po::options_description nbest_opts("N-best Options");
+ AddParam(nbest_opts, "n-best-list",
+ "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
+ // AddParam(nbest_opts,"n-best-list-file", "file of n-best-list to be generated; specify - as the file in order to write to STDOUT");
+ // AddParam(nbest_opts,"n-best-list-size", "size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
+ //AddParam(nbest_opts, "labeled-n-best-list",
+ // "print out labels for each weight type in n-best list. default is true");
+ //AddParam(nbest_opts, "n-best-trees",
+ // "Write n-best target-side trees to n-best-list");
+ AddParam(nbest_opts, "n-best-factor",
+ "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. default is 0");
+ //AddParam(nbest_opts, "report-all-factors-in-n-best",
+ // "Report all factors in n-best-lists. Default is false");
+ //AddParam(nbest_opts, "lattice-samples",
+ // "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list");
+ //AddParam(nbest_opts, "include-segmentation-in-n-best",
+ // "include phrasal segmentation in the n-best list. default is false");
+ //AddParam(nbest_opts, "print-alignment-info-in-n-best",
+ // "Include word-to-word alignment in the n-best list. Word-to-word alignments are taken from the phrase table if any. Default is false");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // server options
+ po::options_description server_opts("Moses Server Options");
+ AddParam(server_opts, "server", "Run moses as a translation server.");
+ AddParam(server_opts, "server-port", "Port for moses server");
+ AddParam(server_opts, "server-log", "Log destination for moses server");
+ //AddParam(server_opts, "session-timeout",
+ // "Timeout for sessions, e.g. '2h30m' or 1d (=24h)");
+ //AddParam(server_opts, "session-cache-size",
+ // string("Max. number of sessions cached.")
+ // + "Least recently used session is dumped first.");
+ AddParam(server_opts, "serial",
+ "Run server in serial mode, processing only one request at a time.");
+
+ AddParam(server_opts,"server-maxconn",
+ "Max. No of simultaneous HTTP transactions allowed by the server.");
+ AddParam(server_opts,"server-maxconn-backlog",
+ "Max. No. of requests the OS will queue if the server is busy.");
+ AddParam(server_opts,"server-keepalive-maxconn",
+ "Max. No. of requests the server will accept on a single TCP connection.");
+ AddParam(server_opts,"server-keepalive-timeout",
+ "Max. number of seconds the server will keep a persistent connection alive.");
+ AddParam(server_opts,"server-timeout",
+ "Max. number of seconds the server will wait for a client to submit a request once a connection has been established.");
+
+ po::options_description irstlm_opts("IRSTLM Options");
+ //AddParam(irstlm_opts, "clean-lm-cache",
+ // "clean language model caches after N translations (default N=1)");
+
+ po::options_description chart_opts("Chart Decoding Options");
+ AddParam(chart_opts, "max-chart-span",
+ "maximum num. of source word chart rules can consume (default 10)");
+ AddParam(chart_opts, "non-terminals",
+ "list of non-term symbols, space separated");
+ //AddParam(chart_opts, "rule-limit",
+ // "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
+ //AddParam(chart_opts, "source-label-overlap",
+ // "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
+ //AddParam(chart_opts, "unknown-lhs",
+ // "file containing target lhs of unknown words. 1 per line: LHS prob");
+
+ po::options_description misc_opts("Miscellaneous Options");
+ //AddParam(misc_opts, "mira", "do mira training");
+ //AddParam(misc_opts, "description",
+ // "Source language, target language, description");
+ //AddParam(misc_opts, "no-cache",
+ // "Disable all phrase-table caching. Default = false (ie. enable caching)");
+ //AddParam(misc_opts, "default-non-term-for-empty-range-only",
+ // "Don't add [X] to all ranges, just ranges where there isn't a source non-term. Default = false (ie. add [X] everywhere)");
+ //AddParam(misc_opts, "s2t-parsing-algorithm",
+ // "Which S2T parsing algorithm to use. 0=recursive CYK+, 1=scope-3 (default = 0)");
+
+ //AddParam(o,"continue-partial-translation", "cpt", "start from nonempty hypothesis");
+ AddParam(misc_opts, "decoding-graph-backoff", "dpb",
+ "only use subsequent decoding paths for unknown spans of given length");
+ //AddParam(misc_opts, "references",
+ // "Reference file(s) - used for bleu score feature");
+ //AddParam(misc_opts, "recover-input-path", "r",
+ // "(conf net/word lattice only) - recover input path corresponding to the best translation");
+ //AddParam(misc_opts, "link-param-count",
+ // "Number of parameters on word links when using confusion networks or lattices (default = 1)");
+ //AddParam(misc_opts, "feature-name-overwrite",
+ // "Override feature name (NOT arguments). Eg. SRILM-->KENLM, PhraseDictionaryMemory-->PhraseDictionaryScope3");
+
+ AddParam(misc_opts, "feature", "All the feature functions should be here");
+ //AddParam(misc_opts, "context-string",
+ // "A (tokenized) string containing context words for context-sensitive translation.");
+ //AddParam(misc_opts, "context-weights",
+ // "A key-value map for context-sensitive translation.");
+ //AddParam(misc_opts, "context-window",
+ // "Context window (in words) for context-sensitive translation: {+|-|+-}<number>.");
+ AddParam(misc_opts, "cpu-affinity-offset", "CPU Affinity. Default = -1 (no affinity)");
+ AddParam(misc_opts, "cpu-affinity-increment",
+ "Set to 1 (default) to put each thread on different cores. 0 to run all threads on one core");
+
+ // Compact phrase table and reordering table.
+ po::options_description cpt_opts(
+ "Options when using compact phrase and reordering tables.");
+ //AddParam(cpt_opts, "minphr-memory",
+ // "Load phrase table in minphr format into memory");
+ //AddParam(cpt_opts, "minlexr-memory",
+ // "Load lexical reordering table in minlexr format into memory");
+
+ po::options_description spe_opts("Simulated Post-editing Options");
+ //AddParam(spe_opts, "spe-src", "Simulated post-editing. Source filename");
+ //AddParam(spe_opts, "spe-trg", "Simulated post-editing. Target filename");
+ //AddParam(spe_opts, "spe-aln", "Simulated post-editing. Alignment filename");
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // DEPRECATED options
+ po::options_description deprec_opts("Deprecated Options");
+ AddParam(deprec_opts, "text-type",
+ "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features");
+
+ /*
+ AddParam(deprec_opts, "link-param-count",
+ "DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)");
+ AddParam(deprec_opts, "weight-slm", "slm",
+ "DEPRECATED. DO NOT USE. weight(s) for syntactic language model");
+ AddParam(deprec_opts, "weight-bl", "bl",
+ "DEPRECATED. DO NOT USE. weight for bleu score feature");
+ AddParam(deprec_opts, "weight-d", "d",
+ "DEPRECATED. DO NOT USE. weight(s) for distortion (reordering components)");
+ AddParam(deprec_opts, "weight-dlm", "dlm",
+ "DEPRECATED. DO NOT USE. weight for discriminative LM feature function (on top of sparse weights)");
+ AddParam(deprec_opts, "weight-lr", "lr",
+ "DEPRECATED. DO NOT USE. weight(s) for lexicalized reordering, if not included in weight-d");
+ AddParam(deprec_opts, "weight-generation", "g",
+ "DEPRECATED. DO NOT USE. weight(s) for generation components");
+ AddParam(deprec_opts, "weight-i", "I",
+ "DEPRECATED. DO NOT USE. weight(s) for word insertion - used for parameters from confusion network and lattice input links");
+ AddParam(deprec_opts, "weight-l", "lm",
+ "DEPRECATED. DO NOT USE. weight(s) for language models");
+ AddParam(deprec_opts, "weight-lex", "lex",
+ "DEPRECATED. DO NOT USE. weight for global lexical model");
+ AddParam(deprec_opts, "weight-glm", "glm",
+ "DEPRECATED. DO NOT USE. weight for global lexical feature, sparse producer");
+ AddParam(deprec_opts, "weight-wt", "wt",
+ "DEPRECATED. DO NOT USE. weight for word translation feature");
+ AddParam(deprec_opts, "weight-pp", "pp",
+ "DEPRECATED. DO NOT USE. weight for phrase pair feature");
+ AddParam(deprec_opts, "weight-pb", "pb",
+ "DEPRECATED. DO NOT USE. weight for phrase boundary feature");
+ AddParam(deprec_opts, "weight-t", "tm",
+ "DEPRECATED. DO NOT USE. weights for translation model components");
+ AddParam(deprec_opts, "weight-p", "w",
+ "DEPRECATED. DO NOT USE. weight for phrase penalty");
+ AddParam(deprec_opts, "weight-w", "w",
+ "DEPRECATED. DO NOT USE. weight for word penalty");
+ AddParam(deprec_opts, "weight-u", "u",
+ "DEPRECATED. DO NOT USE. weight for unknown word penalty");
+ AddParam(deprec_opts, "weight-e", "e",
+ "DEPRECATED. DO NOT USE. weight for word deletion");
+ AddParam(deprec_opts, "input-scores",
+ "DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)");
+ AddParam(deprec_opts, "dlm-model",
+ "DEPRECATED. DO NOT USE. Order, factor and vocabulary file for discriminative LM. Use * for filename to indicate unlimited vocabulary.");
+ AddParam(deprec_opts, "generation-file",
+ "DEPRECATED. DO NOT USE. location and properties of the generation table");
+ AddParam(deprec_opts, "global-lexical-file", "gl",
+ "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation model file");
+ AddParam(deprec_opts, "glm-feature",
+ "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation feature, sparse producer");
+ AddParam(deprec_opts, "lmodel-file",
+ "DEPRECATED. DO NOT USE. location and properties of the language models");
+ AddParam(deprec_opts, "lmodel-dub",
+ "DEPRECATED. DO NOT USE. dictionary upper bounds of language models");
+#ifdef HAVE_SYNLM
+ AddParam(deprec_opts,"slmodel-file", "DEPRECATED. DO NOT USE. location of the syntactic language model file(s)");
+ AddParam(deprec_opts,"slmodel-factor", "DEPRECATED. DO NOT USE. factor to use with syntactic language model");
+ AddParam(deprec_opts,"slmodel-beam", "DEPRECATED. DO NOT USE. beam width to use with syntactic language model's parser");
+#endif
+ AddParam(deprec_opts, "ttable-file",
+ "DEPRECATED. DO NOT USE. location and properties of the translation tables");
+ AddParam(deprec_opts, "phrase-pair-feature",
+ "DEPRECATED. DO NOT USE. Source and target factors for phrase pair feature");
+ AddParam(deprec_opts, "phrase-boundary-source-feature",
+ "DEPRECATED. DO NOT USE. Source factors for phrase boundary feature");
+ AddParam(deprec_opts, "phrase-boundary-target-feature",
+ "DEPRECATED. DO NOT USE. Target factors for phrase boundary feature");
+ AddParam(deprec_opts, "phrase-length-feature",
+ "DEPRECATED. DO NOT USE. Count features for source length, target length, both of each phrase");
+ AddParam(deprec_opts, "target-word-insertion-feature",
+ "DEPRECATED. DO NOT USE. Count feature for each unaligned target word");
+ AddParam(deprec_opts, "source-word-deletion-feature",
+ "DEPRECATED. DO NOT USE. Count feature for each unaligned source word");
+ AddParam(deprec_opts, "word-translation-feature",
+ "DEPRECATED. DO NOT USE. Count feature for word translation according to word alignment");
+ */
+
+ po::options_description zombie_opts("Zombie Options");
+ //AddParam(zombie_opts, "distortion-file",
+ // "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
+
+ //mbr_opts.add(lmbr_opts);
+ search_opts.add(cube_opts);
+ //search_opts.add(mbr_opts);
+ search_opts.add(disto_opts);
+ search_opts.add(chart_opts);
+
+ //input_opts.add(spe_opts);
+
+ output_opts.add(nbest_opts);
+ //output_opts.add(osg_opts);
+
+ m_options.add(main_opts);
+ m_options.add(server_opts);
+ m_options.add(input_opts);
+ m_options.add(search_opts);
+ m_options.add(output_opts);
+ m_options.add(oov_opts);
+ m_options.add(factor_opts);
+ //m_options.add(cpt_opts);
+ //m_options.add(irstlm_opts);
+ m_options.add(tune_opts);
+ m_options.add(misc_opts);
+ //m_options.add(deprec_opts);
+ //m_options.add(zombie_opts);
+
+}
+
+// Destructor. The body is empty: no explicit cleanup is performed here.
+Parameter::~Parameter()
+{
+}
+
+/** Look up the values currently stored for a parameter.
+ * \param paramName key to search for in m_setting
+ * \return pointer to the stored value list, or NULL if the key is not present
+ */
+const PARAM_VEC *Parameter::GetParam(const std::string &paramName) const
+{
+  const PARAM_MAP::const_iterator found = m_setting.find(paramName);
+  return (found != m_setting.end()) ? &found->second : NULL;
+}
+
+/** Register a parameter that has no abbreviation (helper for the constructor).
+ * The name is marked as valid, its description is recorded, and the option
+ * is added to the given option group.
+ * \param optgroup option group that receives the new option
+ * \param paramName full name of the parameter
+ * \param description help text for the parameter
+ */
+void Parameter::AddParam(po::options_description& optgroup,
+                         string const& paramName, string const& description)
+{
+  m_valid[paramName] = true;
+  m_description[paramName] = description;
+
+  const char* const optName = paramName.c_str();
+  const char* const optHelp = description.c_str();
+  optgroup.add_options()(optName, optHelp);
+}
+
+/** Register a parameter together with its abbreviation (helper for the constructor).
+ * Both names are marked as valid and mapped onto each other. The abbreviation
+ * is only appended to the option name (as "name,a") when it is a single
+ * character.
+ * \param optgroup option group that receives the new option
+ * \param paramName full name of the parameter
+ * \param abbrevName short name of the parameter
+ * \param description help text for the parameter
+ */
+void Parameter::AddParam(po::options_description& optgroup,
+                         string const& paramName, string const& abbrevName,
+                         string const& description)
+{
+  m_valid[paramName] = true;
+  m_valid[abbrevName] = true;
+  m_abbreviation[paramName] = abbrevName;
+  m_fullname[abbrevName] = paramName;
+  m_description[paramName] = description;
+
+  const bool singleCharAbbrev = (abbrevName.size() == 1);
+  const string optname =
+    singleCharAbbrev ? (paramName + (string(",") + abbrevName)) : paramName;
+  optgroup.add_options()(optname.c_str(), description.c_str());
+}
+
+/** Print a usage banner followed by all registered option groups to stderr. */
+void Parameter::Explain()
+{
+  // m_options is streamed as a whole; the per-parameter loop over
+  // m_description that used to live here (commented out) is no longer needed.
+  cerr << "Usage:" << endl << m_options << endl;
+}
+
+/** Decide whether a command line token is a switch rather than a value.
+ * A token counts as a switch when it has at least two characters, starts
+ * with '-', and the character after the dash is not a decimal digit
+ * (so "-5" is treated as a value).
+ * \param token token from the command line to be checked; may be NULL
+ * \return true if the token is a switch
+ **/
+
+bool Parameter::isOption(const char* token)
+{
+  if (token == NULL) return false;
+
+  const std::string tokenString(token);
+  if (tokenString.size() < 2 || !starts_with(tokenString, "-")) return false;
+
+  const char afterDash = tokenString[1];
+  const bool isDigit = (afterDash >= '0' && afterDash <= '9');
+  return !isDigit;
+}
+
+/** Load all parameters from a configuration file only.
+ * Builds the argument list "executable -f <filePath>" and forwards it to
+ * LoadParam(int, char*[]).
+ * \param filePath path of the configuration file
+ * \return the result of LoadParam(int, char*[])
+ */
+bool Parameter::LoadParam(const string &filePath)
+{
+  const int fakeArgc = 3;
+  const char *fakeArgv[fakeArgc] = { "executable", "-f", filePath.c_str() };
+  return LoadParam(fakeArgc, const_cast<char**>(fakeArgv));
+}
+
+/** load all parameters from the configuration file and the command line switches
+ * \param argc number of entries in xargv
+ * \param xargv command line tokens; a token starting with "--" is read as if
+ *        it started with a single "-"
+ * \return true if a configuration file was named and read, no illegal switch
+ *         was found and Validate() succeeded; false otherwise
+ */
+bool Parameter::LoadParam(int argc, char* xargv[])
+{
+  // legacy parameter handling: all parameters are expected
+  // to start with a single dash, so "--name" is mapped onto "-name".
+  // The adjusted pointers are kept in a std::vector instead of a
+  // variable-length array (a compiler extension, not standard C++);
+  // the extra last slot stays NULL.
+  const size_t numArgs = (argc > 0) ? static_cast<size_t>(argc) : 0;
+  std::vector<char*> argStore(numArgs + 1, static_cast<char*>(NULL));
+  for (size_t i = 0; i < numArgs; ++i) {
+    argStore[i] = xargv[i];
+    if (strlen(argStore[i]) > 2 && argStore[i][0] == '-'
+        && argStore[i][1] == '-') ++argStore[i];
+  }
+  char** argv = &argStore[0];
+
+  // config file (-f) arg mandatory
+  string configPath = FindParam("-f", argc, argv);
+  if (configPath == "") {
+    configPath = FindParam("-config", argc, argv);
+  }
+  if (configPath == "") {
+    PrintCredit();
+    Explain();
+    FeatureRegistry::Instance().PrintFF();
+
+    cerr << endl;
+    cerr << "No configuration file was specified.  Use -config or -f";
+    cerr << endl;
+    return false;
+  }
+  if (!ReadConfigFile(configPath)) {
+    std::cerr << "Could not read " << configPath << std::endl;
+    return false;
+  }
+
+  // overwrite parameters with values from switches
+  for (PARAM_STRING::const_iterator iterParam = m_description.begin();
+       iterParam != m_description.end(); ++iterParam) {
+    const string &paramName = iterParam->first;
+    OverwriteParam("-" + paramName, paramName, argc, argv);
+  }
+
+  // ... also shortcuts
+  for (PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
+       iterParam != m_abbreviation.end(); ++iterParam) {
+    const string &paramName = iterParam->first;
+    const string &paramShortName = iterParam->second;
+    OverwriteParam("-" + paramShortName, paramName, argc, argv);
+  }
+
+  AddFeaturesCmd();
+
+  // logging of parameters that were set in either config or switch
+  int verbose = 1;
+  const PARAM_VEC *verboseParam = GetParam("verbose");
+  if (verboseParam != NULL && verboseParam->size() > 0) {
+    verbose = Scan<int>((*verboseParam)[0]);
+  }
+  if (verbose >= 1) { // only if verbose
+    cerr << "Defined parameters (per moses.ini or switch):" << endl;
+    for (PARAM_MAP::const_iterator iterParam = m_setting.begin();
+         iterParam != m_setting.end(); ++iterParam) {
+      cerr << "\t" << iterParam->first << ": ";
+      for (size_t i = 0; i < iterParam->second.size(); i++)
+        cerr << iterParam->second[i] << " ";
+      cerr << endl;
+    }
+  }
+
+  // don't mix old and new format.
+  // The key list uses "weight-lr" and "dlm-model", the spellings used by the
+  // deprecated option names and by ConvertWeightArgs(); the former
+  // "weight-lrl" / "dlm-mode" spellings matched no known option.
+  static const char* const oldFormatKeys[] = {
+    "weight-slm", "weight-bl", "weight-d", "weight-dlm", "weight-lr",
+    "weight-generation", "weight-i", "weight-l", "weight-lex", "weight-glm",
+    "weight-wt", "weight-pp", "weight-pb", "weight-t", "weight-w",
+    "weight-p", "weight-u", "weight-e", "dlm-model", "generation-file",
+    "global-lexical-file", "glm-feature", "lmodel-file", "lmodel-dub",
+    "slmodel-file", "slmodel-factor", "slmodel-beam", "ttable-file",
+    "phrase-pair-feature", "phrase-boundary-source-feature",
+    "phrase-boundary-target-feature", "phrase-length-feature",
+    "target-word-insertion-feature", "source-word-deletion-feature",
+    "word-translation-feature"
+  };
+  if (GetParam("feature") || GetParam("weight")) {
+    const size_t numOldKeys = sizeof(oldFormatKeys) / sizeof(oldFormatKeys[0]);
+    for (size_t k = 0; k < numOldKeys; ++k) {
+      if (GetParam(oldFormatKeys[k])) {
+        UTIL_THROW(util::Exception, "Don't mix old and new ini file format");
+      }
+    }
+  }
+
+  // convert old weights args to new format
+  if (GetParam("feature") == NULL) {
+    ConvertWeightArgs();
+  }
+  CreateWeightsMap();
+  WeightOverwrite();
+
+  // check for illegal parameters
+  bool noErrorFlag = true;
+  for (int i = 0; i < argc; i++) {
+    if (isOption(argv[i])) {
+      const string paramSwitch(argv[i]);
+      const string paramName = paramSwitch.substr(1);
+      if (m_valid.find(paramName) == m_valid.end()) {
+        std::cerr << "illegal switch: " << paramSwitch << std::endl;
+        noErrorFlag = false;
+      }
+    }
+  }
+
+  //Save("/tmp/moses.ini.new");
+
+  // check if parameters make sense
+  return Validate() && noErrorFlag;
+}
+
+/** Move every line stored under "feature-add" into the "feature" list,
+ * then drop the "feature-add" setting. Does nothing if it is not set.
+ */
+void Parameter::AddFeaturesCmd()
+{
+  const PARAM_VEC *extraFeatures = GetParam("feature-add");
+  if (extraFeatures == NULL) return;
+
+  for (size_t i = 0; i < extraFeatures->size(); ++i) {
+    AddFeature((*extraFeatures)[i]);
+  }
+
+  m_setting.erase("feature-add");
+}
+
+/** Return a copy of the weights stored under the given name.
+ * The lookup uses operator[] on m_weights, so an unknown name yields an
+ * empty vector and creates an empty entry for it.
+ * \param name key into m_weights
+ */
+std::vector<float> Parameter::GetWeights(const std::string &name)
+{
+  return m_weights[name];
+}
+
+/** Append a single-weight line of the form "<name><ind>= <weight>" to the
+ * "weight" setting.
+ * \param name feature name
+ * \param ind index appended directly to the name
+ * \param weight value written after "= "
+ */
+void Parameter::SetWeight(const std::string &name, size_t ind, float weight)
+{
+  PARAM_VEC &weightLines = m_setting["weight"];
+
+  string entry = name + SPrint(ind);
+  entry += "= ";
+  entry += SPrint(weight);
+
+  weightLines.push_back(entry);
+}
+
+/** Append a multi-weight line of the form "<name><ind>= w0 w1 ..." to the
+ * "weight" setting.
+ * \param name feature name
+ * \param ind index appended directly to the name
+ * \param weights values written after "=", each preceded by a space
+ */
+void Parameter::SetWeight(const std::string &name, size_t ind,
+                          const vector<float> &weights)
+{
+  PARAM_VEC &weightLines = m_setting["weight"];
+
+  string entry = name + SPrint(ind) + "=";
+  for (vector<float>::const_iterator w = weights.begin(); w != weights.end(); ++w) {
+    entry += " " + SPrint(*w);
+  }
+
+  weightLines.push_back(entry);
+}
+
+/** Add weights for "<name><ind>" to the "weight" setting.
+ * If a line starting with "<name><ind>=" already exists, the weights are
+ * appended to that line; otherwise a new line is created via SetWeight().
+ * \param name feature name
+ * \param ind index appended directly to the name
+ * \param weights values to append
+ */
+void Parameter::AddWeight(const std::string &name, size_t ind,
+                          const std::vector<float> &weights)
+{
+  PARAM_VEC &weightLines = m_setting["weight"];
+  const string prefix = name + SPrint(ind) + "=";
+
+  for (size_t lineInd = 0; lineInd < weightLines.size(); ++lineInd) {
+    string &existing = weightLines[lineInd];
+    if (existing.find(prefix) != 0) continue;
+
+    // found existing weight, most likely to be input weights. Append to this line
+    for (size_t w = 0; w < weights.size(); ++w) {
+      existing += " " + SPrint(weights[w]);
+    }
+    return;
+  }
+
+  // nothing found. Just set
+  SetWeight(name, ind, weights);
+}
+
+/** Convert an old-style weight setting into new-style "weight" lines.
+ * Each value stored under oldWeightName becomes its own line for
+ * newWeightName, after which the old setting is erased.
+ * NOTE(review): every value is written with index 0 (the index is never
+ * advanced) - confirm this is intended for settings with several values.
+ * \param oldWeightName key of the old setting in m_setting
+ * \param newWeightName feature name used in the new "weight" lines
+ */
+void Parameter::ConvertWeightArgsSingleWeight(const string &oldWeightName,
+    const string &newWeightName)
+{
+  const size_t ind = 0;
+
+  PARAM_MAP::iterator oldEntry = m_setting.find(oldWeightName);
+  if (oldEntry == m_setting.end()) return;
+
+  const PARAM_VEC &oldValues = oldEntry->second;
+  for (size_t pos = 0; pos < oldValues.size(); ++pos) {
+    const float value = Scan<float>(oldValues[pos]);
+    SetWeight(newWeightName, ind, value);
+  }
+
+  m_setting.erase(oldEntry);
+}
+
+// Convert old-style phrase-table settings into new-style "feature" lines and
+// "weight" entries. Reads "weight-i", "input-scores", "ttable-file",
+// "ttable-limit" and the weights stored under oldWeightName. On the normal
+// path "weight-i", oldWeightName, "ttable-file" and "ttable-limit" are erased
+// at the end; the early returns below (too few ttable-limits, old 4-field
+// format) leave without erasing them.
+void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
+{
+ const PARAM_VEC *params;
+
+ // process input weights 1st
+ params = GetParam("weight-i");
+ if (params) {
+ vector<float> inputWeights = Scan<float>(*params);
+ PARAM_VEC &numInputScores = m_setting["input-scores"];
+ // 1 input weight  -> input-scores becomes "1", "0"
+ // 2 input weights -> input-scores becomes "1", "1"
+ // in both cases an existing [input-scores] section is rejected
+ if (inputWeights.size() == 1) {
+ UTIL_THROW_IF2(numInputScores.size() != 0,
+ "No [input-scores] section allowed");
+ numInputScores.push_back("1");
+ numInputScores.push_back("0");
+ }
+ else if (inputWeights.size() == 2) {
+ UTIL_THROW_IF2(numInputScores.size() != 0,
+ "No [input-scores] section allowed");
+ numInputScores.push_back("1");
+ numInputScores.push_back("1");
+ }
+
+ SetWeight("PhraseDictionaryBinary", 0, inputWeights);
+ }
+
+ // convert actually pt feature
+ cerr << "Creating phrase table features" << endl;
+
+ size_t numInputScores = 0;
+ size_t numRealWordsInInput = 0;
+ // per phrase-table type: index of the most recently created table of that type
+ map<string, size_t> ptIndices;
+
+ params = GetParam("input-scores");
+ if (params) {
+ // NOTE(review): at(0) throws if the setting exists but holds no values
+ numInputScores = Scan<size_t>(params->at(0));
+
+ if (params->size() > 1) {
+ numRealWordsInInput = Scan<size_t>(params->at(1));
+ }
+ }
+
+ // load phrase translation tables
+ params = GetParam("ttable-file");
+ if (params) {
+ // weights
+ const vector<string> translationVector = *params;
+
+ vector<size_t> maxTargetPhrase;
+ params = GetParam("ttable-limit");
+ if (params) {
+ maxTargetPhrase = Scan<size_t>(*params);
+ }
+
+ // a single ttable-limit is replicated for every table; otherwise there must
+ // be at least one limit per table
+ if (maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
+ cerr << "Using uniform ttable-limit of " << maxTargetPhrase[0]
+ << " for all translation tables." << endl;
+ for (size_t i = 1; i < translationVector.size(); i++)
+ maxTargetPhrase.push_back(maxTargetPhrase[0]);
+ }
+ else if (maxTargetPhrase.size() != 1
+ && maxTargetPhrase.size() < translationVector.size()) {
+ std::cerr << "You specified " << translationVector.size()
+ << " translation tables, but only " << maxTargetPhrase.size()
+ << " ttable-limits.";
+ return;
+ }
+
+ // MAIN LOOP
+ const PARAM_VEC &oldWeights = m_setting[oldWeightName];
+
+ // position of the next unread value in oldWeights, shared across all tables
+ size_t currOldInd = 0;
+ for (size_t currDict = 0; currDict < translationVector.size(); currDict++) {
+ util::StringStream ptLine;
+
+ vector<string> token = Tokenize(translationVector[currDict]);
+
+ if (currDict == 0 && token.size() == 4) {
+ std::cerr
+ << "Phrase table specification in old 4-field format. No longer supported";
+ return;
+ }
+ UTIL_THROW_IF2(token.size() < 5,
+ "Phrase table must have at least 5 scores");
+
+ int implementation = Scan<int>(token[0]);
+
+ // map the numeric implementation id onto a feature name; an unknown id
+ // leaves ptType empty
+ string ptType;
+ switch (implementation) {
+ case 0: // Memory
+ ptType = "PhraseDictionaryMemory";
+ break;
+ case 1: // Binary
+ ptType = "PhraseDictionaryBinary";
+ break;
+ case 2: // OnDisk
+ ptType = "PhraseDictionaryOnDisk";
+ break;
+ case 6: // SCFG
+ ptType = "PhraseDictionaryMemory";
+ break;
+ case 12: // Compact
+ ptType = "PhraseDictionaryCompact";
+ break;
+ case 8: // SuffixArray
+ ptType = "PhraseDictionarySuffixArray";
+ break;
+ case 14: // DSuffixArray
+ ptType = "PhraseDictionaryDynSuffixArray";
+ break;
+ case 15: // DCacheBased:
+ ptType = "PhraseDictionaryDynamicCacheBased";
+ break;
+ default:
+ break;
+ }
+
+ // first table of a type gets index 0, later ones count upwards
+ size_t ptInd;
+ if (ptIndices.find(ptType) == ptIndices.end()) {
+ ptIndices[ptType] = 0;
+ ptInd = 0;
+ }
+ else {
+ ptInd = ++ptIndices[ptType];
+ }
+
+ // weights
+ // token.size() >= 5 is enforced above, so numFFInd is always 3 here
+ size_t numFFInd = (token.size() == 4) ? 2 : 3;
+ size_t numFF = Scan<size_t>(token[numFFInd]);
+
+ vector<float> weights(numFF);
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
+ UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
+ "Errors converting old phrase-table weights to new weights");
+ float weight = Scan<float>(oldWeights[currOldInd]);
+ weights[currFF] = weight;
+
+ ++currOldInd;
+ }
+
+ // cerr << weights.size() << " PHRASE TABLE WEIGHTS "
+ // << __FILE__ << ":" << __LINE__ << endl;
+ AddWeight(ptType, ptInd, weights);
+
+ // actual pt
+ ptLine << ptType << " ";
+ ptLine << "input-factor=" << token[1] << " ";
+ ptLine << "output-factor=" << token[2] << " ";
+ ptLine << "path=" << token[4] << " ";
+
+ //characteristics of the phrase table
+
+ vector<FactorType> input = Tokenize<FactorType>(token[1], ","), output =
+ Tokenize<FactorType>(token[2], ",");
+ size_t numScoreComponent = Scan<size_t>(token[3]);
+ string filePath = token[4];
+
+ if (currDict == 0) {
+ // only the 1st pt. THis is shit
+ // TODO. find what the assumptions made by confusion network about phrase table output which makes
+ // it only work with binary file. This is a hack
+ numScoreComponent += numInputScores + numRealWordsInInput;
+ }
+
+ ptLine << "num-features=" << numScoreComponent << " ";
+ ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";
+
+ // NOTE(review): token[5] and token[6] are read without a size check
+ if (implementation == 8 || implementation == 14) {
+ ptLine << "target-path=" << token[5] << " ";
+ ptLine << "alignment-path=" << token[6] << " ";
+ }
+
+ AddFeature(ptLine.str());
+ } // for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
+ } // if (GetParam("ttable-file").size() > 0) {
+
+ m_setting.erase("weight-i");
+ m_setting.erase(oldWeightName);
+ m_setting.erase("ttable-file");
+ m_setting.erase("ttable-limit");
+
+}
+
+/** Append one feature-function line to the "feature" setting.
+ * \param line complete feature specification to store
+ */
+void Parameter::AddFeature(const std::string &line)
+{
+  m_setting["feature"].push_back(line);
+}
+
+// Convert the old "weight-d" / "distortion-file" settings into new-style
+// "feature" lines and "weight" entries, then erase both old settings.
+// Nothing is converted when "weight-d" is not set, but the erase calls at
+// the end still run.
+void Parameter::ConvertWeightArgsDistortion()
+{
+ const string oldWeightName = "weight-d";
+ const string oldLexReordingName = "distortion-file";
+
+ // distortion / lex distortion
+ const PARAM_VEC *oldWeights = GetParam(oldWeightName);
+
+ if (oldWeights) {
+ // search-algorithm absent, or its first value is "0" or "1"
+ const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
+ if (searchAlgo == NULL
+ || (searchAlgo->size() > 0
+ && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1"))) {
+ // phrase-based. Add distance distortion to list of features
+ AddFeature("Distortion");
+ SetWeight("Distortion", 0, Scan<float>(oldWeights->at(0)));
+ }
+
+ // everything but the last is lex reordering model
+
+ // reading starts at index 1: oldWeights[0] is the distance distortion weight
+ // used above
+ size_t currOldInd = 1;
+ const PARAM_VEC *lextable = GetParam(oldLexReordingName);
+
+ for (size_t indTable = 0; lextable && indTable < lextable->size();
+ ++indTable) {
+ const string &line = lextable->at(indTable);
+ vector<string> toks = Tokenize(line);
+
+ // fields used below: toks[0]=factors "in-out", toks[1]=type,
+ // toks[2]=number of features, toks[3]=path
+ // NOTE(review): toks is indexed without a size check
+ size_t numFF = Scan<size_t>(toks[2]);
+
+ vector<float> weights(numFF);
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
+ UTIL_THROW_IF2(oldWeights && currOldInd >= oldWeights->size(),
+ "Errors converting old distortion weights to new weights");
+ float weight = Scan<float>(oldWeights->at(currOldInd));
+ weights[currFF] = weight;
+
+ ++currOldInd;
+ }
+ SetWeight("LexicalReordering", indTable, weights);
+
+ util::StringStream strme;
+ strme << "LexicalReordering " << "type=" << toks[1] << " ";
+
+ vector<FactorType> factors = Tokenize<FactorType>(toks[0], "-");
+ UTIL_THROW_IF2(factors.size() != 2,
+ "Error in old factor specification for lexicalized reordering model: " << toks[0]);
+ strme << "input-factor=" << factors[0] << " output-factor=" << factors[1]
+ << " ";
+
+ strme << "num-features=" << toks[2] << " ";
+ strme << "path=" << toks[3];
+
+ AddFeature(strme.str());
+ }
+ }
+
+ m_setting.erase(oldWeightName);
+ m_setting.erase(oldLexReordingName);
+
+}
+
+/** Convert the old "weight-l" / "lmodel-file" settings into new-style
+ * "feature" lines and "weight" entries.
+ * Each lmodel-file line is expected to hold at least four fields:
+ * type id, factor, order and path. Type ids 0, 1, 8 and 9 map onto SRILM,
+ * IRSTLM, KENLM (lazyken=0) and KENLM (lazyken=1). Every model consumes one
+ * weight plus the number given for it in "lmodel-oov-feature", if any.
+ * For chart decoding each weight is passed through UntransformLMScore().
+ * Throws (via UTIL_THROW*) on an unknown type id, a malformed model line,
+ * a negative oov feature count, or too few weights.
+ * "lmodel-file" is always erased; "weight-l" is erased when it was set.
+ */
+void Parameter::ConvertWeightArgsLM()
+{
+  const string oldWeightName = "weight-l";
+  const string oldFeatureName = "lmodel-file";
+
+  // phrase-based if search-algorithm is absent, or its first value is "0" or "1"
+  bool isChartDecoding = true;
+  const PARAM_VEC *params = GetParam("search-algorithm");
+  if (params == NULL
+      || (params->size() > 0
+          && (Trim(params->at(0)) == "0" || Trim(params->at(0)) == "1"))) {
+    isChartDecoding = false;
+  }
+
+  vector<int> oovWeights;
+  params = GetParam("lmodel-oov-feature");
+  if (params) {
+    oovWeights = Scan<int>(*params);
+  }
+
+  PARAM_MAP::iterator iterMap = m_setting.find(oldWeightName);
+  if (iterMap != m_setting.end()) {
+
+    // position of the next unread value in weights, shared across all models
+    size_t currOldInd = 0;
+    const PARAM_VEC &weights = iterMap->second;
+    const PARAM_VEC &models = m_setting[oldFeatureName];
+    for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) {
+      const string &line = models[lmIndex];
+      vector<string> modelToks = Tokenize(line);
+
+      // modelToks[0..3] are read below; reject short lines instead of
+      // indexing past the end
+      UTIL_THROW_IF2(modelToks.size() < 4,
+                     "Language model line must have at least 4 fields: " << line);
+
+      int lmType = Scan<int>(modelToks[0]);
+
+      string newFeatureName;
+      switch (lmType) {
+      case 0:
+        newFeatureName = "SRILM";
+        break;
+      case 1:
+        newFeatureName = "IRSTLM";
+        break;
+      case 8:
+      case 9:
+        newFeatureName = "KENLM";
+        break;
+      default:
+        UTIL_THROW2("Unknown language model type id:" << lmType);
+      }
+
+      size_t numFF = 1;
+      if (oovWeights.size() > lmIndex) {
+        // a negative count would wrap around when added to the unsigned numFF
+        UTIL_THROW_IF2(oovWeights[lmIndex] < 0,
+                       "lmodel-oov-feature must not be negative: " << oovWeights[lmIndex]);
+        numFF += oovWeights[lmIndex];
+      }
+
+      vector<float> weightsLM(numFF);
+      for (size_t currFF = 0; currFF < numFF; ++currFF) {
+        UTIL_THROW_IF2(currOldInd >= weights.size(),
+                       "Errors converting old LM weights to new weights");
+        weightsLM[currFF] = Scan<float>(weights[currOldInd]);
+        if (isChartDecoding) {
+          weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
+        }
+
+        ++currOldInd;
+      }
+
+      SetWeight(newFeatureName, lmIndex, weightsLM);
+
+      string featureLine = newFeatureName + " " + "factor=" + modelToks[1] + " " // factor
+                           + "order=" + modelToks[2] + " " // order
+                           + "num-features=" + SPrint(numFF) + " ";
+      if (lmType == 9) {
+        featureLine += "lazyken=1 ";
+      }
+      else if (lmType == 8) {
+        featureLine += "lazyken=0 ";
+      }
+
+      featureLine += "path=" + modelToks[3]; // file
+
+      AddFeature(featureLine);
+    } // for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) {
+
+    m_setting.erase(iterMap);
+  }
+
+  m_setting.erase(oldFeatureName);
+}
+
+/** Convert old generation-model settings into new-style "feature" lines and
+ * "weight" entries.
+ * Each "generation-file" line is expected to hold at least four fields:
+ * input factor, output factor, number of features and path. The weights
+ * stored under oldWeightName are consumed in order, numFF per table.
+ * Throws (via UTIL_THROW_IF2) on a malformed line or too few weights.
+ * Both oldWeightName and "generation-file" are erased afterwards.
+ * \param oldWeightName key of the old weight setting in m_setting
+ * \param newWeightName feature name used for the new "weight" lines
+ */
+void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName,
+    const std::string &newWeightName)
+{
+  string oldFeatureName = "generation-file";
+
+  // old generation weights (operator[] creates an empty entry if absent;
+  // it is erased again at the end)
+  PARAM_VEC &oldWeights = m_setting[oldWeightName];
+
+  if (oldWeights.size() > 0) {
+    // position of the next unread value in oldWeights, shared across all tables
+    size_t currOldInd = 0;
+    PARAM_VEC &models = m_setting[oldFeatureName];
+
+    for (size_t indTable = 0; indTable < models.size(); ++indTable) {
+      string &line = models[indTable];
+      vector<string> modelToks = Tokenize(line);
+
+      // modelToks[0..3] are read below; reject short lines instead of
+      // indexing past the end
+      UTIL_THROW_IF2(modelToks.size() < 4,
+                     "Generation table line must have at least 4 fields: " << line);
+
+      size_t numFF = Scan<size_t>(modelToks[2]);
+
+      vector<float> weights(numFF);
+      for (size_t currFF = 0; currFF < numFF; ++currFF) {
+        UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
+                       "Errors converting old generation weights to new weights");
+        float weight = Scan<float>(oldWeights[currOldInd]);
+        weights[currFF] = weight;
+
+        ++currOldInd;
+      }
+      SetWeight(newWeightName, indTable, weights);
+
+      util::StringStream strme;
+      strme << "Generation " << "input-factor=" << modelToks[0] << " "
+            << "output-factor=" << modelToks[1] << " " << "num-features="
+            << modelToks[2] << " " << "path=" << modelToks[3];
+      AddFeature(strme.str());
+    }
+  }
+
+  m_setting.erase(oldWeightName);
+  m_setting.erase(oldFeatureName);
+}
+
+/** Convert the old "weight-w" setting into new-style "WordPenalty" weights.
+ * The i-th old value becomes the weight with index i. For chart decoding
+ * (search-algorithm set and its first value neither "0" nor "1") each
+ * weight is scaled by 0.434294482. The old setting is erased afterwards.
+ */
+void Parameter::ConvertWeightArgsWordPenalty()
+{
+  const std::string oldWeightName = "weight-w";
+  const std::string newWeightName = "WordPenalty";
+
+  const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
+  const bool isPhraseBased = searchAlgo == NULL
+                             || (searchAlgo->size() > 0
+                                 && (Trim(searchAlgo->at(0)) == "0"
+                                     || Trim(searchAlgo->at(0)) == "1"));
+  const bool isChartDecoding = !isPhraseBased;
+
+  PARAM_MAP::iterator oldEntry = m_setting.find(oldWeightName);
+  if (oldEntry == m_setting.end()) return;
+
+  const PARAM_VEC &oldValues = oldEntry->second;
+  for (size_t pos = 0; pos < oldValues.size(); ++pos) {
+    float weight = Scan<float>(oldValues[pos]);
+    if (isChartDecoding) {
+      weight *= 0.434294482;
+    }
+    SetWeight(newWeightName, pos, weight);
+  }
+
+  m_setting.erase(oldEntry);
+}
+
+/** Convert the old "weight-p" setting into a "PhrasePenalty" feature and weight.
+ * Does nothing if "weight-p" is not set; throws (via UTIL_THROW_IF2) unless
+ * it holds exactly one value. The old setting is erased afterwards.
+ */
+void Parameter::ConvertPhrasePenalty()
+{
+  const string oldWeightName = "weight-p";
+
+  const PARAM_VEC *oldValues = GetParam(oldWeightName);
+  if (oldValues == NULL) return;
+
+  UTIL_THROW_IF2(oldValues->size() != 1,
+                 "There should be only 1 phrase-penalty weight");
+
+  const float weight = Scan<float>(oldValues->at(0));
+  AddFeature("PhrasePenalty");
+  SetWeight("PhrasePenalty", 0, weight);
+
+  m_setting.erase(oldWeightName);
+}
+
+/** Convert all old-style weight arguments into the new feature/weight format.
+ * Throws if "weight-dlm" is set. Prints a message (but continues) when the
+ * new "weight" setting is present together with certain old weight settings.
+ * The individual converters are then run in a fixed order, and the
+ * "WordPenalty" and "UnknownWordPenalty" features are always added.
+ */
+void Parameter::ConvertWeightArgs()
+{
+  // can't handle discr LM. must do it manually 'cos of bigram/n-gram split
+  UTIL_THROW_IF2(m_setting.count("weight-dlm") != 0,
+                 "Can't handle discr LM. must do it manually 'cos of bigram/n-gram split");
+
+  // check that old & new format aren't mixed
+  if (m_setting.count("weight")) {
+    static const char* const oldKeys[] = {
+      "weight-i", "weight-t", "weight-w", "weight-l", "weight-u",
+      "weight-lex", "weight-generation", "weight-lr", "weight-d"
+    };
+    const size_t numOldKeys = sizeof(oldKeys) / sizeof(oldKeys[0]);
+
+    bool hasOldKey = false;
+    for (size_t k = 0; k < numOldKeys && !hasOldKey; ++k) {
+      hasOldKey = (m_setting.count(oldKeys[k]) != 0);
+    }
+    if (hasOldKey) {
+      cerr << "Do not mix old and new format for specify weights";
+    }
+  }
+
+  ConvertWeightArgsWordPenalty();
+  ConvertWeightArgsLM();
+  ConvertWeightArgsSingleWeight("weight-slm", "SyntacticLM");
+  ConvertWeightArgsSingleWeight("weight-u", "UnknownWordPenalty");
+  ConvertWeightArgsGeneration("weight-generation", "Generation");
+  ConvertWeightArgsDistortion();
+
+  // don't know or can't be bothered converting these weights
+  ConvertWeightArgsSingleWeight("weight-lr", "LexicalReordering");
+  ConvertWeightArgsSingleWeight("weight-bl", "BleuScoreFeature");
+  ConvertWeightArgsSingleWeight("weight-glm", "GlobalLexicalModel");
+  ConvertWeightArgsSingleWeight("weight-wt", "WordTranslationFeature");
+  ConvertWeightArgsSingleWeight("weight-pp", "PhrasePairFeature");
+  ConvertWeightArgsSingleWeight("weight-pb", "PhraseBoundaryFeature");
+
+  // TODO Can't find real name for the next two
+  ConvertWeightArgsSingleWeight("weight-e", "WordDeletion");
+  ConvertWeightArgsSingleWeight("weight-lex", "GlobalLexicalReordering");
+
+  ConvertPhrasePenalty();
+
+  AddFeature("WordPenalty");
+  AddFeature("UnknownWordPenalty");
+
+  ConvertWeightArgsPhraseModel("weight-t");
+}
+
+/** Fill m_weights from the "weight-add" setting first and the "weight"
+ * setting second, so entries from "weight" replace same-named ones.
+ */
+void Parameter::CreateWeightsMap()
+{
+  const PARAM_VEC &addedWeights = m_setting["weight-add"];
+  CreateWeightsMap(addedWeights);
+
+  const PARAM_VEC &mainWeights = m_setting["weight"];
+  CreateWeightsMap(mainWeights);
+}
+
+/** Parse weight lines into m_weights.
+ * The first token of each line, minus its last character, is the key;
+ * the remaining tokens are the float weights. An existing entry with the
+ * same key is replaced. Throws (via UTIL_THROW_IF2) if a line has fewer
+ * than two tokens.
+ * \param vec weight lines to parse
+ */
+void Parameter::CreateWeightsMap(const PARAM_VEC &vec)
+{
+  for (PARAM_VEC::const_iterator lineIt = vec.begin(); lineIt != vec.end(); ++lineIt) {
+    const string &line = *lineIt;
+    const vector<string> toks = Tokenize(line);
+    UTIL_THROW_IF2(toks.size() < 2, "Error in format of weights: " << line);
+
+    // drop the last character of the first token to obtain the key
+    const string name = toks[0].substr(0, toks[0].size() - 1);
+
+    vector<float> weights;
+    weights.reserve(toks.size() - 1);
+    for (size_t tokInd = 1; tokInd < toks.size(); ++tokInd) {
+      weights.push_back(Scan<float>(toks[tokInd]));
+    }
+
+    m_weights[name] = weights;
+  }
+}
+
+/** Apply the "weight-overwrite" setting to m_weights.
+ * The setting must be a single line of tokens. A token ending in "=" starts
+ * a new feature (its name is the token without the "="); every other token
+ * is a weight for the current feature. The token "x" keeps the weight the
+ * feature had at the same position before; this throws (via UTIL_THROW_IF2)
+ * if there is no such previous weight.
+ */
+void Parameter::WeightOverwrite()
+{
+  PARAM_VEC &vec = m_setting["weight-overwrite"];
+  if (vec.empty()) return;
+
+  // should only be on 1 line
+  UTIL_THROW_IF2(vec.size() != 1, "weight-overwrite should only be on 1 line");
+
+  const vector<string> toks = Tokenize(vec[0]);
+
+  string name;
+  vector<float> weights;
+  size_t cnt = 0;                                 // position within current feature
+  const std::vector<float>* oldWeights = NULL;    // previous weights of current feature
+
+  for (size_t tokInd = 0; tokInd < toks.size(); ++tokInd) {
+    const string &tok = toks[tokInd];
+
+    if (!ends_with(tok, "=")) {
+      // a weight for curr ff
+      if (tok == "x") {
+        UTIL_THROW_IF2(!oldWeights || cnt >= oldWeights->size(),
+                       "Keeping previous weight failed in weight-overwrite");
+        weights.push_back(oldWeights->at(cnt));
+      }
+      else {
+        weights.push_back(Scan<float>(tok));
+      }
+      ++cnt;
+      continue;
+    }
+
+    // start of new feature: save previous ff first
+    if (!name.empty()) {
+      m_weights[name] = weights;
+      weights.clear();
+    }
+
+    name = tok.substr(0, tok.size() - 1);
+    const std::map<std::string, std::vector<float> >::const_iterator found =
+      m_weights.find(name);
+    oldWeights = (found != m_weights.end()) ? &(found->second) : NULL;
+    cnt = 0;
+  }
+
+  // save the last feature
+  if (!name.empty()) {
+    m_weights[name] = weights;
+  }
+}
+
+/** Check that the parameter settings make sense.
+ * Reports every key in m_setting that is not in m_valid, checks that
+ * "lmodel-dub" (if non-empty) has as many entries as "lmodel-file", and,
+ * while no error has been found, checks that the single input file and the
+ * files named by "generation-file" and "distortion-file" exist.
+ * \return true if no problem was found
+ */
+bool Parameter::Validate()
+{
+  bool ok = true;
+
+  // every key that has been set must be known to m_valid
+  for (PARAM_MAP::const_iterator it = m_setting.begin(); it != m_setting.end(); ++it) {
+    const std::string &key = it->first;
+    if (m_valid.find(key) != m_valid.end()) continue;
+
+    std::cerr << "Unknown parameter " << key;
+    ok = false;
+  }
+
+  const PARAM_VEC &lmDub = m_setting["lmodel-dub"];
+  if (lmDub.size() > 0) {
+    const PARAM_VEC &lmFiles = m_setting["lmodel-file"];
+    if (lmFiles.size() != lmDub.size()) {
+      std::cerr << "Config and parameters specify "
+                << static_cast<int>(lmFiles.size())
+                << " language model files (lmodel-file), but "
+                << static_cast<int>(lmDub.size())
+                << " LM upperbounds (lmodel-dub)" << endl;
+      ok = false;
+    }
+  }
+
+  // do files exist?
+
+  // input file
+  if (ok && m_setting["input-file"].size() == 1) {
+    const std::string &inputFile = m_setting["input-file"][0];
+    ok = FileExists(inputFile);
+    if (!ok) {
+      std::cerr << endl << "Input file " << inputFile << " does not exist";
+    }
+  }
+
+  // raw tables in either un compressed or compressed form
+  std::vector<std::string> ext;
+  ext.push_back("");
+  ext.push_back(".gz");
+
+  // generation tables
+  if (ok) {
+    ok = FilesExist("generation-file", 3, ext);
+  }
+
+  // distortion: additionally accept the two prefix tree formats
+  ext.push_back(".binlexr.idx");
+  ext.push_back(".minlexr");
+  if (ok) {
+    ok = FilesExist("distortion-file", 3, ext);
+  }
+
+  return ok;
+}
+
+/** Check that the file named in one field of every entry of a parameter exists.
+ *  \param paramName parameter whose entries are inspected
+ *  \param fieldNo index of the token (as split by Tokenize) holding the path;
+ *         -1 selects the last token
+ *  \param extensions suffixes to try; the file counts as present as soon as
+ *         path + suffix exists for one of them
+ *  \return true if the parameter is not set or every entry passes; false
+ *          otherwise, after writing a message to std::cerr */
+bool Parameter::FilesExist(const string &paramName, int fieldNo,
+                           std::vector<std::string> const& extensions)
+{
+  PARAM_MAP::const_iterator entry = m_setting.find(paramName);
+  if (entry == m_setting.end()) {
+    // no param. therefore nothing to check
+    return true;
+  }
+
+  const std::vector<std::string> &lines = entry->second;
+  for (size_t lineInd = 0; lineInd < lines.size(); ++lineInd) {
+    const std::vector<std::string> fields = Tokenize(lines[lineInd]);
+
+    const size_t wanted =
+      (fieldNo == -1) ? fields.size() - 1 : static_cast<size_t>(fieldNo);
+
+    if (wanted >= fields.size()) {
+      std::cerr << "Expected at least " << (wanted + 1)
+                << " tokens per entry in '" << paramName << "', but only found "
+                << fields.size();
+      return false;
+    }
+    const string &pathStr = fields[wanted];
+
+    // stop at the first extension that yields an existing file
+    bool found = false;
+    for (size_t extInd = 0; extInd < extensions.size(); ++extInd) {
+      if (FileExists(pathStr + extensions[extInd])) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      std::cerr << "File " << pathStr << " does not exist";
+      return false;
+    }
+  }
+  return true;
+}
+
+/** Look for a switch among the command line arguments and return its value.
+ *  \return the argument that follows paramSwitch, or the empty string if the
+ *          switch is absent or is the last argument (an error message is
+ *          written to std::cerr in the latter case) */
+// TODO arg parsing like this does not belong in the library, it belongs
+// in moses-cmd
+string Parameter::FindParam(const string &paramSwitch, int argc, char* argv[])
+{
+  for (int pos = 0; pos < argc; ++pos) {
+    if (paramSwitch != argv[pos]) continue;
+
+    const int valuePos = pos + 1;
+    if (valuePos >= argc) {
+      std::cerr << "Option " << paramSwitch << " requires a parameter!";
+      // TODO return some sort of error, not the empty string
+      continue;
+    }
+    return argv[valuePos];
+  }
+  return "";
+}
+
+/** update parameter settings with command line switches
+ *  The arguments that follow the first occurrence of the switch, up to the
+ *  next option, replace the stored values position by position; surplus
+ *  arguments are appended and surplus stored values are kept.
+ * \param paramSwitch (potentially short) name of switch
+ * \param paramName full name of parameter
+ * \param argc number of arguments on command line
+ * \param argv values of parameters on command line */
+void Parameter::OverwriteParam(const string &paramSwitch,
+                               const string &paramName, int argc, char* argv[])
+{
+  // locate the first occurrence of the switch
+  int switchPos = 0;
+  while (switchPos < argc && string(argv[switchPos]) != paramSwitch) {
+    ++switchPos;
+  }
+  if (switchPos >= argc) return; // switch not given
+
+  // defines the parameter, important for boolean switches
+  PARAM_VEC &stored = m_setting[paramName];
+
+  size_t index = 0;
+  for (int argPos = switchPos + 1; argPos < argc && !isOption(argv[argPos]);
+       ++argPos, ++index) {
+    if (index < stored.size()) {
+      stored[index] = argv[argPos];
+    }
+    else {
+      stored.push_back(argv[argPos]);
+    }
+  }
+}
+
+/** read parameters from a configuration file
+ *  Text from '#' to the end of a line is ignored. A line starting with '['
+ *  opens a section named by the text up to the first ']'; every other
+ *  non-blank line is appended to the values of the current section.
+ *  \return always true */
+bool Parameter::ReadConfigFile(const string &filePath)
+{
+  InputFileStream inFile(filePath);
+  string line;
+  string section;
+  while (getline(inFile, line)) {
+    // drop the comment, then leading and trailing spaces/tabs
+    const size_t hashPos = line.find('#');
+    if (hashPos != string::npos) line.erase(hashPos);
+    line = Trim(line);
+
+    if (line.empty()) continue; // blank line. do nothing.
+
+    if (line[0] != '[') {
+      // add value to parameter
+      m_setting[section].push_back(line);
+      continue;
+    }
+
+    // new parameter; without a closing ']' the current section is kept
+    const size_t closePos = line.find(']');
+    if (closePos != string::npos) {
+      section = line.substr(1, closePos - 1);
+    }
+  }
+  return true;
+}
+
+/** One entry of the contributor list printed by Parameter::PrintCredit().
+ *  Every entry takes a sort key from util::rand_excl(1000) on construction
+ *  (presumably a random number below 1000 -- confirm against util), and
+ *  entries compare by that key only. */
+struct Credit
+{
+  string name, contact, currentPursuits, areaResponsibility;
+  int sortId;
+
+  Credit(string name, string contact, string currentPursuits,
+         string areaResponsibility) :
+    name(name), contact(contact), currentPursuits(currentPursuits),
+    areaResponsibility(areaResponsibility), sortId(util::rand_excl(1000))
+  {
+  }
+
+  // ordering ignores the text fields; only the sort key counts
+  bool operator<(const Credit &other) const
+  {
+    return sortId < other.sortId;
+  }
+
+};
+
+/** Print a credit on one line: the name, followed by each non-empty field. */
+std::ostream& operator<<(std::ostream &os, const Credit &credit)
+{
+  os << credit.name;
+  if (!credit.contact.empty()) {
+    os << "\t contact: " << credit.contact;
+  }
+  if (!credit.currentPursuits.empty()) {
+    os << " " << credit.currentPursuits;
+  }
+  if (!credit.areaResponsibility.empty()) {
+    os << " I'll answer question on: " << credit.areaResponsibility;
+  }
+  return os;
+}
+
+void Parameter::PrintCredit()
+{ // Writes the licence banner, the build date/time and the contributor list to std::cerr.
+ vector<Credit> everyone;
+ srand(time(NULL)); // NOTE(review): Credit takes its sort key from util::rand_excl -- confirm that this seed affects it
+
+ everyone.push_back(
+ Credit("Nicola Bertoldi", "911", "", "scripts & other stuff"));
+ everyone.push_back(Credit("Ondrej Bojar", "", "czech this out!", ""));
+ everyone.push_back(
+ Credit("Chris Callison-Burch", "anytime, anywhere",
+ "international playboy", ""));
+ everyone.push_back(Credit("Alexandra Constantin", "", "eu sunt varza", ""));
+ everyone.push_back(
+ Credit("Brooke Cowan", "brooke@csail.mit.edu",
+ "if you're going to san francisco, be sure to wear a flower in your hair",
+ ""));
+ everyone.push_back(
+ Credit("Chris Dyer", "can't. i'll be out driving my mustang",
+ "driving my mustang", ""));
+ everyone.push_back(
+ Credit("Marcello Federico", "federico at itc at it",
+ "Researcher at ITC-irst, Trento, Italy", "IRST language model"));
+ everyone.push_back(
+ Credit("Evan Herbst", "Small college in upstate New York", "", ""));
+ everyone.push_back(
+ Credit("Philipp Koehn", "only between 2 and 4am", "",
+ "Nothing fazes this dude"));
+ everyone.push_back(
+ Credit("Christine Moran", "weird building at MIT", "", ""));
+ everyone.push_back(
+ Credit("Wade Shen", "via morse code", "buying another laptop", ""));
+ everyone.push_back(
+ Credit("Richard Zens", "richard at aachen dot de", "",
+ "ambiguous source input, confusion networks, confusing source code"));
+ everyone.push_back(
+ Credit("Hieu Hoang", "http://www.hoang.co.uk/hieu/",
+ "phd student at Edinburgh Uni. Original Moses developer",
+ "general queries/ flames on Moses."));
+
+ sort(everyone.begin(), everyone.end()); // orders by Credit::operator<, i.e. by each entry's sortId
+
+ cerr
+ << "Moses - A beam search decoder for phrase-based statistical machine translation models"
+ << endl << "Copyright (C) 2006 University of Edinburgh" << endl << endl
+
+ << "This library is free software; you can redistribute it and/or" << endl
+ << "modify it under the terms of the GNU Lesser General Public" << endl
+ << "License as published by the Free Software Foundation; either" << endl
+ << "version 2.1 of the License, or (at your option) any later version."
+ << endl << endl
+
+ << "This library is distributed in the hope that it will be useful,"
+ << endl
+ << "but WITHOUT ANY WARRANTY; without even the implied warranty of"
+ << endl
+ << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU"
+ << endl << "Lesser General Public License for more details." << endl
+ << endl
+
+ << "You should have received a copy of the GNU Lesser General Public"
+ << endl
+ << "License along with this library; if not, write to the Free Software"
+ << endl
+ << "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA"
+ << endl << endl
+ << "***********************************************************************"
+ << endl << endl << "Built on " << __DATE__ << " at " __TIME__ << endl // " at " and __TIME__ are adjacent string literals and are concatenated
+ << endl << "WHO'S FAULT IS THIS GODDAM SOFTWARE:" << endl;
+
+ ostream_iterator<Credit> out(cerr, "\n"); // one credit per line
+ copy(everyone.begin(), everyone.end(), out);
+ cerr << endl << endl;
+}
+
+/** Replace the values of a parameter, reporting the change on std::cerr.
+ *  A parameter that currently holds more than one value must receive the same
+ *  number of values (otherwise UTIL_THROW_IF2 raises); in every other case the
+ *  stored list is resized to fit.
+ * \param paramName full name of parameter
+ * \param values new values for paramName */
+void Parameter::OverwriteParam(const string &paramName, PARAM_VEC values)
+{
+  cerr << "Overwriting parameter " << paramName;
+
+  // defines the parameter, important for boolean switches
+  PARAM_VEC &current = m_setting[paramName];
+  if (current.size() <= 1) {
+    cerr << " (the parameter does not have previous values)";
+    current.resize(values.size());
+  }
+  else {
+    cerr << " (the parameter had " << current.size()
+         << " previous values)";
+    UTIL_THROW_IF2(current.size() != values.size(),
+                   "Number of weight override for " << paramName << " is not the same as the original number of weights");
+  }
+
+  cerr << " with the following values:";
+  for (size_t i = 0; i < values.size(); ++i) {
+    current[i] = values[i];
+    cerr << " " << values[i];
+  }
+  cerr << std::endl;
+}
+
+/** Return the names of all features that have an entry in m_weights. */
+std::set<std::string> Parameter::GetWeightNames() const
+{
+  typedef std::map<std::string, std::vector<float> > WeightMap;
+
+  std::set<std::string> names;
+  for (WeightMap::const_iterator it = m_weights.begin();
+       it != m_weights.end(); ++it) {
+    names.insert(it->first);
+  }
+  return names;
+}
+
+/** Write all settings to a file in the "[name]" + one-value-per-line layout,
+ *  with an empty line after each section.
+ *  \param path file to write; whether opening succeeded is not checked */
+void Parameter::Save(const std::string path)
+{
+  ofstream file(path.c_str());
+
+  for (PARAM_MAP::const_iterator section = m_setting.begin();
+       section != m_setting.end(); ++section) {
+    file << "[" << section->first << "]" << endl;
+
+    const PARAM_VEC &values = section->second;
+    for (size_t i = 0; i < values.size(); ++i) {
+      file << values[i] << endl;
+    }
+
+    file << endl;
+  }
+
+  file.close();
+}
+
+/** Boolean specialisation of SetParameter.
+ *  - parameter absent                  -> defaultValue
+ *  - present without a value           -> true  (e.g. a bare "-parameter")
+ *  - present with exactly one value    -> that value, parsed by Scan<bool>
+ *  - present with more than one value  -> defaultValue */
+template<>
+void Parameter::SetParameter<bool>(bool &parameter,
+                                   std::string const& parameterName, bool const& defaultValue) const
+{
+  const PARAM_VEC *params = GetParam(parameterName);
+
+  // default value if nothing is specified
+  parameter = defaultValue;
+  if (params == NULL) return;
+
+  switch (params->size()) {
+  case 0:
+    parameter = true;
+    break;
+  case 1:
+    parameter = Scan<bool>(params->at(0));
+    break;
+  default:
+    break;
+  }
+}
+
+/** Convenience overload: read a boolean parameter with a default of false. */
+void Parameter::SetParameter(bool& var, std::string const& name)
+{
+  const bool defaultValue = false;
+  SetParameter<bool>(var, name, defaultValue);
+}
+
+}
+
diff --git a/moses2/legacy/Parameter.h b/moses2/legacy/Parameter.h
new file mode 100644
index 000000000..f43ce98a4
--- /dev/null
+++ b/moses2/legacy/Parameter.h
@@ -0,0 +1,176 @@
+/// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include <boost/program_options.hpp>
+#include "Util2.h"
+
+namespace Moses2
+{
+
+typedef std::vector<std::string> PARAM_VEC; // the values given for one parameter
+typedef std::map<std::string, PARAM_VEC> PARAM_MAP; // parameter name -> its values
+typedef std::map<std::string, bool> PARAM_BOOL; // name -> flag; type of Parameter::m_valid
+typedef std::map<std::string, std::string> PARAM_STRING; // name -> text; type of m_abbreviation, m_description and m_fullname
+
+/** Handles parameter values set in config file or on command line.
+ * Process raw parameter data (names and values as strings) for StaticData
+ * to parse; to get useful values, see StaticData.
+ *
+ * Values are kept as strings in m_setting; feature weights are additionally
+ * kept as floats in m_weights.
+ */
+class Parameter
+{
+ typedef boost::program_options::options_description options_description;
+ typedef boost::program_options::value_semantic value_semantic;
+protected:
+ PARAM_MAP m_setting; // parameter name -> raw string values
+ PARAM_BOOL m_valid; // known parameter names; Validate() reports any m_setting key that is missing here
+ PARAM_STRING m_abbreviation;
+ PARAM_STRING m_description;
+ PARAM_STRING m_fullname;
+ // std::map<char,std::set<std::string> > m_confusable;
+ // stores long parameter names that start with a letter that is also a short option.
+ options_description m_options;
+
+ std::map<std::string, std::vector<float> > m_weights; // feature name -> weights
+
+ std::string FindParam(const std::string &paramSwitch, int argc, char* argv[]);
+ void OverwriteParam(const std::string &paramSwitch,
+ const std::string &paramName, int argc, char* argv[]);
+ bool ReadConfigFile(const std::string &filePath);
+ bool FilesExist(const std::string &paramName, int fieldNo,
+ std::vector<std::string> const& fileExtension = std::vector<std::string>(
+ 1, "")); // default: try the path exactly as given, with no extension
+ bool isOption(const char* token);
+ bool Validate();
+
+ void
+ AddParam(options_description& optgroup, value_semantic const* optvalue,
+ std::string const& paramName, std::string const& description);
+
+ void
+ AddParam(options_description& optgroup, std::string const &paramName,
+ std::string const &description);
+
+ void
+ AddParam(options_description& optgroup, value_semantic const* optvalue,
+ std::string const& paramName, std::string const& abbrevName,
+ std::string const& description);
+
+ void
+ AddParam(options_description& optgroup, std::string const& paramName,
+ std::string const& abbrevName, std::string const& description);
+
+ void PrintCredit();
+
+ void SetWeight(const std::string &name, size_t ind, float weight);
+ void SetWeight(const std::string &name, size_t ind,
+ const std::vector<float> &weights);
+ void AddWeight(const std::string &name, size_t ind,
+ const std::vector<float> &weights);
+ void ConvertWeightArgs();
+ void ConvertWeightArgsSingleWeight(const std::string &oldWeightName,
+ const std::string &newWeightName);
+ void ConvertWeightArgsPhraseModel(const std::string &oldWeightName);
+ void ConvertWeightArgsLM();
+ void ConvertWeightArgsDistortion();
+ void ConvertWeightArgsGeneration(const std::string &oldWeightName,
+ const std::string &newWeightName);
+ void ConvertWeightArgsPhrasePenalty();
+ void ConvertWeightArgsWordPenalty();
+ void ConvertPhrasePenalty();
+ void CreateWeightsMap();
+ void CreateWeightsMap(const PARAM_VEC &vec);
+ void WeightOverwrite();
+ void AddFeature(const std::string &line);
+ void AddFeaturesCmd();
+
+public:
+ Parameter();
+ ~Parameter();
+ bool LoadParam(int argc, char* argv[]);
+ bool LoadParam(const std::string &filePath);
+ void Explain();
+
+ /** return a vector of strings holding the whitespace-delimited values on the ini-file line corresponding to the given parameter name */
+ const PARAM_VEC *GetParam(const std::string &paramName) const;
+
+ /** check if parameter is defined (either in moses.ini or as switch) */
+ bool isParamSpecified(const std::string &paramName) const
+ {
+ return m_setting.find(paramName) != m_setting.end(); // true for any key present in m_setting, even one with an empty value list
+ }
+
+ void OverwriteParam(const std::string &paramName, PARAM_VEC values);
+
+ std::vector<float> GetWeights(const std::string &name);
+ const std::map<std::string, std::vector<float> > &GetAllWeights() const
+ {
+ return m_weights;
+ }
+ std::set<std::string> GetWeightNames() const;
+
+ const PARAM_MAP &GetParams() const
+ {
+ return m_setting;
+ }
+
+ void Save(const std::string path);
+
+ template<typename T>
+ void SetParameter(T &var, const std::string &name,
+ const T &defaultValue) const
+ { // var receives the first value of the parameter converted with Scan<T>, or defaultValue when GetParam() yields NULL or an empty list
+ const PARAM_VEC *params = GetParam(name);
+ if (params && params->size()) {
+ var = Scan<T>(params->at(0));
+ }
+ else {
+ var = defaultValue;
+ }
+ }
+
+ void SetParameter(bool& var, std::string const& name);
+
+ bool SetBooleanSwitch(bool& val, std::string const name)
+ {
+ // issues a warning if format is wrong
+ // val becomes true when the parameter has at least one value; the return
+ // value is false only when it has more than one
+ const PARAM_VEC *params = GetParam(name);
+ val = (params && params->size());
+ if (val && params->size() != 1) {
+ std::cerr << "ERROR: wrong format for switch -" << name;
+ return false;
+ }
+ return true;
+ }
+
+};
+
+template<> // explicit specialisation for bool, declared here so that it is used instead of the generic template
+void Parameter::SetParameter<bool>(bool &var, const std::string &name,
+ const bool &defaultValue) const;
+
+}
+
diff --git a/moses2/legacy/Range.cpp b/moses2/legacy/Range.cpp
new file mode 100644
index 000000000..7186e4265
--- /dev/null
+++ b/moses2/legacy/Range.cpp
@@ -0,0 +1,32 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "Range.h"
+
+namespace Moses2
+{
+
+/** Write a range as "[start..end]". */
+std::ostream& operator <<(std::ostream& out, const Range& range)
+{
+  return out << "[" << range.m_startPos << ".." << range.m_endPos << "]";
+}
+
+}
+
diff --git a/moses2/legacy/Range.h b/moses2/legacy/Range.h
new file mode 100644
index 000000000..76d720bed
--- /dev/null
+++ b/moses2/legacy/Range.h
@@ -0,0 +1,123 @@
+// $Id$
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <cassert>
+#include <iostream>
+#include <boost/functional/hash.hpp>
+#include "Util2.h"
+#include "util/exception.hh"
+
+#ifdef WIN32
+#undef max
+#endif
+
+namespace Moses2
+{
+
+/***
+ * Efficient version of Bitmap for contiguous ranges.
+ * Holds a start and an end word position; the end position is inclusive.
+ * The pair (NOT_FOUND, NOT_FOUND) represents the empty range.
+ */
+class Range
+{
+  friend std::ostream& operator <<(std::ostream& out, const Range& range);
+
+  // m_endPos is inclusive
+  size_t m_startPos, m_endPos;
+public:
+  //! creates the empty range, so that the positions are never read uninitialised
+  inline explicit Range() :
+    m_startPos(NOT_FOUND), m_endPos(NOT_FOUND)
+  {
+  }
+  inline Range(size_t startPos, size_t endPos) :
+    m_startPos(startPos), m_endPos(endPos)
+  {
+  }
+  inline Range(const Range &copy) :
+    m_startPos(copy.GetStartPos()), m_endPos(copy.GetEndPos())
+  {
+  }
+
+  inline size_t GetStartPos() const
+  {
+    return m_startPos;
+  }
+  inline size_t GetEndPos() const
+  {
+    return m_endPos;
+  }
+
+  inline void SetStartPos(size_t val)
+  {
+    m_startPos = val;
+  }
+  inline void SetEndPos(size_t val)
+  {
+    m_endPos = val;
+  }
+
+  //! count of words translated; 0 for the empty range
+  inline size_t GetNumWordsCovered() const
+  {
+    // either both positions are NOT_FOUND or neither is
+    assert(
+      (m_startPos == NOT_FOUND && m_endPos == NOT_FOUND) || (m_startPos != NOT_FOUND && m_endPos != NOT_FOUND));
+    return (m_startPos == NOT_FOUND) ? 0 : m_endPos - m_startPos + 1;
+  }
+
+  //! transitive comparison: by start position, then by end position
+  inline bool operator<(const Range& x) const {
+    return (m_startPos<x.m_startPos
+            || (m_startPos==x.m_startPos && m_endPos<x.m_endPos));
+  }
+
+  // equality operator
+  inline bool operator==(const Range& x) const {
+    return (m_startPos==x.m_startPos && m_endPos==x.m_endPos);
+  }
+  // Whether two word ranges overlap or not
+  inline bool Overlap(const Range& x) const {
+
+    if ( x.m_endPos < m_startPos || x.m_startPos > m_endPos) return false;
+
+    return true;
+  }
+
+  //! number of positions lying strictly between two ranges;
+  //! raises via UTIL_THROW_IF2 if the ranges overlap
+  inline size_t GetNumWordsBetween(const Range& x) const {
+    UTIL_THROW_IF2(Overlap(x), "Overlapping ranges");
+
+    if (x.m_endPos < m_startPos) {
+      return m_startPos - x.m_endPos - 1;
+    }
+
+    return x.m_startPos - m_endPos - 1;
+  }
+
+};
+
+/** Hash a Range from its start and end positions. */
+inline size_t hash_value(const Range& range)
+{
+  size_t hash = range.GetStartPos();
+  boost::hash_combine(hash, range.GetEndPos());
+  return hash;
+}
+
+}
+
diff --git a/moses2/legacy/ThreadPool.cpp b/moses2/legacy/ThreadPool.cpp
new file mode 100644
index 000000000..3e159020b
--- /dev/null
+++ b/moses2/legacy/ThreadPool.cpp
@@ -0,0 +1,150 @@
+// $Id: ThreadPool.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+#include <stdio.h>
+#include <pthread.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "ThreadPool.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+#define handle_error_en(en, msg) /* store en in errno, report msg via perror(), then exit(EXIT_FAILURE) */ \
+ do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)
+
+ThreadPool::ThreadPool(size_t numThreads, int cpuAffinityOffset,
+ int cpuAffinityIncr) :
+ m_stopped(false), m_stopping(false), m_queueLimit(0)
+{ // Starts numThreads worker threads, each running Execute(); when built with __linux defined, a non-negative cpuAffinityOffset also pins each worker to one CPU.
+ size_t numCPU = sysconf(_SC_NPROCESSORS_ONLN); // number of processors currently online
+ int cpuInd = cpuAffinityOffset % numCPU; // CPU for the first worker; only used when cpuAffinityOffset >= 0 (a negative offset is converted to unsigned before the modulo)
+
+ for (size_t i = 0; i < numThreads; ++i) {
+ boost::thread *thread = m_threads.create_thread(
+ boost::bind(&ThreadPool::Execute, this));
+
+#ifdef __linux
+ if (cpuAffinityOffset >= 0) {
+ int s;
+
+ boost::thread::native_handle_type handle = thread->native_handle();
+
+ //cerr << "numCPU=" << numCPU << endl;
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+
+ CPU_SET(cpuInd, &cpuset); // this worker may run on cpuInd only
+ cpuInd += cpuAffinityIncr; // the next worker gets the next CPU ...
+ cpuInd = cpuInd % numCPU; // ... wrapping around at the number of CPUs
+
+ s = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset);
+ if (s != 0) {
+ handle_error_en(s, "pthread_setaffinity_np"); // prints the error and terminates the process
+ //cerr << "affinity error with thread " << i << endl;
+ }
+
+ // get affinity
+ CPU_ZERO(&cpuset);
+ s = pthread_getaffinity_np(handle, sizeof(cpu_set_t), &cpuset); // NOTE(review): the return value is not checked
+ cerr << "Set returned by pthread_getaffinity_np() contained:\n";
+ for (int j = 0; j < CPU_SETSIZE; j++) {
+ if (CPU_ISSET(j, &cpuset)) {
+ cerr << " CPU " << j << "\n";
+ }
+ }
+ }
+#endif
+ }
+}
+
+/** Worker loop run by every pool thread: repeatedly take a task from the
+ *  queue and run it, until the pool has been stopped. */
+void ThreadPool::Execute()
+{
+  // m_stopped is written under m_mutex in Stop(). It is therefore only read
+  // while the mutex is held; the loop condition uses this local copy.
+  bool stopped = false;
+  do {
+    boost::shared_ptr<Task> task;
+    {
+      // Find a job to perform
+      boost::mutex::scoped_lock lock(m_mutex);
+      if (m_tasks.empty() && !m_stopped) {
+        // a wake-up without work is harmless: no task is taken below and the
+        // enclosing loop comes back here
+        m_threadNeeded.wait(lock);
+      }
+      if (!m_stopped && !m_tasks.empty()) {
+        task = m_tasks.front();
+        m_tasks.pop();
+      }
+      stopped = m_stopped;
+    }
+    //Execute job
+    if (task) {
+      // must read from task before run. otherwise task may be deleted by main thread
+      // race condition
+      task->DeleteAfterExecution();
+      task->Run();
+    }
+    // tell Submit() and Stop() that this worker is free again
+    m_threadAvailable.notify_all();
+  }
+  while (!stopped);
+}
+
+/** Queue a task for execution and wake the workers.
+ *  Blocks while a queue limit is set and the queue holds that many tasks.
+ *  Throws runtime_error once Stop() has been requested. */
+void ThreadPool::Submit(boost::shared_ptr<Task> task)
+{
+  boost::mutex::scoped_lock lock(m_mutex);
+  if (m_stopping) {
+    throw runtime_error("ThreadPool stopping - unable to accept new jobs");
+  }
+
+  // a limit of 0 means the queue is unbounded
+  for (;;) {
+    const bool queueFull =
+      (m_queueLimit > 0) && (m_tasks.size() >= m_queueLimit);
+    if (!queueFull) break;
+    m_threadAvailable.wait(lock);
+  }
+
+  m_tasks.push(task);
+  m_threadNeeded.notify_all();
+}
+
+void ThreadPool::Stop(bool processRemainingJobs)
+{ // Shuts the pool down and joins every worker thread; returns immediately if the pool was already stopped.
+ {
+ //prevent more jobs from being added to the queue
+ boost::mutex::scoped_lock lock(m_mutex);
+ if (m_stopped) return;
+ m_stopping = true; // Submit() throws from now on
+ }
+ if (processRemainingJobs) {
+ boost::mutex::scoped_lock lock(m_mutex);
+ //wait for queue to drain.
+ while (!m_tasks.empty() && !m_stopped) {
+ m_threadAvailable.wait(lock); // workers signal this at the end of every loop iteration
+ }
+ }
+ //tell all threads to stop
+ {
+ boost::mutex::scoped_lock lock(m_mutex);
+ m_stopped = true; // workers take no further tasks once this is set, so anything still queued is not run
+ }
+ m_threadNeeded.notify_all(); // wake idle workers so that they observe m_stopped
+
+ m_threads.join_all();
+}
+
+}
+
diff --git a/moses2/legacy/ThreadPool.h b/moses2/legacy/ThreadPool.h
new file mode 100644
index 000000000..62a8f43ad
--- /dev/null
+++ b/moses2/legacy/ThreadPool.h
@@ -0,0 +1,140 @@
+// $Id: ThreadPool.h 3045 2010-04-05 13:07:29Z hieuhoang1972 $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <iostream>
+#include <queue>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+
+#ifdef WITH_THREADS
+#include <boost/bind.hpp>
+#include <boost/thread.hpp>
+#endif
+
+#ifdef BOOST_HAS_PTHREADS
+#include <pthread.h>
+#endif
+
+//#include "Util.h"
+
+namespace Moses2
+{
+
+/**
+ * Classes to implement a ThreadPool.
+ **/
+
+/** A task to be executed by the ThreadPool
+ */
+class Task
+{
+public:
+  //! the work to perform
+  virtual void Run() = 0;
+
+  //! called by the pool just before Run(); the base implementation returns true
+  virtual bool DeleteAfterExecution() { return true; }
+
+  virtual ~Task() {}
+};
+
+class ThreadPool
+{
+public:
+ /**
+ * Construct a thread pool of a fixed size.
+ * A non-negative cpuAffinityOffset requests CPU pinning of the workers,
+ * advancing by cpuAffinityIncr from one worker to the next.
+ **/
+ explicit ThreadPool(size_t numThreads, int cpuAffinityOffset = -1,
+ int cpuAffinityIncr = 1);
+
+ ~ThreadPool()
+ {
+ Stop(); // default argument: tasks still waiting in the queue are not run
+ }
+
+ /**
+ * Add a job to the threadpool.
+ **/
+ void Submit(boost::shared_ptr<Task> task);
+
+ /**
+ * Shut down the ThreadPool and join its threads.
+ * If processRemainingJobs is true, first wait until the queue has
+ * drained; otherwise (the default) tasks still in the queue are not run.
+ **/
+ void Stop(bool processRemainingJobs = false);
+
+ /**
+ * Set maximum number of queued tasks (Submit blocks while the queue is
+ * that full). 0, the initial value, means no limit.
+ * NOTE(review): the limit is written without holding m_mutex.
+ **/
+ void SetQueueLimit(size_t limit)
+ {
+ m_queueLimit = limit;
+ }
+
+private:
+ /**
+ * The main loop executed by each thread.
+ **/
+ void Execute();
+
+ std::queue<boost::shared_ptr<Task> > m_tasks;
+ boost::thread_group m_threads;
+ boost::mutex m_mutex;
+ boost::condition_variable m_threadNeeded; // signalled when a task is queued or the pool stops
+ boost::condition_variable m_threadAvailable; // signalled by a worker at the end of each loop iteration
+ bool m_stopped; // workers exit once this is set
+ bool m_stopping; // Submit() refuses new tasks once this is set
+ size_t m_queueLimit;
+};
+
+/** Trivial task that reports its id and the executing thread on std::cerr. */
+class TestTask: public Task
+{
+  int m_id;
+
+public:
+  TestTask(int id) : m_id(id) {}
+
+  virtual ~TestTask() {}
+
+  virtual void Run()
+  {
+    // without pthreads there is no thread id to show, so a null value is printed
+#ifdef BOOST_HAS_PTHREADS
+    pthread_t tid = pthread_self();
+#else
+    typedef void * pthread_t;
+    pthread_t tid = 0;
+#endif
+    std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl;
+  }
+};
+
+}
+
diff --git a/moses2/legacy/Timer.cpp b/moses2/legacy/Timer.cpp
new file mode 100644
index 000000000..b1857ee0d
--- /dev/null
+++ b/moses2/legacy/Timer.cpp
@@ -0,0 +1,104 @@
+#include <iostream>
+#include <iomanip>
+#include "Timer.h"
+
+#include "util/usage.hh"
+
+namespace Moses2
+{
+
+/** Create a timer that is neither running nor stopped.
+ *  Both timestamps start at 0 so that no member is left uninitialised. */
+Timer::Timer() :
+  running(false), stopped(false), start_time(0), stop_time(0)
+{
+}
+
+/***
+ * Return the wall time accumulated by the timer: up to the moment it was
+ * stopped, or up to now if it is still running. A timer that was never
+ * started reports 0.
+ */
+double Timer::get_elapsed_time() const
+{
+  if (!stopped && !running) return 0;
+
+  const double end = stopped ? stop_time : util::WallTime();
+  return end - start_time;
+}
+
+/***
+ * Start a timer, or resume it after stop(). If it is already running, let
+ * it continue running.
+ * Print an optional message.
+ */
+void Timer::start(const char* msg)
+{
+  // Print an optional message, something like "Starting timer t";
+  if (msg) std::cerr << msg << std::endl;
+
+  // Nothing to do if the timer is already running
+  const bool activelyRunning = running && !stopped;
+  if (activelyRunning) return;
+
+  const double now = util::WallTime();
+  if (!stopped) {
+    // first start
+    start_time = now;
+    running = true;
+    return;
+  }
+
+  // resume: shift the start time so that the time accumulated so far is kept
+  start_time = now - (stop_time - start_time);
+  stopped = false;
+}
+
+/***
+ * Stop a timer. Has no effect on a timer that is not running or is
+ * already stopped.
+ * Print an optional message.
+ */
+void Timer::stop(const char* msg)
+{
+  // Print an optional message, something like "Stopping timer t";
+  if (msg) std::cerr << msg << std::endl;
+
+  if (running && !stopped) {
+    // Record the stop time and mark the timer as stopped
+    stop_time = util::WallTime();
+    stopped = true;
+  }
+}
+
+/***
+ * Print out an optional message followed by the current timer timing
+ * (0 for a timer that was never started).
+ */
+void Timer::check(const char* msg)
+{
+  // Print an optional message, something like "Checking timer t";
+  if (msg) std::cerr << msg << " : ";
+
+  const double seconds = running ? get_elapsed_time() : 0;
+  std::cerr << "[" << seconds << "] seconds\n";
+}
+
+/***
+ * Allow timers to be printed to ostreams using the syntax 'os << t'
+ * for an ostream 'os' and a timer 't'. For example, "cout << t" will
+ * print out the total amount of time 't' has been "running"
+ * (0 for a timer that was never started).
+ */
+std::ostream& operator<<(std::ostream& os, Timer& t)
+{
+  const double seconds = t.running ? t.get_elapsed_time() : 0;
+  return os << seconds;
+}
+
+}
+
diff --git a/moses2/legacy/Timer.h b/moses2/legacy/Timer.h
new file mode 100644
index 000000000..3f44ef4b9
--- /dev/null
+++ b/moses2/legacy/Timer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <ctime>
+#include <iostream>
+#include <iomanip>
+
+namespace Moses2
+{
+
+/** Stopwatch for timing how long things have been running according to
+ * walltime; time points are stored as doubles. We avoid CPU time since it
+ * is less reliable in a multi-threaded environment and can spuriously
+ * include clock cycles used by other threads in the same process.
+ */
+class Timer
+{
+ // operator<< reads the private 'running' flag, hence the friendship
+ friend std::ostream& operator<<(std::ostream& os, Timer& t);
+
+private:
+ bool running; // set to true by the first start()
+ bool stopped; // set by stop(), cleared when start() resumes the timer
+ double start_time; // wall time at start; shifted forward when resuming after a stop
+ double stop_time; // wall time recorded by the most recent stop()
+
+public:
+ /***
+ * 'running' is initially false. A timer needs to be explicitly started
+ * using 'start'
+ */
+ Timer();
+
+ // start/stop/check print 'msg' to stderr when it is non-null
+ void start(const char* msg = 0);
+ void stop(const char* msg = 0);
+ void check(const char* msg = 0);
+ // NOTE(review): definition not visible here; presumably the seconds accumulated while running - confirm
+ double get_elapsed_time() const;
+};
+
+}
+
diff --git a/moses2/legacy/Util2.cpp b/moses2/legacy/Util2.cpp
new file mode 100644
index 000000000..ffc348090
--- /dev/null
+++ b/moses2/legacy/Util2.cpp
@@ -0,0 +1,29 @@
+#include "Util2.h"
+#include "util/exception.hh"
+
+namespace Moses2
+{
+
+//! Raised by Scan<bool> when the text is not one of the accepted boolean spellings.
+class BoolValueException : public util::Exception {};
+
+//! Parse a boolean from text, ignoring case.
+//! yes / y / true / 1 give true, no / n / false / 0 give false; any other
+//! input raises BoolValueException through UTIL_THROW.
+template<>
+bool Scan<bool>(const std::string &input)
+{
+  const std::string lowered = ToLower(input);
+
+  const bool saysTrue = lowered == "yes" || lowered == "y" || lowered == "true"
+                        || lowered == "1";
+  if (saysTrue) return true;
+
+  const bool saysFalse = lowered == "no" || lowered == "n" || lowered == "false"
+                         || lowered == "0";
+  if (saysFalse) return false;
+
+  UTIL_THROW(BoolValueException,
+             "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
+}
+
+//! Return a copy of 'str' in which every character has been passed through
+//! std::tolower (current C locale).
+const std::string ToLower(const std::string& str)
+{
+  std::string lc(str);
+  for (std::string::size_type i = 0; i < lc.size(); ++i) {
+    // std::tolower requires a value representable as unsigned char (or EOF);
+    // a plain char can be negative for non-ASCII bytes, which is undefined.
+    lc[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(lc[i])));
+  }
+  return lc;
+}
+
+}
+
diff --git a/moses2/legacy/Util2.h b/moses2/legacy/Util2.h
new file mode 100644
index 000000000..eef638f93
--- /dev/null
+++ b/moses2/legacy/Util2.h
@@ -0,0 +1,351 @@
+#pragma once
+
+#include <boost/thread.hpp>
+#include <boost/thread/mutex.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <limits>
+#include <sstream>
+#include <vector>
+#include <queue>
+#include <cmath>
+#include <stdlib.h>
+#include "../TypeDef.h"
+#include "util/exception.hh"
+
+namespace Moses2
+{
+
+/** Combined hash and equality functor (presumably meant as the hasher and
+ *  key-equality arguments of unordered containers - confirm against users).
+ *  The one-argument calls return the object's hash(); the two-argument calls
+ *  compare with operator==. The pointer overloads apply the same operations
+ *  to the pointed-to objects, so pointers must not be null.
+ */
+template<typename T>
+class UnorderedComparer
+{
+public:
+  // objects passed by reference
+  size_t operator()(const T& obj) const { return obj.hash(); }
+  bool operator()(const T& a, const T& b) const { return a == b; }
+
+  // objects passed by pointer: dereference, then hash / compare the pointees
+  size_t operator()(const T* obj) const { return (*obj).hash(); }
+  bool operator()(const T* a, const T* b) const { return *a == *b; }
+};
+
+//! Assign 'val' to each of the first 'size' elements of 'arr'.
+template<typename T>
+void Init(T arr[], size_t size, const T &val)
+{
+  T *const end = arr + size;
+  for (T *cur = arr; cur != end; ++cur) {
+    *cur = val;
+  }
+}
+
+//! Return a copy of 'str' without the leading and trailing characters that
+//! appear in 'dropChars' (by default space, tab, newline and carriage return).
+inline std::string Trim(const std::string& str, const std::string dropChars =
+                          " \t\n\r")
+{
+  // Length to keep: up to and including the last character not in dropChars.
+  // When there is no such character find_last_not_of yields npos, and
+  // npos + 1 wraps to 0, which leaves an empty string.
+  const std::string::size_type keepLen = str.find_last_not_of(dropChars) + 1;
+  std::string res(str, 0, keepLen);
+
+  // Remove the leading run; on an empty string this erases nothing.
+  const std::string::size_type firstKept = res.find_first_not_of(dropChars);
+  res.erase(0, firstKept);
+  return res;
+}
+
+//! Convert a string to a value of type T with stream extraction
+//! (operator>>). Used for reading floats, ints etc. from files.
+//! The result starts out value-initialised, so when nothing can be extracted
+//! (for example empty or blank input) a defined T() is returned instead of
+//! an indeterminate value.
+template<typename T>
+inline T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  // Value-initialise: a failed extraction may leave 'ret' untouched.
+  T ret = T();
+  stream >> ret;
+  return ret;
+}
+
+//! std::string needs no conversion: the input is handed back unchanged
+//! (and is therefore not cut at the first whitespace, as stream extraction
+//! into a string would do).
+template<>
+inline std::string Scan<std::string>(const std::string &input)
+{
+  return std::string(input);
+}
+
+//! SCORE values are parsed with atof() instead of going through a stringstream.
+template<>
+inline SCORE Scan<SCORE>(const std::string &input)
+{
+  return atof(input.c_str());
+}
+
+//! Specialisation to understand yes/no y/n true/false 0/1
+//! Only declared here: the specialisation is not inline, its definition is
+//! in Util2.cpp.
+template<>
+bool Scan<bool>(const std::string &input);
+
+//! Read 'input' as a size_t and convert the number to S2TParsingAlgorithm.
+//! The number is not range-checked.
+template<>
+inline S2TParsingAlgorithm Scan<S2TParsingAlgorithm>(const std::string &input)
+{
+  const size_t numeric = Scan<size_t>(input);
+  return static_cast<S2TParsingAlgorithm>(numeric);
+}
+
+//! Read 'input' as a size_t and convert the number to SourceLabelOverlap.
+//! The number is not range-checked.
+template<>
+inline SourceLabelOverlap Scan<SourceLabelOverlap>(const std::string &input)
+{
+  const size_t numeric = Scan<size_t>(input);
+  return static_cast<SourceLabelOverlap>(numeric);
+}
+
+//! Read 'input' as a size_t and convert the number to SearchAlgorithm.
+//! The number is not range-checked.
+template<>
+inline SearchAlgorithm Scan<SearchAlgorithm>(const std::string &input)
+{
+  const size_t numeric = Scan<size_t>(input);
+  return static_cast<SearchAlgorithm>(numeric);
+}
+
+//! Map the textual XML input mode ("exclusive", "inclusive", "constraint",
+//! "ignore", "pass-through") to its XmlInputType value. The comparison is
+//! exact and case-sensitive; any other text raises an exception through
+//! UTIL_THROW2.
+template<>
+inline XmlInputType Scan<XmlInputType>(const std::string &input)
+{
+  if (input == "exclusive") return XmlExclusive;
+  if (input == "inclusive") return XmlInclusive;
+  if (input == "constraint") return XmlConstraint;
+  if (input == "ignore") return XmlIgnore;
+  if (input == "pass-through") return XmlPassThrough;
+
+  UTIL_THROW2("Unknown XML input type");
+}
+
+//! Read 'input' as a size_t and convert the number to InputTypeEnum.
+//! The number is not range-checked.
+template<>
+inline InputTypeEnum Scan<InputTypeEnum>(const std::string &input)
+{
+  const size_t numeric = Scan<size_t>(input);
+  return static_cast<InputTypeEnum>(numeric);
+}
+
+//! Read 'input' as a size_t and convert the number to WordAlignmentSort.
+//! The number is not range-checked.
+template<>
+inline WordAlignmentSort Scan<WordAlignmentSort>(const std::string &input)
+{
+  const size_t numeric = Scan<size_t>(input);
+  return static_cast<WordAlignmentSort>(numeric);
+}
+
+//! Convert every string of 'input' with Scan<T> and return the results in
+//! a new vector of the same length and order.
+template<typename T>
+inline std::vector<T> Scan(const std::vector<std::string> &input)
+{
+  const size_t count = input.size();
+  std::vector<T> result(count);
+  for (size_t idx = 0; idx != count; ++idx) {
+    result[idx] = Scan<T>(input[idx]);
+  }
+  return result;
+}
+
+//! Variant of the vector conversion that writes into a caller-supplied
+//! vector: 'output' is resized to the length of 'input' and every element
+//! is overwritten with Scan<T> of the corresponding string.
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector<std::string> &input)
+{
+  const size_t count = input.size();
+  output.resize(count);
+  for (size_t idx = 0; idx != count; ++idx) {
+    output[idx] = Scan<T>(input[idx]);
+  }
+}
+
+/** Split 'str' into tokens. Every character of 'delimiters' is a
+    single-character separator (space and tab by default). Runs of
+    separators count as one and separators at either end are ignored, so
+    no empty tokens are produced.
+ */
+inline std::vector<std::string> Tokenize(const std::string& str,
+    const std::string& delimiters = " \t")
+{
+  std::vector<std::string> tokens;
+
+  // 'tokenStart' is the first character of the next token, npos when the
+  // rest of the string holds separators only.
+  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
+  while (tokenStart != std::string::npos) {
+    // The token runs up to the next separator, or to the end of the string
+    // (tokenEnd == npos makes substr take everything that is left).
+    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
+    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
+    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
+  }
+
+  return tokens;
+}
+
+//! Split 'input' with the string Tokenize and convert each token to T with Scan<T>.
+template<typename T>
+inline std::vector<T> Tokenize(const std::string &input,
+                               const std::string& delimiters = " \t")
+{
+  return Scan<T>(Tokenize(input, delimiters));
+}
+
+/** Split 'str' once, at the first character that occurs in 'delimiters'.
+ * Used by class FeatureFunction to parse key=value pairs, where the value
+ * may itself contain '='. The result holds two strings (the text before and
+ * after the separator, the separator itself dropped), or just 'str' when no
+ * separator is present.
+ */
+inline std::vector<std::string> TokenizeFirstOnly(const std::string& str,
+    const std::string& delimiters = " \t")
+{
+  std::vector<std::string> tokens;
+  const std::string::size_type split = str.find_first_of(delimiters);
+
+  if (split == std::string::npos) {
+    // no separator: the whole string is the only token
+    tokens.push_back(str);
+    return tokens;
+  }
+
+  tokens.push_back(str.substr(0, split));
+  tokens.push_back(str.substr(split + 1));
+  return tokens;
+}
+
+/** Split 'str' at every occurrence of the complete string 'separator'.
+ *  Unlike Tokenize, empty tokens are kept (adjacent separators, or a
+ *  separator at either end), so the result always holds at least one
+ *  element. Tokens are not trimmed. An empty separator yields 'str' as the
+ *  only token.
+ */
+inline std::vector<std::string> TokenizeMultiCharSeparator(
+  const std::string& str, const std::string& separator)
+{
+  std::vector<std::string> tokens;
+
+  // An empty separator matches at every position without consuming any
+  // text, so the search below would never advance.
+  if (separator.empty()) {
+    tokens.push_back(str);
+    return tokens;
+  }
+
+  std::string::size_type pos = 0;
+  std::string::size_type nextPos = str.find(separator, pos);
+
+  while (nextPos != std::string::npos) {
+    // text between the previous separator and this one
+    tokens.push_back(str.substr(pos, nextPos - pos));
+    // continue right behind the separator just found
+    pos = nextPos + separator.size();
+    nextPos = str.find(separator, pos);
+  }
+  // whatever follows the last separator (possibly nothing)
+  tokens.push_back(str.substr(pos));
+
+  return tokens;
+}
+
+/** Variant of the multi-character split that appends to a caller-supplied
+ *  vector. 'output' is not cleared first. Note that, unlike the returning
+ *  variant, every token is passed through Trim before it is stored. An
+ *  empty separator appends the trimmed 'str' as the only token.
+ */
+inline void TokenizeMultiCharSeparator(std::vector<std::string> &output,
+                                       const std::string& str, const std::string& separator)
+{
+  // An empty separator matches at every position without consuming any
+  // text, so the search below would never advance.
+  if (separator.empty()) {
+    output.push_back(Trim(str));
+    return;
+  }
+
+  std::string::size_type pos = 0;
+  std::string::size_type nextPos = str.find(separator, pos);
+
+  while (nextPos != std::string::npos) {
+    // text between the previous separator and this one
+    output.push_back(Trim(str.substr(pos, nextPos - pos)));
+    // continue right behind the separator just found
+    pos = nextPos + separator.size();
+    nextPos = str.find(separator, pos);
+  }
+  // whatever follows the last separator (possibly nothing)
+  output.push_back(Trim(str.substr(pos)));
+}
+
+//! Text form of any value that can be written to a stream with operator<<.
+template<typename T>
+inline std::string SPrint(const T &input)
+{
+  std::stringstream buffer;
+  buffer << input;
+  return buffer.str();
+}
+
+//! irst scores are log10 values; multiply by ln(10) to get natural log.
+inline float TransformLMScore(float irstScore)
+{
+  const float ln10 = 2.30258509299405f;
+  return irstScore * ln10;
+}
+
+//! Natural-log score of a probability.
+inline float TransformScore(float prob)
+{
+  const float logProb = log(prob);
+  return logProb;
+}
+
+//! Clamp a log score from below: values under LOWEST_SCORE become LOWEST_SCORE.
+inline float FloorScore(float logScore)
+{
+  return (logScore < LOWEST_SCORE) ? LOWEST_SCORE : logScore;
+}
+
+//! Inverse of TransformLMScore: natural-log score back to log10 (divide by ln(10)).
+inline float UntransformLMScore(float logNScore)
+{
+  const float ln10 = 2.30258509299405f;
+  return logNScore / ln10;
+}
+
+//! True when 'filePath' can be opened for reading. A file that exists but
+//! cannot be opened is therefore reported as missing.
+inline bool FileExists(const std::string& filePath)
+{
+  std::ifstream probe(filePath.c_str());
+  const bool opened = !probe.fail();
+  return opened;
+}
+
+//! lower-cased copy of 'str'; the definition is in Util2.cpp
+const std::string ToLower(const std::string& str);
+
+//! Call delete on every element of a collection of pointers (set, list,
+//! vector, ...) and then empty the collection.
+template<class COLL>
+void RemoveAllInColl(COLL &coll)
+{
+  typename COLL::const_iterator cur = coll.begin();
+  const typename COLL::const_iterator last = coll.end();
+  while (cur != last) {
+    delete (*cur);
+    ++cur;
+  }
+  coll.clear();
+}
+
+//! Exchange the values of 'a' and 'b' through a temporary copy.
+template<typename T>
+void Swap(T &a, T &b)
+{
+  // The temporary has to be a real copy: a reference to 'a' would see the
+  // assignment below, and the original value of 'a' would be lost.
+  T tmp = a;
+  a = b;
+  b = tmp;
+}
+
+//! Return the object that 'coll' holds for the calling thread. When the
+//! thread has none yet, a default-constructed T is created and handed over
+//! to 'coll' first.
+template<typename T>
+T &GetThreadSpecificObj(boost::thread_specific_ptr<T> &coll)
+{
+  T *obj = coll.get();
+  if (!obj) {
+    // first use on this thread: create the instance and let 'coll' own it
+    obj = new T;
+    coll.reset(obj);
+  }
+  assert(obj);
+  return *obj;
+}
+
+// Give access to the container underlying a std::priority_queue.
+// priority_queue keeps it in the protected member 'c'; a local struct
+// derived from the queue type may name that member and so can form the
+// pointer-to-member that is applied to 'q'.
+template<class T, class S, class C>
+S& Container(std::priority_queue<T, S, C>& q)
+{
+  struct Exposer: private std::priority_queue<T, S, C>
+  {
+    static S& Get(std::priority_queue<T, S, C>& queue)
+    {
+      return queue.*&Exposer::c;
+    }
+  };
+  return Exposer::Get(q);
+}
+
+// "file:line" of the place of use, for insertion into a stream (e.g. std::cerr << HERE).
+// The expansion is not parenthesised, so it is only usable as part of a << chain.
+#define HERE __FILE__ << ":" << __LINE__
+
+/** Enforce rounding: set the std::ios::fixed flag on 'stream' and a
+ *  precision of 'size' (3 by default). */
+inline void FixPrecision(std::ostream& stream, size_t size = 3)
+{
+  stream.precision(size);
+  stream.setf(std::ios::fixed);
+}
+
+}
+
diff --git a/moses2/legacy/gzfilebuf.h b/moses2/legacy/gzfilebuf.h
new file mode 100644
index 000000000..ea7021757
--- /dev/null
+++ b/moses2/legacy/gzfilebuf.h
@@ -0,0 +1,101 @@
+#ifndef moses_gzfile_buf_h
+#define moses_gzfile_buf_h
+
+#include <stdexcept>
+#include <streambuf>
+#include <zlib.h>
+#include <cstring>
+
+namespace Moses2
+{
+
+/** Read-only stream buffer that reads a gzip file through zlib.
+ * Unknown parentage.
+ * Layout of _buff: the first sizeof(int) bytes are the putback area, the
+ * rest receives the data delivered by gzread(). Writing and seeking are
+ * not supported and raise std::logic_error.
+ * @todo replace with boost version - output stream already uses it
+ */
+class gzfilebuf: public std::streambuf
+{
+public:
+  /** Open 'filename' for reading.
+   * @throws std::runtime_error when gzopen() fails
+   */
+  gzfilebuf(const char *filename)
+  {
+    _gzf = gzopen(filename, "rb");
+    if (!_gzf) throw std::runtime_error(
+        "Could not open " + std::string(filename) + ".");
+    // Empty get area located right behind the putback area, so that the
+    // first read goes through underflow().
+    setg(_buff + sizeof(int), // beginning of putback area
+         _buff + sizeof(int), // read position
+         _buff + sizeof(int)); // end position
+  }
+  ~gzfilebuf()
+  {
+    gzclose(_gzf);
+  }
+protected:
+  // Writing is not supported. A bare 'throw;' without an exception in
+  // flight would call std::terminate, so a real exception is thrown.
+  virtual int_type overflow(int_type /* c */)
+  {
+    throw std::logic_error("gzfilebuf is read-only: overflow() is not supported.");
+  }
+
+  // write multiple characters - not supported, see overflow()
+  virtual std::streamsize xsputn(const char* /* s */, std::streamsize /* num */)
+  {
+    throw std::logic_error("gzfilebuf is read-only: xsputn() is not supported.");
+  }
+
+  // seeking is not supported
+  virtual std::streampos seekpos(std::streampos /* sp */,
+                                 std::ios_base::openmode /* which = std::ios_base::in | std::ios_base::out */)
+  {
+    throw std::logic_error("gzfilebuf does not support seeking.");
+  }
+
+  //! refill the get area; returns the next character without consuming it
+  virtual int_type underflow()
+  {
+    // is read position before end of _buff?
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    /* process size of putback area
+     * - use number of characters read
+     * - but at most sizeof(int)
+     */
+    unsigned int numPutback = gptr() - eback();
+    if (numPutback > sizeof(int)) {
+      numPutback = sizeof(int);
+    }
+
+    /* copy the characters read last into the putback area
+     * (the first sizeof(int) bytes of _buff)
+     */
+    std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback,
+                 numPutback);
+
+    // read new characters
+    int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int));
+    if (num <= 0) {
+      // ERROR or EOF
+      return traits_type::eof();
+    }
+
+    // reset _buff pointers
+    setg(_buff + (sizeof(int) - numPutback), // beginning of putback area
+         _buff + sizeof(int), // read position
+         _buff + sizeof(int) + num); // end of buffer
+
+    // return next character
+    return traits_type::to_int_type(*gptr());
+  }
+
+  /** Bulk read of up to 'num' characters into 's'.
+   * Characters that underflow() has already buffered are handed out first,
+   * so single-character and bulk reads can be mixed without losing or
+   * reordering data; the remainder is read from zlib directly into 's'.
+   * @return number of characters stored, never negative; less than 'num'
+   *         at end of file or on a read error
+   */
+  std::streamsize xsgetn(char* s, std::streamsize num)
+  {
+    if (num <= 0) return 0;
+
+    std::streamsize delivered = 0;
+
+    // 1) hand out what is still waiting in the get area
+    const std::streamsize buffered = egptr() - gptr();
+    if (buffered > 0) {
+      delivered = (buffered < num) ? buffered : num;
+      std::memcpy(s, gptr(), static_cast<size_t>(delivered));
+      gbump(static_cast<int>(delivered));
+    }
+    if (delivered == num) return delivered;
+
+    // 2) read the rest straight into the caller's memory; gzread() takes an
+    //    unsigned length and returns an int, hence the chunking
+    const std::streamsize maxChunk = 1 << 30;
+    bool readDirect = false;
+    while (delivered < num) {
+      const std::streamsize want = num - delivered;
+      const unsigned int chunk =
+        static_cast<unsigned int>(want < maxChunk ? want : maxChunk);
+      const int got = gzread(_gzf, s + delivered, chunk);
+      if (got <= 0) break; // end of file or error
+      delivered += got;
+      readDirect = true;
+    }
+
+    // 3) the get area no longer mirrors the end of the data handed out:
+    //    store the last characters in the putback area and leave the get
+    //    area empty, so the next single-character read calls underflow()
+    if (readDirect) {
+      const std::streamsize room = sizeof(int);
+      const std::streamsize keep = (delivered < room) ? delivered : room;
+      std::memcpy(_buff + (room - keep), s + (delivered - keep),
+                  static_cast<size_t>(keep));
+      setg(_buff + (room - keep), _buff + room, _buff + room);
+    }
+
+    return delivered;
+  }
+
+private:
+  // not copyable: a copy would call gzclose() on the same handle a second
+  // time (declared, intentionally not defined)
+  gzfilebuf(const gzfilebuf&);
+  gzfilebuf& operator=(const gzfilebuf&);
+
+  gzFile _gzf;
+  static const unsigned int _buffsize = 1024;
+  char _buff[_buffsize];
+};
+
+}
+
+#endif