Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
m--------- lazy 0
-rw-r--r-- moses/src/Phrase.cpp 12
-rw-r--r-- moses/src/Phrase.h 2
-rw-r--r-- moses/src/RuleTable/Loader.h 2
-rw-r--r-- moses/src/RuleTable/LoaderCompact.cpp 3
-rw-r--r-- moses/src/RuleTable/LoaderCompact.h 2
-rw-r--r-- moses/src/RuleTable/LoaderHiero.cpp 4
-rw-r--r-- moses/src/RuleTable/LoaderHiero.h 2
-rw-r--r-- moses/src/RuleTable/LoaderStandard.cpp 77
-rw-r--r-- moses/src/RuleTable/LoaderStandard.h 4
-rw-r--r-- moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp 5
-rw-r--r-- moses/src/RuleTable/Trie.cpp 5
-rw-r--r-- moses/src/Word.cpp 18
-rw-r--r-- moses/src/Word.h 3
-rwxr-xr-x scripts/generic/trainlm-irst.perl 4
15 files changed, 71 insertions, 72 deletions
diff --git a/lazy b/lazy
-Subproject 4fde3f71c093afae1865d549c9b2c3a3cce5abe
+Subproject a43576d0576613c2587b3c0e012c22a0976ff53
diff --git a/moses/src/Phrase.cpp b/moses/src/Phrase.cpp
index 45ad41d95..a2b2bb256 100644
--- a/moses/src/Phrase.cpp
+++ b/moses/src/Phrase.cpp
@@ -160,13 +160,15 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
void Phrase::CreateFromStringNewFormat(FactorDirection direction
, const std::vector<FactorType> &factorOrder
- , const std::string &phraseString
+ , const StringPiece &phraseString
, const std::string & /*factorDelimiter */
, Word &lhs)
{
// parse
- vector<string> annotatedWordVector;
- Tokenize(annotatedWordVector, phraseString);
+ vector<StringPiece> annotatedWordVector;
+ for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
+ annotatedWordVector.push_back(*it);
+ }
// KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
// to
// "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
@@ -174,7 +176,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
m_words.reserve(annotatedWordVector.size()-1);
for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() - 1 ; phrasePos++) {
- string &annotatedWord = annotatedWordVector[phrasePos];
+ StringPiece &annotatedWord = annotatedWordVector[phrasePos];
bool isNonTerminal;
if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") {
// non-term
@@ -197,7 +199,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
}
// lhs
- string &annotatedWord = annotatedWordVector.back();
+ StringPiece &annotatedWord = annotatedWordVector.back();
CHECK(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);
diff --git a/moses/src/Phrase.h b/moses/src/Phrase.h
index 547f6cbcd..4d2662059 100644
--- a/moses/src/Phrase.h
+++ b/moses/src/Phrase.h
@@ -70,7 +70,7 @@ public:
void CreateFromStringNewFormat(FactorDirection direction
, const std::vector<FactorType> &factorOrder
- , const std::string &phraseString
+ , const StringPiece &phraseString
, const std::string &factorDelimiter
, Word &lhs);
diff --git a/moses/src/RuleTable/Loader.h b/moses/src/RuleTable/Loader.h
index fac8900bd..7ca0c339f 100644
--- a/moses/src/RuleTable/Loader.h
+++ b/moses/src/RuleTable/Loader.h
@@ -40,7 +40,7 @@ class RuleTableLoader
virtual bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
diff --git a/moses/src/RuleTable/LoaderCompact.cpp b/moses/src/RuleTable/LoaderCompact.cpp
index f88c425e1..78f943ebf 100644
--- a/moses/src/RuleTable/LoaderCompact.cpp
+++ b/moses/src/RuleTable/LoaderCompact.cpp
@@ -36,7 +36,7 @@ namespace Moses
bool RuleTableLoaderCompact::Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t /* tableLimit */,
const LMList &languageModels,
@@ -45,6 +45,7 @@ bool RuleTableLoaderCompact::Load(const std::vector<FactorType> &input,
{
PrintUserTime("Start loading compact rule table");
+ InputFileStream inStream(inFile);
LineReader reader(inStream);
// Read and check version number.
diff --git a/moses/src/RuleTable/LoaderCompact.h b/moses/src/RuleTable/LoaderCompact.h
index 4dc7c87aa..5a5d83525 100644
--- a/moses/src/RuleTable/LoaderCompact.h
+++ b/moses/src/RuleTable/LoaderCompact.h
@@ -41,7 +41,7 @@ class RuleTableLoaderCompact : public RuleTableLoader
public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
diff --git a/moses/src/RuleTable/LoaderHiero.cpp b/moses/src/RuleTable/LoaderHiero.cpp
index db43b2958..0eb8cadc1 100644
--- a/moses/src/RuleTable/LoaderHiero.cpp
+++ b/moses/src/RuleTable/LoaderHiero.cpp
@@ -15,7 +15,7 @@ namespace Moses {
bool RuleTableLoaderHiero::Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
@@ -24,7 +24,7 @@ bool RuleTableLoaderHiero::Load(const std::vector<FactorType> &input,
{
bool ret = RuleTableLoaderStandard::Load(HieroFormat
,input, output
- ,inStream, weight
+ ,inFile, weight
,tableLimit, languageModels
,wpProducer, ruleTable);
return ret;
diff --git a/moses/src/RuleTable/LoaderHiero.h b/moses/src/RuleTable/LoaderHiero.h
index 4a74f90b8..3f77b765c 100644
--- a/moses/src/RuleTable/LoaderHiero.h
+++ b/moses/src/RuleTable/LoaderHiero.h
@@ -19,7 +19,7 @@ class RuleTableLoaderHiero : public RuleTableLoaderStandard
public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
diff --git a/moses/src/RuleTable/LoaderStandard.cpp b/moses/src/RuleTable/LoaderStandard.cpp
index 331a07de0..41fee2e4b 100644
--- a/moses/src/RuleTable/LoaderStandard.cpp
+++ b/moses/src/RuleTable/LoaderStandard.cpp
@@ -23,7 +23,9 @@
#include <string>
#include <iterator>
#include <algorithm>
+#include <iostream>
#include <sys/stat.h>
+#include <stdlib.h>
#include "RuleTable/Trie.h"
#include "FactorCollection.h"
#include "Word.h"
@@ -34,6 +36,10 @@
#include "UserMessage.h"
#include "ChartTranslationOptionList.h"
#include "FactorCollection.h"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
using namespace std;
@@ -41,7 +47,7 @@ namespace Moses
{
bool RuleTableLoaderStandard::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
- , std::istream &inStream
+ , const std::string &inFile
, const std::vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
@@ -50,7 +56,7 @@ bool RuleTableLoaderStandard::Load(const std::vector<FactorType> &input
{
bool ret = Load(MosesFormat
,input, output
- ,inStream, weight
+ ,inFile, weight
,tableLimit, languageModels
,wpProducer, ruleTable);
return ret;
@@ -107,7 +113,7 @@ void ReformateHieroScore(string &scoreString)
scoreString = Join(" ", toks);
}
-string *ReformatHieroRule(const string &lineOrig)
+void ReformatHieroRule(const string &lineOrig, string &out)
{
vector<string> tokens;
vector<float> scoreVector;
@@ -137,13 +143,13 @@ string *ReformatHieroRule(const string &lineOrig)
<< scoreString << " ||| "
<< align.str();
- return new string(ret.str());
+ out = ret.str();
}
bool RuleTableLoaderStandard::Load(FormatType format
, const std::vector<FactorType> &input
, const std::vector<FactorType> &output
- , std::istream &inStream
+ , const std::string &inFile
, const std::vector<float> &weight
, size_t /* tableLimit */
, const LMList &languageModels
@@ -155,44 +161,54 @@ bool RuleTableLoaderStandard::Load(FormatType format
const StaticData &staticData = StaticData::Instance();
const std::string& factorDelimiter = staticData.GetFactorDelimiter();
-
string lineOrig;
size_t count = 0;
- while(getline(inStream, lineOrig)) {
- const string *line;
- if (format == HieroFormat) { // reformat line
- line = ReformatHieroRule(lineOrig);
- }
- else
- { // do nothing to format of line
- line = &lineOrig;
- }
-
- vector<string> tokens;
- vector<float> scoreVector;
+ std::ostream *progress = NULL;
+ IFVERBOSE(1) progress = &std::cerr;
+ util::FilePiece in(inFile.c_str(), progress);
- TokenizeMultiCharSeparator(tokens, *line , "|||" );
+ // reused variables
+ vector<float> scoreVector;
+ StringPiece line;
+ std::string hiero_before, hiero_after;
+
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) { break; }
+
+ if (format == HieroFormat) { // inefficiently reformat line
+ hiero_before.assign(line.data(), line.size());
+ ReformatHieroRule(hiero_before, hiero_after);
+ line = hiero_after;
+ }
- if (tokens.size() != 4 && tokens.size() != 5) {
+ util::TokenIter<util::MultiCharacter> pipes(line, "|||");
+ StringPiece sourcePhraseString(*pipes);
+ StringPiece targetPhraseString(*++pipes);
+ StringPiece scoreString(*++pipes);
+ StringPiece alignString(*++pipes);
+
+ if (++pipes && ++pipes) {
stringstream strme;
strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
UserMessage::Add(strme.str());
abort();
}
- const string &sourcePhraseString = tokens[0]
- , &targetPhraseString = tokens[1]
- , &scoreString = tokens[2]
- , &alignString = tokens[3];
-
bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
continue;
}
- Tokenize<float>(scoreVector, scoreString);
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ char *err_ind;
+ scoreVector.push_back(strtod(s->data(), &err_ind));
+ UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
+ }
const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
if (scoreVector.size() != numScoreComponents) {
stringstream strme;
@@ -201,7 +217,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
UserMessage::Add(strme.str());
abort();
}
- CHECK(scoreVector.size() == numScoreComponents);
// parse source & find pt node
@@ -231,14 +246,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
phraseColl.Add(targetPhrase);
count++;
-
- if (format == HieroFormat) { // reformat line
- delete line;
- }
- else
- { // do nothing
- }
-
}
// sort and prune each target phrase collection
diff --git a/moses/src/RuleTable/LoaderStandard.h b/moses/src/RuleTable/LoaderStandard.h
index 6fea42794..e0940fdd9 100644
--- a/moses/src/RuleTable/LoaderStandard.h
+++ b/moses/src/RuleTable/LoaderStandard.h
@@ -32,7 +32,7 @@ protected:
bool Load(FormatType format,
const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
@@ -41,7 +41,7 @@ protected:
public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
- std::istream &inStream,
+ const std::string &inFile,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
diff --git a/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp
index 68e2416e7..051ea26d3 100644
--- a/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp
+++ b/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp
@@ -59,12 +59,9 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
string grammarFile = GetFilePath() + "/grammar.out." + SPrint(translationId) + ".gz";
- // data from file
- InputFileStream inFile(grammarFile);
-
std::auto_ptr<RuleTableLoader> loader =
RuleTableLoaderFactory::Create(grammarFile);
- bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
+ bool ret = loader->Load(*m_input, *m_output, grammarFile, *m_weight, m_tableLimit,
*m_languageModels, m_wpProducer, *this);
CHECK(ret);
diff --git a/moses/src/RuleTable/Trie.cpp b/moses/src/RuleTable/Trie.cpp
index 017d39826..1fc404b7c 100644
--- a/moses/src/RuleTable/Trie.cpp
+++ b/moses/src/RuleTable/Trie.cpp
@@ -43,16 +43,13 @@ bool RuleTableTrie::Load(const std::vector<FactorType> &input,
m_filePath = filePath;
m_tableLimit = tableLimit;
- // data from file
- InputFileStream inFile(filePath);
-
std::auto_ptr<Moses::RuleTableLoader> loader =
Moses::RuleTableLoaderFactory::Create(filePath);
if (!loader.get())
{
return false;
}
- bool ret = loader->Load(input, output, inFile, weight, tableLimit,
+ bool ret = loader->Load(input, output, filePath, weight, tableLimit,
languageModels, wpProducer, *this);
return ret;
}
diff --git a/moses/src/Word.cpp b/moses/src/Word.cpp
index 045fa01b7..b748beba2 100644
--- a/moses/src/Word.cpp
+++ b/moses/src/Word.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Word.h"
#include "TypeDef.h"
#include "StaticData.h" // needed to determine the FactorDelimiter
+#include "util/tokenize_piece.hh"
using namespace std;
@@ -87,23 +88,16 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
void Word::CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
- , const std::string &str
+ , const StringPiece &str
, bool isNonTerminal)
{
FactorCollection &factorCollection = FactorCollection::Instance();
- vector<string> wordVec;
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
- TokenizeMultiCharSeparator(wordVec, str, factorDelimiter);
- //Tokenize(wordVec, str, "|");
- CHECK(wordVec.size() <= factorOrder.size());
-
- const Factor *factor;
- for (size_t ind = 0; ind < wordVec.size(); ++ind) {
- FactorType factorType = factorOrder[ind];
- factor = factorCollection.AddFactor(direction, factorType, wordVec[ind]);
- m_factorArray[factorType] = factor;
+ util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
+ for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
+ m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
}
+ CHECK(!fit);
// assume term/non-term same for all factors
m_isNonTerminal = isNonTerminal;
diff --git a/moses/src/Word.h b/moses/src/Word.h
index aed7430a8..c3c93563b 100644
--- a/moses/src/Word.h
+++ b/moses/src/Word.h
@@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"
#include "Factor.h"
#include "Util.h"
+#include "util/string_piece.hh"
namespace Moses
{
@@ -129,7 +130,7 @@ public:
void CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
- , const std::string &str
+ , const StringPiece &str
, bool isNonTerminal);
void CreateUnknownWord(const Word &sourceWord);
diff --git a/scripts/generic/trainlm-irst.perl b/scripts/generic/trainlm-irst.perl
index d8c6ce2a5..6fac9767f 100755
--- a/scripts/generic/trainlm-irst.perl
+++ b/scripts/generic/trainlm-irst.perl
@@ -71,11 +71,11 @@ print "extension is $ext\n";
if ($ext eq "gz")
{
- $cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
+ $cmd = "$irstPath/compile-lm --text $tempPath/iarpa.gz /dev/stdout | gzip -c > $lmPath";
}
else
{
- $cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
+ $cmd = "$irstPath/compile-lm --text $tempPath/iarpa.gz $lmPath";
}
print STDERR "EXECUTING $cmd\n";