Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/moses
diff options
context:
space:
mode:
author: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2015-02-25 04:27:08 +0300
committer: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2015-02-25 04:27:08 +0300
commit: 0a6adcde3a5a352d1276098bc0d3a0439e7b90a9 (patch)
tree: b3e627614774bdfcdd2a2e5edaf8d4b0d4a304e1 /moses
parent: 39c1ef52dc04e88b4205869aaff54459ad8ab80d (diff)
Model1Feature: special UNK token in MGIZA vocabularies
Diffstat (limited to 'moses')
-rw-r--r-- moses/FF/Model1Feature.cpp 15
1 file changed, 14 insertions, 1 deletion
diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp
index c4888a1ea..1c375bc41 100644
--- a/moses/FF/Model1Feature.cpp
+++ b/moses/FF/Model1Feature.cpp
@@ -71,6 +71,19 @@ void Model1Vocabulary::Load(const std::string& fileName)
std::string line;
unsigned i = 0;
+ if ( getline(inFile, line) ) // first line of MGIZA vocabulary files seems to be special : "1 UNK 0" -- skip if it's this
+ {
+ ++i;
+ std::vector<std::string> tokens = Tokenize(line);
+ UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+ unsigned id = Scan<unsigned>(tokens[0]);
+ if (! ( (id == 1) && (tokens[1] == "UNK") ))
+ {
+ const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading?
+ bool stored = Store(factor, id);
+ UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
+ }
+ }
while ( getline(inFile, line) )
{
++i;
@@ -79,7 +92,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
unsigned id = Scan<unsigned>(tokens[0]);
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading?
bool stored = Store(factor, id);
- UTIL_THROW_IF2(!stored && (tokens[1] != "UNK"), "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
+ UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
}
inFile.Close();
}