diff options
Diffstat (limited to 'src/command/marian_conv.cpp')
-rw-r--r-- | src/command/marian_conv.cpp | 26 |
1 files changed, 24 insertions, 2 deletions
diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index e0e89d2b..943f61d4 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -3,7 +3,7 @@ #include "tensors/cpu/expression_graph_packable.h" #include "onnx/expression_graph_onnx_exporter.h" #include "layers/lsh.h" - +#include "data/shortlist.h" #include <sstream> int main(int argc, char** argv) { @@ -16,7 +16,8 @@ int main(int argc, char** argv) { YAML::Node config; // @TODO: get rid of YAML::Node here entirely to avoid the pattern. Currently not fixing as it requires more changes to the Options object. auto cli = New<cli::CLIWrapper>( config, - "Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout", + "Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout\n" + "or convert a text lexical shortlist to a binary shortlist with {--shortlist,-s} option", "Allowed options", "Examples:\n" " ./marian-conv -f model.npz -t model.bin --gemm-type packed16"); @@ -30,9 +31,30 @@ int main(int argc, char** argv) { "Encode output matrix and optional rotation matrix into model file. " "arg1: number of bits in LSH encoding, arg2: name of output weights matrix")->implicit_val("1024 Wemb"); cli->add<std::vector<std::string>>("--vocabs,-V", "Vocabulary file, required for ONNX export"); + cli->add<std::vector<std::string>>("--shortlist,-s", "Shortlist conversion: filePath firstNum bestNum threshold"); + cli->add<std::string>("--dump-shortlist,-d", "Binary shortlist dump path","lex.bin"); cli->parse(argc, argv); options->merge(config); } + + // shortlist conversion: + // ./marian-conv --shortlist lex.esen.s2t 100 100 0 --dump-shortlist lex.esen.bin --vocabs vocab.esen.spm vocab.esen.spm + if(options->hasAndNotEmpty("shortlist")){ + auto vocabPaths = options->get<std::vector<std::string>>("vocabs"); + auto dumpPath = options->get<std::string>("dump-shortlist"); + + Ptr<Vocab> srcVocab = New<Vocab>(options, 0); + srcVocab->load(vocabPaths[0]); + Ptr<Vocab> trgVocab = New<Vocab>(options, 1); + trgVocab->load(vocabPaths[1]); + + Ptr<const data::ShortlistGenerator> binaryShortlistGenerator + = New<data::BinaryShortlistGenerator>(options, srcVocab, trgVocab, 0, 1, vocabPaths[0] == vocabPaths[1]); + binaryShortlistGenerator->dump(dumpPath); + LOG(info, "Dumping of the shortlist is finished"); + return 0; + } + auto modelFrom = options->get<std::string>("from"); auto modelTo = options->get<std::string>("to"); |