// Query binary phrase tables. // Christian Hardmeier, 16 May 2010 #include #include #include #include #include "Util.h" #include "OnDiskWrapper.h" #include "SourcePhrase.h" using namespace std; using namespace OnDiskPt; void usage(); typedef unsigned int uint; void Tokenize(OnDiskPt::Phrase &phrase , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , OnDiskPt::OnDiskWrapper &onDiskWrapper) { bool nonTerm = false; size_t tokSize = token.size(); int comStr =token.compare(0, 1, "["); if (comStr == 0) { comStr = token.compare(tokSize - 1, 1, "]"); nonTerm = comStr == 0; } if (nonTerm) { // non-term size_t splitPos = token.find_first_of("[", 2); string wordStr = token.substr(0, splitPos); if (splitPos == string::npos) { // lhs - only 1 word WordPtr word (new Word()); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } else { // source & target non-terms if (addSourceNonTerm) { WordPtr word( new Word()); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } wordStr = token.substr(splitPos, tokSize - splitPos); if (addTargetNonTerm) { WordPtr word(new Word()); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } } } else { // term WordPtr word(new Word()); word->CreateFromString(token, onDiskWrapper.GetVocab()); phrase.AddWord(word); } } int main(int argc, char **argv) { int tableLimit = 20; std::string ttable = ""; bool useAlignments = false; for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-tlimit")) { if(i + 1 == argc) usage(); tableLimit = atoi(argv[++i]); } else if(!strcmp(argv[i], "-t")) { if(i + 1 == argc) usage(); ttable = argv[++i]; } else usage(); } if(ttable == "") usage(); OnDiskWrapper onDiskWrapper; bool retDb = onDiskWrapper.BeginLoad(ttable); CHECK(retDb); cerr << "Ready..." << endl; std::string line; while(getline(std::cin, line)) { std::vector tokens; tokens = Moses::Tokenize(line, " "); cerr << "line: " << line << endl; // create source phrase SourcePhrase sourcePhrase; for (size_t pos = 0; pos < tokens.size(); ++pos) { const string &tok = tokens[pos]; if (pos == tokens.size() - 1) { // last position. LHS non-term Tokenize(sourcePhrase, tok, false, true, onDiskWrapper); } else { Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); } } const PhraseNode *node = &onDiskWrapper.GetRootSourceNode(); cerr << "node=" << node << endl; assert(node); for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) { const Word &word = sourcePhrase.GetWord(pos); cerr << word << " "; node = node->GetChild(word, onDiskWrapper); cerr << "node=" << node << endl; if (node == NULL) { break; } } if (node) { // source phrase points to a bunch of rules const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper); string str = coll->GetDebugStr(); cout << "Found " << coll->GetSize() << endl; for (size_t ind = 0; ind < coll->GetSize(); ++ind) { const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind); cerr << " "; targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab()); cerr << endl; } } else { cout << "Not found" << endl; } std::cout << '\n'; std::cout.flush(); } cerr << "Finished." << endl; } void usage() { std::cerr << "Usage: queryOnDiskPt [-n ] [-a] -t \n" "-tlimit max number of rules per source phrase (default: 20)\n" "-t phrase table\n"; exit(1); }