Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Philipp Koehn <pkoehn@odin.inf.ed.ac.uk>, 2014-09-21 09:03:51 +0400
committer: Philipp Koehn <pkoehn@odin.inf.ed.ac.uk>, 2014-09-21 09:03:51 +0400
commit: 1ed54a6181b02da19d20c548d9f9484e34e21206 (patch)
tree: 70c28f0ac2b4d5bde6a9a8667d9ff50180654290 /biconcor
parent: 3740c9f24859620e68b3e77a5aa0084f9b2d18be (diff)
add tool for phrase lookup with biconcor
Diffstat (limited to 'biconcor')
-rw-r--r-- biconcor/Jamfile 2
-rw-r--r-- biconcor/phrase-lookup.cpp 132
2 files changed, 133 insertions, 1 deletions
diff --git a/biconcor/Jamfile b/biconcor/Jamfile
index 76f5c7aaf..83a738000 100644
--- a/biconcor/Jamfile
+++ b/biconcor/Jamfile
@@ -1,2 +1,2 @@
exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ;
-
+exe phrase-lookup : Vocabulary.cpp SuffixArray.cpp phrase-lookup.cpp ;
diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp
new file mode 100644
index 000000000..c6d1b9cdf
--- /dev/null
+++ b/biconcor/phrase-lookup.cpp
@@ -0,0 +1,132 @@
+#include "SuffixArray.h"
+#include <getopt.h>
+
+using namespace std;
+
+size_t lookup( string );
+vector<string> tokenize( const char input[] );
+SuffixArray suffixArray;
+
+int main(int argc, char* argv[]) {
+ // handle parameters
+ string query;
+ string fileNameSuffix;
+ string fileNameSource;
+ int loadFlag = false;
+ int saveFlag = false;
+ int createFlag = false;
+ int queryFlag = false;
+ int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
+ string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
+ while(1) {
+ static struct option long_options[] = {
+ {"load", required_argument, 0, 'l'},
+ {"save", required_argument, 0, 's'},
+ {"create", required_argument, 0, 'c'},
+ {"query", required_argument, 0, 'q'},
+ {"stdio", no_argument, 0, 'i'},
+ {0, 0, 0, 0}
+ };
+ int option_index = 0;
+ int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
+ if (c == -1) break;
+ switch (c) {
+ case 'l':
+ fileNameSuffix = string(optarg);
+ loadFlag = true;
+ break;
+ case 's':
+ fileNameSuffix = string(optarg);
+ saveFlag = true;
+ break;
+ case 'c':
+ fileNameSource = string(optarg);
+ createFlag = true;
+ break;
+ case 'q':
+ query = string(optarg);
+ queryFlag = true;
+ break;
+ case 'i':
+ stdioFlag = true;
+ break;
+ default:
+ cerr << info;
+ exit(1);
+ }
+ }
+ if (stdioFlag) {
+ queryFlag = true;
+ }
+
+ // check if parameter settings are legal
+ if (saveFlag && !createFlag) {
+ cerr << "error: cannot save without creating\n" << info;
+ exit(1);
+ }
+ if (saveFlag && loadFlag) {
+ cerr << "error: cannot load and save at the same time\n" << info;
+ exit(1);
+ }
+ if (!loadFlag && !createFlag) {
+ cerr << "error: neither load or create - i have no info!\n" << info;
+ exit(1);
+ }
+
+ // do your thing
+ if (createFlag) {
+ cerr << "will create\n";
+ cerr << "corpus is in " << fileNameSource << endl;
+ suffixArray.Create( fileNameSource );
+ if (saveFlag) {
+ suffixArray.Save( fileNameSuffix );
+ cerr << "will save in " << fileNameSuffix << endl;
+ }
+ }
+ if (loadFlag) {
+ cerr << "will load from " << fileNameSuffix << endl;
+ suffixArray.Load( fileNameSuffix );
+ }
+ if (stdioFlag) {
+ while(true) {
+ string query;
+ if (getline(cin, query, '\n').eof()) {
+ return 0;
+ }
+ cout << lookup( query ) << endl;
+ }
+ }
+ else if (queryFlag) {
+ cout << lookup( query ) << endl;
+ }
+ return 0;
+}
+
+size_t lookup( string query ) {
+ cerr << "query is " << query << endl;
+ vector< string > queryString = tokenize( query.c_str() );
+ return suffixArray.Count( queryString );
+}
+
/**
 * Split a C string into tokens separated by spaces and tabs.
 *
 * Runs of separators are collapsed, and leading or trailing separators
 * produce no empty tokens. Only ' ' and '\t' separate tokens; any other
 * character (including '\n' and '\r') is part of a token.
 *
 * @param input  NUL-terminated text; a null pointer is treated as empty
 * @return the tokens in order of appearance (empty if there are none)
 */
std::vector<std::string> tokenize( const char input[] )
{
  std::vector< std::string > token;
  if (!input) {
    return token;
  }

  // Walk the text with pointers rather than int indices, so very long
  // inputs cannot overflow a signed counter.
  const char* cursor = input;
  while (*cursor != '\0') {
    // skip the separators in front of the next word
    while (*cursor == ' ' || *cursor == '\t') {
      ++cursor;
    }

    // consume the word itself
    const char* wordStart = cursor;
    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t') {
      ++cursor;
    }

    // empty only when trailing separators ran into the terminator
    if (cursor != wordStart) {
      token.push_back( std::string( wordStart, cursor ) );
    }
  }
  return token;
}
+