add tool for phrase lookup with biconcor

author: Philipp Koehn <pkoehn@odin.inf.ed.ac.uk> 2014-09-21 09:03:51 +0400
committer: Philipp Koehn <pkoehn@odin.inf.ed.ac.uk> 2014-09-21 09:03:51 +0400
commit: 1ed54a6181b02da19d20c548d9f9484e34e21206 (patch)
tree: 70c28f0ac2b4d5bde6a9a8667d9ff50180654290 /biconcor
parent: 3740c9f24859620e68b3e77a5aa0084f9b2d18be (diff)
2 files changed, 133 insertions, 1 deletions
diff --git a/biconcor/Jamfile b/biconcor/Jamfile
index 76f5c7aaf..83a738000 100644
--- a/biconcor/Jamfile
+++ b/biconcor/Jamfile
@@ -1,2 +1,2 @@
 exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ;
-
+exe phrase-lookup : Vocabulary.cpp SuffixArray.cpp phrase-lookup.cpp ;
diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp
new file mode 100644
index 000000000..c6d1b9cdf
--- /dev/null
+++ b/biconcor/phrase-lookup.cpp
@@ -0,0 +1,132 @@
+#include "SuffixArray.h"
+#include <getopt.h>
+
+using namespace std;
+
+size_t lookup( string );
+vector<string> tokenize( const char input[] );
+SuffixArray suffixArray;
+
+// Entry point of phrase-lookup: creates (--create) or loads (--load) the suffix array, then
+// prints the count it reports for one --query phrase or for every line of stdin (--stdio).
+// Exit status is 1 for invalid options and 0 otherwise; progress and errors go to stderr.
+int main(int argc, char* argv[]) {
+  // handle parameters
+  string query;
+  string fileNameSuffix;
+  string fileNameSource;
+  bool loadFlag = false;
+  bool saveFlag = false;
+  bool createFlag = false;
+  bool queryFlag = false;
+  bool stdioFlag = false;  // receive requests from STDIN, respond to STDOUT
+  // the Jamfile builds this executable as "phrase-lookup", so the usage text names it that way
+  string info = "usage: phrase-lookup\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
+  while(1) {
+    static struct option long_options[] = {
+      {"load", required_argument, 0, 'l'},
+      {"save", required_argument, 0, 's'},
+      {"create", required_argument, 0, 'c'},
+      {"query", required_argument, 0, 'q'},
+      {"stdio", no_argument, 0, 'i'},
+      {0, 0, 0, 0}
+    };
+    int option_index = 0;
+    int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
+    if (c == -1) break;  // no more options
+    switch (c) {
+    case 'l':
+      fileNameSuffix = string(optarg);
+      loadFlag = true;
+      break;
+    case 's':  // shares fileNameSuffix with --load; the two are rejected together below
+      fileNameSuffix = string(optarg);
+      saveFlag = true;
+      break;
+    case 'c':
+      fileNameSource = string(optarg);
+      createFlag = true;
+      break;
+    case 'q':
+      query = string(optarg);
+      queryFlag = true;
+      break;
+    case 'i':
+      stdioFlag = true;
+      break;
+    default:
+      cerr << info;
+      exit(1);
+    }
+  }
+
+  // check if parameter settings are legal
+  if (saveFlag && !createFlag) {
+    cerr << "error: cannot save without creating\n" << info;
+    exit(1);
+  }
+  if (saveFlag && loadFlag) {
+    cerr << "error: cannot load and save at the same time\n" << info;
+    exit(1);
+  }
+  if (!loadFlag && !createFlag) {
+    cerr << "error: neither load or create - i have no info!\n" << info;
+    exit(1);
+  }
+
+  // do your thing
+  if (createFlag) {
+    cerr << "will create\n";
+    cerr << "corpus is in " << fileNameSource << endl;
+    suffixArray.Create( fileNameSource );
+    if (saveFlag) {
+      suffixArray.Save( fileNameSuffix );
+      cerr << "will save in " << fileNameSuffix << endl;
+    }
+  }
+  if (loadFlag) {
+    cerr << "will load from " << fileNameSuffix << endl;
+    suffixArray.Load( fileNameSuffix );
+  }
+  if (stdioFlag) {
+    // --stdio takes precedence over --query: answer one request per input line. Testing the
+    // stream itself (not just eof()) keeps a final line without '\n' and stops on read errors.
+    string line;
+    while (getline(cin, line)) {
+      cout << lookup( line ) << endl;  // endl flushes, so each answer is written before the next read
+    }
+  }
+  else if (queryFlag) {
+    cout << lookup( query ) << endl;
+  }
+  return 0;
+}
+
+size_t lookup( string query ) {  // logs the query on stderr, returns suffixArray.Count() of its words
+  cerr << "query is " << query << endl;
+  vector< string > words = tokenize( query.c_str() );
+  return suffixArray.Count( words );
+}
+
+// Split a C string into words: maximal runs of characters other than ' ' and '\t'.
+// Leading, trailing and repeated separators never produce empty words.
+vector<string> tokenize( const char input[] )
+{
+  vector< string > words;
+  const char *cursor = input;
+  while (*cursor != '\0') {
+    // skip the blanks and tabs in front of the next word
+    while (*cursor == ' ' || *cursor == '\t') {
+      ++cursor;
+    }
+    if (*cursor == '\0') break;
+    // advance to the first separator (or the terminator) after the word
+    const char *wordBegin = cursor;
+    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t') {
+      ++cursor;
+    }
+    words.push_back( string( wordBegin, cursor - wordBegin ) );
+  }
+  return words;
+}
+
author	Philipp Koehn <pkoehn@odin.inf.ed.ac.uk>	2014-09-21 09:03:51 +0400
committer	Philipp Koehn <pkoehn@odin.inf.ed.ac.uk>	2014-09-21 09:03:51 +0400
commit	1ed54a6181b02da19d20c548d9f9484e34e21206 (patch)
tree	70c28f0ac2b4d5bde6a9a8667d9ff50180654290 /biconcor
parent	3740c9f24859620e68b3e77a5aa0084f9b2d18be (diff)