Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/training/phrase-extract/tables-core.cpp')
-rw-r--r--scripts/training/phrase-extract/tables-core.cpp108
1 files changed, 108 insertions, 0 deletions
diff --git a/scripts/training/phrase-extract/tables-core.cpp b/scripts/training/phrase-extract/tables-core.cpp
new file mode 100644
index 000000000..8f8b454aa
--- /dev/null
+++ b/scripts/training/phrase-extract/tables-core.cpp
@@ -0,0 +1,108 @@
+// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $
+//#include "beammain.h"
+#include "SafeGetline.h"
+#include "tables-core.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+#define UNKNOWNSTR "UNK"
+
+// as in beamdecoder/tables.cpp
+vector<string> tokenize( const char* input ) {
+ vector< string > token;
+ bool betweenWords = true;
+ int start=0;
+ int i=0;
+ for(; input[i] != '\0'; i++) {
+ bool isSpace = (input[i] == ' ' || input[i] == '\t');
+
+ if (!isSpace && betweenWords) {
+ start = i;
+ betweenWords = false;
+ }
+ else if (isSpace && !betweenWords) {
+ token.push_back( string( input+start, i-start ) );
+ betweenWords = true;
+ }
+ }
+ if (!betweenWords)
+ token.push_back( string( input+start, i-start ) );
+ return token;
+}
+
+WORD_ID Vocabulary::storeIfNew( const WORD& word ) {
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
+
+ if( i != lookup.end() )
+ return i->second;
+
+ WORD_ID id = vocab.size();
+ vocab.push_back( word );
+ lookup[ word ] = id;
+ return id;
+}
+
+WORD_ID Vocabulary::getWordID( const WORD& word ) {
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
+ if( i == lookup.end() )
+ return 0;
+ return i->second;
+}
+
+PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) {
+ map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
+ if( i != lookup.end() )
+ return i->second;
+
+ PHRASE_ID id = phraseTable.size();
+ phraseTable.push_back( phrase );
+ lookup[ phrase ] = id;
+ return id;
+}
+
+PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) {
+ map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
+ if( i == lookup.end() )
+ return 0;
+ return i->second;
+}
+
+void PhraseTable::clear() {
+ lookup.clear();
+ phraseTable.clear();
+}
+
+void DTable::init() {
+ for(int i = -10; i<10; i++)
+ dtable[i] = -abs( i );
+}
+
+void DTable::load( const string& fileName ) {
+ ifstream inFile;
+ inFile.open(fileName.c_str());
+ istream *inFileP = &inFile;
+
+ char line[TABLE_LINE_MAX_LENGTH];
+ int i=0;
+ while(true) {
+ i++;
+ SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
+ if (inFileP->eof()) break;
+
+ vector<string> token = tokenize( line );
+ if (token.size() < 2) {
+ cerr << "line " << i << " in " << fileName << " too short, skipping\n";
+ continue;
+ }
+
+ int d = atoi( token[0].c_str() );
+ double prob = log( atof( token[1].c_str() ) );
+ dtable[ d ] = prob;
+ }
+}
+
+double DTable::get( int distortion ) {
+ if (dtable.find( distortion ) == dtable.end())
+ return log( 0.00001 );
+ return dtable[ distortion ];
+}
+