From 6d0f482361b0b879496c5f0497cf1b1ca2c9d4c4 Mon Sep 17 00:00:00 2001
From: Philipp Koehn
Date: Mon, 20 Jul 2015 11:41:48 -0400
Subject: extended phrase lookup: print sentences, document id

---
 biconcor/SuffixArray.cpp   | 263 ++++++++++++++++++++++++++++++++++++---------
 biconcor/SuffixArray.h     |  19 ++++
 biconcor/phrase-lookup.cpp |  77 ++++++-------
 3 files changed, 270 insertions(+), 89 deletions(-)

(limited to 'biconcor')

diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp
index 9566ede99..9466b0e0f 100644
--- a/biconcor/SuffixArray.cpp
+++ b/biconcor/SuffixArray.cpp
@@ -21,6 +21,11 @@ SuffixArray::SuffixArray()
     m_wordInSentence(NULL),
     m_sentence(NULL),
     m_sentenceLength(NULL),
+    m_document(NULL),
+    m_documentName(NULL),
+    m_documentNameLength(0),
+    m_documentCount(0),
+    m_useDocument(false),
     m_vcb(),
     m_size(0),
     m_sentenceCount(0) { }
@@ -32,6 +37,8 @@ SuffixArray::~SuffixArray()
   free(m_wordInSentence);
   free(m_sentence);
   free(m_sentenceLength);
+  free(m_document);
+  free(m_documentName);
 }
 
 void SuffixArray::Create(const string& fileName )
@@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName )
   textFile.open(fileName.c_str());
 
   if (!textFile) {
-    cerr << "no such file or directory " << fileName << endl;
+    cerr << "Error: no such file or directory " << fileName << endl;
     exit(1);
   }
 
+  // first pass through data: get size
   istream *fileP = &textFile;
   m_size = 0;
   m_sentenceCount = 0;
+  m_documentCount = 0;
   while(!fileP->eof()) {
     SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
     if (fileP->eof()) break;
+    if (m_useDocument && ProcessDocumentLine(line,0)) continue;
     vector< WORD_ID > words = m_vcb.Tokenize( line );
     m_size += words.size() + 1;
     m_sentenceCount++;
   }
   textFile.close();
   cerr << m_size << " words (incl. sentence boundaries)" << endl;
+  if (m_useDocument) {
+    cerr << m_documentCount << " documents" << endl;
+    if (m_documentCount == 0) {
+      cerr << "Error: no documents found, aborting." << endl;
+      exit(1);
+    }
+  }
 
   // allocate memory
   m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
@@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName )
   m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
   m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
   m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
+  CheckAllocation(m_array != NULL, "m_array");
+  CheckAllocation(m_index != NULL, "m_index");
+  CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
+  CheckAllocation(m_sentence != NULL, "m_sentence");
+  CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
+  if (m_useDocument) {
+    m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
+    m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
+    m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
+    CheckAllocation(m_document != NULL, "m_document");
+    CheckAllocation(m_documentName != NULL, "m_documentName");
+    CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
+  }
 
-  // fill the array
+  // second pass through data: fill the arrays
   int wordIndex = 0;
   int sentenceId = 0;
+  m_documentNameLength = 0; // re-use as counter
+  m_documentCount = 0; // re-use as counter
   textFile.open(fileName.c_str());
-
-  if (!textFile) {
-    cerr << "no such file or directory " << fileName << endl;
-    exit(1);
-  }
-
   fileP = &textFile;
   while(!fileP->eof()) {
     SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
     if (fileP->eof()) break;
+    if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue;
     vector< WORD_ID > words = m_vcb.Tokenize( line );
     vector< WORD_ID >::const_iterator i;
 
@@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName )
 
   m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
   if (m_buffer == NULL) {
-    cerr << "cannot allocate memory to m_buffer" << endl;
+    cerr << "Error: cannot allocate memory to m_buffer" << endl;
     exit(1);
   }
 
@@ -114,6 +141,45 @@
   cerr << "done sorting" << endl;
 }
 
+// very specific code to deal with common crawl document ids
+bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
+{
+  size_t i;
+  // first 32 characters are hex-hash
+  for(i=0; i<32; i++) {
+    if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) {
+      return false;
+    }
+  }
+  if (line[i++] != ' ') return false;
+
+  // second token is float
+  for (; line[i] != ' ' && line[i] != 0; i++) {
+    if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
+      return false;
+    }
+  }
+  i++;
+
+  // last token is url (=name)
+  size_t startName = i;
+  for (; line[i] != ' ' && line[i] != 0; i++) {}
+  if (line[i] == ' ') return false;
+  size_t endName = i+1; // include '\0'
+
+  // second pass: record name and sentence number
+  if (m_document != NULL) {
+    m_documentName[m_documentCount] = m_documentNameLength;
+    for(size_t i=startName; i<endName; i++) {
+      m_documentNameBuffer[m_documentNameLength + i-startName] = line[i];
+    }
+    m_document[m_documentCount] = sentenceId;
+  }
+  m_documentNameLength += endName-startName;
+  m_documentCount++;
+  return true;
+}
+
@@ ... @@
+void SuffixArray::PrintSentenceMatches( const vector< WORD > &phrase )
+{
+  for(size_t i=0; i<phrase.size(); i++) {
+    if (i>0) cout << " ";
+    cout << phrase[i];
+  }
+  cout << '\t';
+  INDEX start = 0;
+  INDEX end = m_size-1;
+  INDEX mid = FindFirst( phrase, start, end );
+  if (mid == m_size) { // no matches
+    cout << "0 matches" << endl;
+    return;
+  }
+
+  INDEX firstMatch = FindLast( phrase, mid, start, -1 );
+  INDEX lastMatch = FindLast( phrase, mid, end, 1 );
+
+  // loop through all matches
+  cout << (lastMatch-firstMatch+1) << " matches" << endl;
+  for(INDEX i=firstMatch; i<=lastMatch;i++) {
+    // get sentence information
+    INDEX pos = GetPosition( i );
+    INDEX start = pos - GetWordInSentence( pos );
+    char length = GetSentenceLength( GetSentence( pos ) );
+    // print document name
+    if (m_useDocument) {
+      INDEX sentence = GetSentence( pos );
+      INDEX document = GetDocument( sentence );
+      PrintDocumentName( document );
+      cout << '\t';
+    }
+    // print sentence
+    for(char i=0; i<length; i++) {
+      if (i>0) cout << " ";
+      cout << GetWord( start + i );
+    }
+    cout << endl;
+  }
+}
+
+SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
+{
+  // binary search
+  INDEX min = 0;
+  INDEX max = m_documentCount-1;
+  if (sentence >= m_document[max]) {
+    return max;
+  }
+  while(true) {
+    INDEX mid = (min + max) / 2;
+    if (sentence >= m_document[mid] && sentence < m_document[mid+1]) {
+      return mid;
+    }
+    if (sentence < m_document[mid]) {
+      max = mid-1;
+    }
+    else {
+      min = mid+1;
+    }
+  }
+}
+
 void SuffixArray::Save(const string& fileName ) const
 {
   FILE *pFile = fopen ( fileName.c_str() , "w" );
-  if (pFile == NULL) {
-    cerr << "Cannot open " << fileName << endl;
-    exit(1);
-  }
+  if (pFile == NULL) Error("cannot open",fileName);
 
   fwrite( &m_size, sizeof(INDEX), 1, pFile );
   fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
@@ -288,6 +414,16 @@ void SuffixArray::Save(const string& fileName ) const
 
   fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
   fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
+
+  char useDocument = m_useDocument; // not sure if that is needed
+  fwrite( &useDocument, sizeof(char), 1, pFile );
+  if (m_useDocument) {
+    fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
+    fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
+    fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
+    fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
+    fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
+  }
 
   fclose( pFile );
   m_vcb.Save( fileName + ".src-vcb" );
@@ -296,56 +432,81 @@ void SuffixArray::Save(const string& fileName ) const
 
 void SuffixArray::Load(const string& fileName )
 {
   FILE *pFile = fopen ( fileName.c_str() , "r" );
-  if (pFile == NULL) {
-    cerr << "no such file or directory " << fileName << endl;
-    exit(1);
-  }
+  if (pFile == NULL) Error("no such file or directory", fileName);
 
   cerr << "loading from " << fileName << endl;
 
-  fread( &m_size, sizeof(INDEX), 1, pFile );
+  fread( &m_size, sizeof(INDEX), 1, pFile )
+  || Error("could not read m_size from", fileName);
   cerr << "words in corpus: " << m_size << endl;
+
   m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
   m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
   m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
   m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
-
-  if (m_array == NULL) {
-    cerr << "Error: cannot allocate memory to m_array" << endl;
-    exit(1);
-  }
-
-  if (m_index == NULL) {
-    cerr << "Error: cannot allocate memory to m_index" << endl;
-    exit(1);
-  }
-
-  if (m_wordInSentence == NULL) {
-    cerr << "Error: cannot allocate memory to m_wordInSentence" << endl;
-    exit(1);
-  }
-
-  if (m_sentence == NULL) {
-    cerr << "Error: cannot allocate memory to m_sentence" << endl;
-    exit(1);
-  }
-
-  fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
-  fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
-  fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
-  fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
-
-  fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
+  CheckAllocation(m_array != NULL, "m_array");
+  CheckAllocation(m_index != NULL, "m_index");
+  CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
+  CheckAllocation(m_sentence != NULL, "m_sentence");
+  fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
+  || Error("could not read m_array from", fileName);
+  fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
+  || Error("could not read m_index from", fileName);
+  fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
+  || Error("could not read m_wordInSentence from", fileName);
+  fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
+  || Error("could not read m_sentence from", fileName);
+
+  fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
+  || Error("could not read m_sentenceCount from", fileName);
   cerr << "sentences in corpus: " << m_sentenceCount << endl;
-  m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
-  if (m_sentenceLength == NULL) {
-    cerr << "Error: cannot allocate memory to m_sentenceLength" << endl;
-    exit(1);
+  m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
+  CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
+  fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
+  || Error("could not read m_sentenceLength from", fileName);
+
+  if (m_useDocument) { // do not read it when you do not need it
+    char useDocument;
+    fread( &useDocument, sizeof(char), 1, pFile )
+    || Error("could not read m_useDocument from", fileName);
+    if (!useDocument) {
+      cerr << "Error: stored suffix array does not have a document index\n";
+      exit(1);
+    }
+    fread( &m_documentCount, sizeof(INDEX), 1, pFile )
+    || Error("could not read m_documentCount from", fileName);
+    m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
+    m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
+    CheckAllocation(m_document != NULL, "m_document");
+    CheckAllocation(m_documentName != NULL, "m_documentName");
+    fread( m_document, sizeof(INDEX), m_documentCount, pFile )
+    || Error("could not read m_document from", fileName);
+    fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
+    || Error("could not read m_documentName from", fileName);
+    fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
+    || Error("could not read m_documentNameLength from", fileName);
+    m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
+    CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
+    fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
+    || Error("could not read m_document from", fileName);
   }
-  fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
 
   fclose( pFile );
   m_vcb.Load( fileName + ".src-vcb" );
 }
+
+void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
+{
+  if (check) return;
+  cerr << "Error: could not allocate memory for " << dataStructure << endl;
+  exit(1);
+}
+
+bool SuffixArray::Error( const char *message, const string &fileName) const
+{
+  cerr << "Error: " << message << " " << fileName << endl;
+  exit(1);
+  return true; // yeah, i know.
+}
diff --git a/biconcor/SuffixArray.h b/biconcor/SuffixArray.h
index af7f5567e..f20702e41 100644
--- a/biconcor/SuffixArray.h
+++ b/biconcor/SuffixArray.h
@@ -15,6 +15,12 @@ private:
   INDEX *m_sentence;
   char *m_sentenceLength;
   WORD_ID m_endOfSentence;
+  INDEX *m_document;
+  INDEX *m_documentName;
+  char *m_documentNameBuffer;
+  size_t m_documentNameLength;
+  size_t m_documentCount;
+  bool m_useDocument;
   Vocabulary m_vcb;
   INDEX m_size;
   INDEX m_sentenceCount;
@@ -28,6 +34,7 @@ public:
   ~SuffixArray();
 
   void Create(const std::string& fileName );
+  bool ProcessDocumentLine( const char* const, const size_t );
   void Sort(INDEX start, INDEX end);
   int CompareIndex( INDEX a, INDEX b ) const;
   inline int CompareWord( WORD_ID a, WORD_ID b ) const;
@@ -40,6 +47,7 @@ public:
   INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
   int Match( const std::vector< WORD > &phrase, INDEX index );
   void List( INDEX start, INDEX end );
+  void PrintSentenceMatches( const std::vector< WORD > &phrase );
   inline INDEX GetPosition( INDEX index ) const {
     return m_index[ index ];
   }
@@ -58,6 +66,17 @@ public:
   inline WORD GetWord( INDEX position ) const {
     return m_vcb.GetWord( m_array[position] );
   }
+  void UseDocument() {
+    m_useDocument = true;
+  }
+  INDEX GetDocument( INDEX sentence ) const;
+  void PrintDocumentName( INDEX document ) {
+    for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
+      std::cout << m_documentNameBuffer[ i ];
+    }
+  }
   void Save(const std::string& fileName ) const;
   void Load(const std::string& fileName );
+  void CheckAllocation(bool, const char *dataStructure) const;
+  bool Error( const char* message, const std::string& fileName) const;
 };
diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp
index 60ab8db66..0b940a4e9 100644
--- a/biconcor/phrase-lookup.cpp
+++ b/biconcor/phrase-lookup.cpp
@@ -1,4 +1,5 @@
 #include "SuffixArray.h"
+#include "../util/tokenize.hh"
 #include <getopt.h>
 
 using namespace std;
@@ -13,10 +14,12 @@ int main(int argc, char* argv[])
   string query;
   string fileNameSuffix;
   string fileNameSource;
-  int loadFlag = false;
-  int saveFlag = false;
-  int createFlag = false;
-  int queryFlag = false;
+  bool loadFlag = false;
+  bool saveFlag = false;
+  bool createFlag = false;
+  bool queryFlag = false;
+  bool querySentenceFlag = false;
+  int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
   string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
   while(1) {
@@ -25,11 +28,14 @@ int main(int argc, char* argv[])
       {"save", required_argument, 0, 's'},
       {"create", required_argument, 0, 'c'},
       {"query", required_argument, 0, 'q'},
+      {"query-sentence", required_argument, 0, 'Q'},
+      {"document", required_argument, 0, 'd'},
       {"stdio", no_argument, 0, 'i'},
+      {"stdio-sentence", no_argument, 0, 'I'},
       {0, 0, 0, 0}
     };
     int option_index = 0;
-    int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
+    int c = getopt_long (argc, argv, "l:s:c:q:Q:iId", long_options, &option_index);
     if (c == -1) break;
     switch (c) {
     case 'l':
@@ -48,17 +54,25 @@ int main(int argc, char* argv[])
       query = string(optarg);
       queryFlag = true;
       break;
+    case 'Q':
+      query = string(optarg);
+      querySentenceFlag = true;
+      break;
     case 'i':
       stdioFlag = true;
       break;
+    case 'I':
+      stdioFlag = true;
+      querySentenceFlag = true;
+      break;
+    case 'd':
+      suffixArray.UseDocument();
+      break;
     default:
       cerr << info;
       exit(1);
     }
   }
-  if (stdioFlag) {
-    queryFlag = true;
-  }
 
   // check if parameter settings are legal
   if (saveFlag && !createFlag) {
@@ -74,7 +88,7 @@ int main(int argc, char* argv[])
     exit(1);
   }
 
-  // do your thing
+  // get suffix array
  if (createFlag) {
     cerr << "will create\n";
     cerr << "corpus is in " << fileNameSource << endl;
@@ -88,16 +102,29 @@ int main(int argc, char* argv[])
     cerr << "will load from " << fileNameSuffix << endl;
     suffixArray.Load( fileNameSuffix );
   }
+
+  // do something with it
   if (stdioFlag) {
     while(true) {
       string query;
       if (getline(cin, query, '\n').eof()) {
         return 0;
       }
-      cout << lookup( query ) << endl;
+      if (querySentenceFlag) {
+        vector< string > queryString = util::tokenize( query.c_str() );
+        suffixArray.PrintSentenceMatches( queryString );
+      }
+      else {
+        cout << lookup( query ) << endl;
+      }
     }
-  } else if (queryFlag) {
+  }
+  else if (queryFlag) {
     cout << lookup( query ) << endl;
+  }
+  else if (querySentenceFlag) {
+    vector< string > queryString = util::tokenize( query.c_str() );
+    suffixArray.PrintSentenceMatches( queryString );
   }
   return 0;
 }
@@ -105,32 +132,6 @@ int main(int argc, char* argv[])
 size_t lookup( string query )
 {
   cerr << "query is " << query << endl;
-  vector< string > queryString = tokenize( query.c_str() );
+  vector< string > queryString = util::tokenize( query.c_str() );
   return suffixArray.Count( queryString );
 }
-
-// Duplicate of definition in util/tokenize.hh.
-// TODO: Can we de-duplicate this? At the time of writing biconcor does not
-// use util at all.
-vector<string> tokenize(const char input[])
-{
-  vector< string > token;
-  bool betweenWords = true;
-  int start=0;
-  int i;
-  for(i = 0; input[i] != '\0'; i++) {
-    const bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
-    if (!isSpace && betweenWords) {
-      start = i;
-      betweenWords = false;
-    } else if (isSpace && !betweenWords) {
-      token.push_back( string( input+start, i-start ) );
-      betweenWords = true;
-    }
-  }
-  if (!betweenWords)
-    token.push_back( string( input+start, i-start ) );
-  return token;
-}
--
cgit v1.2.3
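
Usage sketch (illustrative, not part of the patch): assuming the phrase-lookup binary built from biconcor/phrase-lookup.cpp and placeholder corpus/model file names, the lookup modes added here could be exercised roughly as follows. Note that the long option --document is declared with required_argument while the short -d takes no argument, so the short form is used below.

    # build a suffix array with the document index enabled (-d), then save it
    phrase-lookup --create corpus.tok.en --save corpus.en.sa -d

    # existing mode: print the number of matches for a phrase
    phrase-lookup --load corpus.en.sa --query "european union"

    # new mode: print every matching sentence, prefixed with its document name
    phrase-lookup --load corpus.en.sa --query-sentence "european union" -d

    # new stdio loop: one query per input line, per-sentence output
    phrase-lookup --load corpus.en.sa --stdio-sentence -d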