1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
#include "_UniversalVocabulary.h"
#include <string>
#include <fstream>
#include <iostream>
#include <cstring>
#include <stdlib.h>
using namespace std;
/**
* Construct a universal vocabulary bound to a vocabulary file.
*
* Keeps a private, malloc'ed copy of the file name in universalCorpusFileName
* and creates the C_IDVocabulary object from the same file name.
* Terminates the process with exit(-1) if the copy of the name cannot be allocated.
*
* @param universalVocFileName NUL-terminated name of the universal vocabulary file
**/
C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName)
{
    // strlen() returns size_t; keep that type so the length is never narrowed.
    // The extra byte holds the terminating NUL.
    const size_t fileNameSize = strlen(universalVocFileName) + 1;

    this->universalCorpusFileName = static_cast<char *>(malloc(sizeof(char)*fileNameSize));
    if(this->universalCorpusFileName == NULL){
        fprintf(stderr,"Can not allocate memory for vocabulary file name %s. Exit!\n",universalVocFileName);
        exit(-1);
    }

    // Copies the characters and the terminating NUL in one go.
    memcpy(this->universalCorpusFileName, universalVocFileName, fileNameSize);

    this->universalVoc = new C_IDVocabulary(universalVocFileName);
}
/**
* Release what the object owns: the file name buffer (returned with free())
* and the vocabulary object (destroyed with delete).
**/
C_UniversalVocabulary::~C_UniversalVocabulary()
{
    free(universalCorpusFileName);
    delete universalVoc;
}
/**
* Update the universal vocabulary with words in a new corpus
* Output the updated universal vocabulary
* Output the vocabulary needed for the new corpus too
*
* Revision $Rev: 3794 $
* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
**/
void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName)
{
ifstream textStream;
textStream.open(newCorpusFileName);
if(!textStream){
fprintf(stderr,"Corpus file %s does not exist. Exit!\n",newCorpusFileName);
exit(-1);
}
//add reserved words from universal voc
for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){
C_String reservedWordText = this->universalVoc->getText(vocId);
this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId));
}
string aLine;
unsigned int sentNumber = 1;
unsigned int corpusSize = 0;
char * thisToken;
char delimit[] =" \t\r\n";
map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus;
getline(textStream, aLine);
while(!textStream.eof()){
if(aLine.length()>0){
thisToken = strtok((char*) aLine.c_str(), delimit );
while( thisToken != NULL ) {
C_String thisWord(thisToken);
//check if this word has already been seen
iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord);
if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){
//new type
IndexType vocId = this->universalVoc->getId(thisWord);
this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId));
}
// While there are tokens in "string"
// Get next token:
thisToken = strtok( NULL, delimit);
}
}
getline(textStream, aLine);
}
//now output the updated universal vocabulary
this->universalVoc->outputToFile(this->universalCorpusFileName);
//output the vocabulary needed for the new corpus
char vocabularyForNewCorpusFileName[1024];
sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName);
ofstream outputVocFile;
outputVocFile.open(vocabularyForNewCorpusFileName);
if(!outputVocFile){
cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n";
exit(-1);
}
iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin();
while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){
outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl;
iterWordsUsedInTheNewCorpus++;
}
outputVocFile.close();
}
|