Src/Utils/_UniversalVocabulary.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

#include "_UniversalVocabulary.h"
#include <string>
#include <fstream>
#include <iostream>
#include <cstring>
#include <stdlib.h>

using namespace std;

/**
* Create a universal vocabulary object bound to a vocabulary file.
*
* Keeps a private, malloc'ed copy of the file name (released with free() in
* the destructor) and constructs the underlying C_IDVocabulary from the same
* file name.
*
* Terminates the process with exit(-1) if the file name is NULL or the copy
* of the file name cannot be allocated.
*
* @param universalVocFileName NUL-terminated name of the universal vocabulary file
**/
C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName)
{
	if(universalVocFileName == NULL){
		std::cerr<<"Universal vocabulary file name is missing. Exit!\n";
		exit(-1);
	}

	// Number of bytes to copy, including the terminating NUL.
	const size_t fileNameSize = strlen(universalVocFileName) + 1;

	this->universalCorpusFileName = static_cast<char *>(malloc(sizeof(char)*fileNameSize));
	if(this->universalCorpusFileName == NULL){
		std::cerr<<"Can not allocate memory for the universal vocabulary file name. Exit!\n";
		exit(-1);
	}

	// Copy exactly fileNameSize bytes; the source's terminating NUL comes along.
	memcpy(this->universalCorpusFileName, universalVocFileName, fileNameSize);

	this->universalVoc = new C_IDVocabulary(universalVocFileName);
}

/**
* Release what the constructor acquired: the malloc'ed copy of the
* vocabulary file name and the C_IDVocabulary object created with new.
**/
C_UniversalVocabulary::~C_UniversalVocabulary()
{
	// The file name buffer was obtained from malloc(), so hand it back with free().
	free(universalCorpusFileName);

	// The vocabulary object was obtained from new, so destroy it with delete.
	delete universalVoc;
}


/**
* Update the universal vocabulary with words in a new corpus
* Output the updated universal vocabulary
* Output the vocabulary needed for the new corpus too
*
* Revision $Rev: 3794 $
* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
**/
void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName)
{

	ifstream textStream;
	textStream.open(newCorpusFileName);

	if(!textStream){
		fprintf(stderr,"Corpus file %s does not exist. Exit!\n",newCorpusFileName);
		exit(-1);
	}


	//add reserved words from universal voc
	for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){
		C_String reservedWordText = this->universalVoc->getText(vocId);
		this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId));
	}

	string aLine;
	unsigned int sentNumber = 1;
	unsigned int corpusSize = 0;

	char * thisToken;
	char delimit[] =" \t\r\n";
	map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus;
	

	getline(textStream, aLine);
	while(!textStream.eof()){

		if(aLine.length()>0){

			thisToken = strtok((char*) aLine.c_str(), delimit );
			while( thisToken != NULL ) {			
				
				C_String thisWord(thisToken);

				//check if this word has already been seen
				iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord);

				if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){
					//new type
					IndexType vocId = this->universalVoc->getId(thisWord);
					this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId));
				}

				
				// While there are tokens in "string"
				// Get next token: 
				thisToken = strtok( NULL, delimit);
			}

		}
		
		getline(textStream, aLine);
	}


	//now output the updated universal vocabulary
	this->universalVoc->outputToFile(this->universalCorpusFileName);

	//output the vocabulary needed for the new corpus
	char vocabularyForNewCorpusFileName[1024];
	sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName);

	ofstream outputVocFile;
	outputVocFile.open(vocabularyForNewCorpusFileName);

	if(!outputVocFile){
		cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n";
		exit(-1);
	}

	iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin();
	while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){
		outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl;
		iterWordsUsedInTheNewCorpus++;
	}

	outputVocFile.close();
}