Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/salm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'Src/Utils/_UniversalVocabulary.cpp~')
-rwxr-xr-xSrc/Utils/_UniversalVocabulary.cpp~117
1 files changed, 117 insertions, 0 deletions
diff --git a/Src/Utils/_UniversalVocabulary.cpp~ b/Src/Utils/_UniversalVocabulary.cpp~
new file mode 100755
index 0000000..50a7396
--- /dev/null
+++ b/Src/Utils/_UniversalVocabulary.cpp~
@@ -0,0 +1,117 @@
+#include "_UniversalVocabulary.h"
+#include "malloc.h"
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cstring>
+
+using namespace std;
+
+C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName)
+{
+ int fileNameSize=strlen(universalVocFileName);
+ fileNameSize++;
+
+ this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize);
+ sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName);
+
+ this->universalVoc = new C_IDVocabulary(universalVocFileName);
+
+}
+
+C_UniversalVocabulary::~C_UniversalVocabulary()
+{
+ free(this->universalCorpusFileName);
+ delete(this->universalVoc);
+}
+
+
+/**
+* Update the universal vocabulary with words in a new corpus
+* Output the updated universal vocabulary
+* Output the vocabulary needed for the new corpus too
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName)
+{
+
+ ifstream textStream;
+ textStream.open(newCorpusFileName);
+
+ if(textStream==NULL){
+ fprintf(stderr,"Corpus file %s does not exist. Exit!\n",newCorpusFileName);
+ exit(-1);
+ }
+
+
+ //add reserved words from universal voc
+ for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){
+ C_String reservedWordText = this->universalVoc->getText(vocId);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId));
+ }
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+ map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus;
+
+
+ getline(textStream, aLine);
+ while(!textStream.eof()){
+
+ if(aLine.length()>0){
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ C_String thisWord(thisToken);
+
+ //check if this word has already been seen
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord);
+
+ if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){
+ //new type
+ IndexType vocId = this->universalVoc->getId(thisWord);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId));
+ }
+
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ }
+
+ getline(textStream, aLine);
+ }
+
+
+ //now output the updated universal vocabulary
+ this->universalVoc->outputToFile(this->universalCorpusFileName);
+
+ //output the vocabulary needed for the new corpus
+ char vocabularyForNewCorpusFileName[1024];
+ sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName);
+
+ ofstream outputVocFile;
+ outputVocFile.open(vocabularyForNewCorpusFileName);
+
+ if(!outputVocFile){
+ cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n";
+ exit(-1);
+ }
+
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin();
+ while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){
+ outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl;
+ iterWordsUsedInTheNewCorpus++;
+ }
+
+ outputVocFile.close();
+}