mgizapp/src/ttableDiff.hpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

/* -*- Mode: C; indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4 -*- */
/*
 * newgiza
 * Copyright (C) Qin Gao 2007 <qing@cs.cmu.edu>
 * 
 * newgiza is free software.
 * 
 * You may redistribute it and/or modify it under the terms of the
 * GNU General Public License, as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option)
 * any later version.
 * 
 * newgiza is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with newgiza.  If not, write to:
 * 	The Free Software Foundation, Inc.,
 * 	51 Franklin Street, Fifth Floor
 * 	Boston, MA  02110-1301, USA.
 */


#ifndef _TTABLEDIFF_HPP_
#define _TTABLEDIFF_HPP_
#include "TTables.h"
#include <sstream>
#include <string>
#include "types.h"

using namespace std;
/*!
This class is meant to create a difference file in order to make
GIZA paralell.
*/
template <class COUNT,class PROB>
class CTTableDiff{
private:
		INT32 noEnglishWords;  // total number of unique source words
		INT32 noFrenchWords;   // total number of unique target words
		/*!
		Store only the counting*/	
#ifdef WIN32
		typedef hash_map<wordPairIds, COUNT, hashpair> wordpair_hash;
#else
		typedef hash_map<wordPairIds, COUNT, hashpair, equal_to<wordPairIds> > wordpair_hash;
#endif
		wordpair_hash ef;

public:
		INT32 SaveToFile(const char* filename){
			ofstream ofs(filename);
			if(!ofs.is_open()){
				return -1;
			}else{
				wordpair_hash::iterator it;
				for( it = ef.begin() ; it != ef.end(); it++){
					ofs << it->first.first << " " << it->first.second << " "
						<< it->second << std::endl;
				}
			}
			return SUCCESS;
		}
	
		INT32 LoadFromFile(const char* filename){
			ef.clear();
			ifstream ifs(filename);
			if(!ifs.is_open()){
				return -1;
			}
			string sline;
			while(!ifs.eof()){
				sline = "";
				std::getline(ifs,sline);
				if(sline.length()){
					//cout << sline << endl;
					stringstream ss(sline.c_str());
					WordIndex we=-1,wf=-1;
					COUNT ct=-1 ;
					ss >> we >> wf >> ct;
					if(we==-1||wf==-1||ct==-1)
						continue;
					ef[wordPairIds(we,wf)] = ct;
				}
			}
			return SUCCESS;
		}
	
		COUNT * GetPtr(WordIndex e, WordIndex f){
			// look up this pair and return its position
			wordpair_hash::iterator i = ef.find(wordPairIds(e, f)); 
			if(i != ef.end())  // if it exists, return a pointer to it.
				return(&((*i).second));
			else return(0) ; // else return NULL pointer
		}
	
		void incCount(WordIndex e, WordIndex f, COUNT inc) 
		// increments the count of the given word pair. if the pair does not exist, 
		// it creates it with the given value.
		{
			if( inc )
				ef[wordPairIds(e, f)] += inc ;
		}
	
		INT32 AugmentTTable(tmodel<COUNT,PROB>& ttable){
			wordpair_hash::iterator it;
			for( it = ef.begin() ; it != ef.end(); it++){
				ttable.incCount(it->first.first,it->first.second,it->second);
			}
			return SUCCESS;
		}
	
protected:

};

#endif // _TTABLEDIFF_HPP_