Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'misc/TransliterationMining.cpp')
-rw-r--r--misc/TransliterationMining.cpp927
1 files changed, 442 insertions, 485 deletions
diff --git a/misc/TransliterationMining.cpp b/misc/TransliterationMining.cpp
index ec272b93a..8c2291864 100644
--- a/misc/TransliterationMining.cpp
+++ b/misc/TransliterationMining.cpp
@@ -1,12 +1,12 @@
- /*
+/*
########################################################################################
- Transliteration Mining - A Program to Extract Transliteration Pairs from
- a bilingual word list
- Source Contributor: Nadir Durrani
+Transliteration Mining - A Program to Extract Transliteration Pairs from
+a bilingual word list
+Source Contributor: Nadir Durrani
########################################################################################
-
+
*/
#include <cstdlib>
@@ -25,419 +25,392 @@ using namespace std;
double initTransitionProb;
double LAMBDA;
-double addLogProbs(double A , double B) // this function adds probabilities ...
+double addLogProbs(double A , double B) // this function adds probabilities ...
{
-
- if (A == B)
- return (A + log10(2.0));
-
- if (A > B)
- {
- if (A - B > 6) // A is a lot bigger ...
- return A;
- else
- return (A + log10(1+pow(10,(B-A))));
- }
-
- else // B > A
- {
- if (B - A > 6)
- return B;
- else
- return (B + log10(1+pow(10,(A-B))));
- }
-
+
+ if (A == B)
+ return (A + log10(2.0));
+
+ if (A > B) {
+ if (A - B > 6) // A is a lot bigger ...
+ return A;
+ else
+ return (A + log10(1+pow(10,(B-A))));
+ }
+
+ else { // B > A
+ if (B - A > 6)
+ return B;
+ else
+ return (B + log10(1+pow(10,(A-B))));
+ }
+
}
class NodeStructure
{
- public:
-
- NodeStructure(){};
- NodeStructure(vector <string> & s , vector <string> & t);
- double getPosterior(){return PPR;}
- void computeFwdBckProbs(map <string , double> & gammas, map <string, double> & alignmentCounts);
- void computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams);
- void print();
-
- vector <string> source;
- vector <string> target;
- ~NodeStructure(){};
-
- private:
-
- double NTR; // Non-transliteration probability of a sentence pair ...
- double PPR; // Posterior Probability ...
- double ALPHA;
- double BETA;
-
- void computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts);
- double computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents);
- double FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents);
- double BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & chidren);
- double computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children);
- void getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges);
- void getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges);
- double getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge);
- void updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta);
- void computePosteriorProb();
- double scaleGamma(double g);
- void getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3);
-
+public:
+
+ NodeStructure() {};
+ NodeStructure(vector <string> & s , vector <string> & t);
+ double getPosterior() {
+ return PPR;
+ }
+ void computeFwdBckProbs(map <string , double> & gammas, map <string, double> & alignmentCounts);
+ void computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams);
+ void print();
+
+ vector <string> source;
+ vector <string> target;
+ ~NodeStructure() {};
+
+private:
+
+ double NTR; // Non-transliteration probability of a sentence pair ...
+ double PPR; // Posterior Probability ...
+ double ALPHA;
+ double BETA;
+
+ void computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts);
+ double computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents);
+ double FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents);
+ double BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & chidren);
+ double computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children);
+ void getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges);
+ void getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges);
+ double getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge);
+ void updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta);
+ void computePosteriorProb();
+ double scaleGamma(double g);
+ void getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3);
+
};
void NodeStructure :: print()
{
-
- for (int i = 0; i < source.size(); i++)
- cout<<source[i];
-
- cout<<"\t";
- for (int i = 0; i < target.size(); i++)
- cout<<target[i];
+ for (int i = 0; i < source.size(); i++)
+ cout<<source[i];
+
+ cout<<"\t";
+
+ for (int i = 0; i < target.size(); i++)
+ cout<<target[i];
- cout<<"\t"<<pow(10,PPR)<<endl;
+ cout<<"\t"<<pow(10,PPR)<<endl;
}
NodeStructure :: NodeStructure(vector <string> & s , vector <string> & t)
{
- source = s;
- target = t;
+ source = s;
+ target = t;
}
void NodeStructure :: getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3)
{
- if (v2.first - v1.first == 0)
- v3.first = -1;
- else
- v3.first = v2.first;
-
- if (v2.second - v1.second == 0)
- v3.second = -1;
- else
- v3.second = v2.second;
+ if (v2.first - v1.first == 0)
+ v3.first = -1;
+ else
+ v3.first = v2.first;
+
+ if (v2.second - v1.second == 0)
+ v3.second = -1;
+ else
+ v3.second = v2.second;
}
void NodeStructure :: computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts)
{
- vector < pair < int , int> > incomingEdges;
- map < pair <int , int> , double > :: iterator cIter;
- map < pair <int , int> , double > :: iterator pIter;
- pair <int , int> ST = make_pair (-1,-1);
- pair <int , int> edge;
-
- children.erase(ST);
- double tProb;
- double alpha;
- double beta;
-
- for (cIter = children.begin(); cIter != children.end(); cIter++)
- {
- ST = cIter->first;
-
- getIncomingEdges (ST , incomingEdges);
- beta = cIter->second;
-
- for (int i = 0; i< incomingEdges.size(); i++)
- {
- pIter = parents.find(incomingEdges[i]);
-
- alpha = pIter->second;
- getEdge (incomingEdges[i] , ST , edge);
-
- updateAlignmentCount(transitionProbs, alignmentCounts , edge , alpha , beta);
- }
- }
+ vector < pair < int , int> > incomingEdges;
+ map < pair <int , int> , double > :: iterator cIter;
+ map < pair <int , int> , double > :: iterator pIter;
+ pair <int , int> ST = make_pair (-1,-1);
+ pair <int , int> edge;
+
+ children.erase(ST);
+ double tProb;
+ double alpha;
+ double beta;
+
+ for (cIter = children.begin(); cIter != children.end(); cIter++) {
+ ST = cIter->first;
+
+ getIncomingEdges (ST , incomingEdges);
+ beta = cIter->second;
+
+ for (int i = 0; i< incomingEdges.size(); i++) {
+ pIter = parents.find(incomingEdges[i]);
+
+ alpha = pIter->second;
+ getEdge (incomingEdges[i] , ST , edge);
+
+ updateAlignmentCount(transitionProbs, alignmentCounts , edge , alpha , beta);
+ }
+ }
}
-void NodeStructure :: computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams)
+void NodeStructure :: computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams)
{
-
- NTR = 0.0;
-
- for (int i = 0; i < source.size(); i++)
- {
- NTR += sourceUnigrams[source[i]];
- }
-
- for (int i = 0; i < target.size(); i++)
- {
-
- NTR += targetUnigrams[target[i]];
- }
+
+ NTR = 0.0;
+
+ for (int i = 0; i < source.size(); i++) {
+ NTR += sourceUnigrams[source[i]];
+ }
+
+ for (int i = 0; i < target.size(); i++) {
+
+ NTR += targetUnigrams[target[i]];
+ }
}
double NodeStructure :: scaleGamma(double g)
{
- double translit = log10 (1 - pow (10, PPR));
- return g + translit;
+ double translit = log10 (1 - pow (10, PPR));
+ return g + translit;
}
void NodeStructure :: computePosteriorProb()
{
- double LAMBDA2 = log10(1 - pow(10, LAMBDA));
- double transliterate = LAMBDA2 + ALPHA; // Transliteration Prob ...
- double translate = LAMBDA + NTR; // Translation Prob ...
- double trans = transliterate - translate;
- //cout<<LAMBDA<<" "<<LAMBDA2<<endl;
- //cout<<transliterate<<" "<<translate<<" "<<trans<<endl;
- //cout<<pow(10 , trans)<<endl;
- double prob = 1/(1+ pow(10 , trans));
- PPR = log10(prob);
-
- //cout<<"Posterior Prob "<<PPR<<endl;
+ double LAMBDA2 = log10(1 - pow(10, LAMBDA));
+ double transliterate = LAMBDA2 + ALPHA; // Transliteration Prob ...
+ double translate = LAMBDA + NTR; // Translation Prob ...
+ double trans = transliterate - translate;
+ //cout<<LAMBDA<<" "<<LAMBDA2<<endl;
+ //cout<<transliterate<<" "<<translate<<" "<<trans<<endl;
+ //cout<<pow(10 , trans)<<endl;
+ double prob = 1/(1+ pow(10 , trans));
+ PPR = log10(prob);
+
+ //cout<<"Posterior Prob "<<PPR<<endl;
}
-
+
void NodeStructure :: computeFwdBckProbs(map <string , double> & gammas , map <string, double> & alignmentCounts)
{
- pair <int , int> START = make_pair (source.size()-1 , target.size()-1);
- pair <int , int> END = make_pair (-1 , -1);
+ pair <int , int> START = make_pair (source.size()-1 , target.size()-1);
+ pair <int , int> END = make_pair (-1 , -1);
- map < pair <int , int> , double > parents;
- parents[make_pair(-1,-1)] = 0.0;
- map < pair <int , int> , double > children;
- children[make_pair(source.size()-1,target.size()-1)] = 0.0;
+ map < pair <int , int> , double > parents;
+ parents[make_pair(-1,-1)] = 0.0;
+ map < pair <int , int> , double > children;
+ children[make_pair(source.size()-1,target.size()-1)] = 0.0;
- ALPHA = computeFwdProbs(START , gammas, parents);
- BETA = computeBckProbs(END , gammas, children);
-
- computePosteriorProb();
- //cout<<"Alpha "<<ALPHA<<" Beta "<<BETA<<endl;
- computeGammaForEdges(parents , children , gammas , alignmentCounts);
+ ALPHA = computeFwdProbs(START , gammas, parents);
+ BETA = computeBckProbs(END , gammas, children);
+
+ computePosteriorProb();
+ //cout<<"Alpha "<<ALPHA<<" Beta "<<BETA<<endl;
+ computeGammaForEdges(parents , children , gammas , alignmentCounts);
}
void NodeStructure :: getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges)
{
- incomingEdges.clear();
-
- if (ST.first == -1) // Source is NULL ..
- {
- incomingEdges.push_back(make_pair(ST.first , ST.second-1));
- }
- else if (ST.second == -1) // Target is NULL ...
- {
- incomingEdges.push_back(make_pair(ST.first-1 , ST.second));
- }
- else
- {
- incomingEdges.push_back(make_pair(ST.first , ST.second-1));
- incomingEdges.push_back(make_pair(ST.first-1 , ST.second));
- incomingEdges.push_back(make_pair(ST.first-1 , ST.second-1));
- }
+ incomingEdges.clear();
+
+ if (ST.first == -1) { // Source is NULL ..
+ incomingEdges.push_back(make_pair(ST.first , ST.second-1));
+ } else if (ST.second == -1) { // Target is NULL ...
+ incomingEdges.push_back(make_pair(ST.first-1 , ST.second));
+ } else {
+ incomingEdges.push_back(make_pair(ST.first , ST.second-1));
+ incomingEdges.push_back(make_pair(ST.first-1 , ST.second));
+ incomingEdges.push_back(make_pair(ST.first-1 , ST.second-1));
+ }
}
void NodeStructure :: getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges)
{
- if (ST.first == source.size()-1) // Source is END ..
- {
- outgoingEdges.push_back(make_pair(ST.first , ST.second+1));
- }
- else if (ST.second == target.size()-1) // Target is END ...
- {
- outgoingEdges.push_back(make_pair(ST.first+1 , ST.second));
- }
- else
- {
- outgoingEdges.push_back(make_pair(ST.first , ST.second+1));
- outgoingEdges.push_back(make_pair(ST.first+1 , ST.second));
- outgoingEdges.push_back(make_pair(ST.first+1 , ST.second+1));
- }
+ if (ST.first == source.size()-1) { // Source is END ..
+ outgoingEdges.push_back(make_pair(ST.first , ST.second+1));
+ } else if (ST.second == target.size()-1) { // Target is END ...
+ outgoingEdges.push_back(make_pair(ST.first+1 , ST.second));
+ } else {
+ outgoingEdges.push_back(make_pair(ST.first , ST.second+1));
+ outgoingEdges.push_back(make_pair(ST.first+1 , ST.second));
+ outgoingEdges.push_back(make_pair(ST.first+1 , ST.second+1));
+ }
}
void NodeStructure :: updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta)
{
- double tProb;
- double tgamma;
- double gamma;
- map <string , double> :: iterator aCounts;
- string query;
-
- if (edge.first == -1)
- query = "NULL";
- else
- query = source[edge.first];
-
- query += "-";
-
- if (edge.second == -1)
- query += "NULL";
- else
- query += target[edge.second];
-
- //cout<<" Query "<<query<<endl;
- if (transitionProbs.size() == 0)
- tProb = initTransitionProb;
- else
- tProb = transitionProbs[query];
-
-
- tgamma = alpha + tProb + beta - ALPHA;
- gamma = scaleGamma(tgamma);
- //cout<<alpha<<" "<<beta<<" "<<gamma<<endl;
- //cout<<tProb<<" "<<ALPHA<<endl;
-
- aCounts = alignmentCounts.find(query);
-
- if (aCounts == alignmentCounts.end())
- {
- alignmentCounts[query] = gamma;
- }
- else
- {
- double temp = aCounts->second;
- aCounts->second = addLogProbs(temp , gamma);
- }
-
+ double tProb;
+ double tgamma;
+ double gamma;
+ map <string , double> :: iterator aCounts;
+ string query;
+
+ if (edge.first == -1)
+ query = "NULL";
+ else
+ query = source[edge.first];
+
+ query += "-";
+
+ if (edge.second == -1)
+ query += "NULL";
+ else
+ query += target[edge.second];
+
+ //cout<<" Query "<<query<<endl;
+ if (transitionProbs.size() == 0)
+ tProb = initTransitionProb;
+ else
+ tProb = transitionProbs[query];
+
+
+ tgamma = alpha + tProb + beta - ALPHA;
+ gamma = scaleGamma(tgamma);
+ //cout<<alpha<<" "<<beta<<" "<<gamma<<endl;
+ //cout<<tProb<<" "<<ALPHA<<endl;
+
+ aCounts = alignmentCounts.find(query);
+
+ if (aCounts == alignmentCounts.end()) {
+ alignmentCounts[query] = gamma;
+ } else {
+ double temp = aCounts->second;
+ aCounts->second = addLogProbs(temp , gamma);
+ }
+
}
double NodeStructure :: getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge)
{
- if (transitionProbs.size() == 0)
- return initTransitionProb;
-
- string query;
-
- if (edge.first == -1)
- query = "NULL";
- else
- query = source[edge.first];
-
- query += "-";
-
- if (edge.second == -1)
- query += "NULL";
- else
- query += target[edge.second];
-
- //cout<<" Query "<<query<<endl;
- return transitionProbs[query];
+ if (transitionProbs.size() == 0)
+ return initTransitionProb;
+
+ string query;
+
+ if (edge.first == -1)
+ query = "NULL";
+ else
+ query = source[edge.first];
+
+ query += "-";
+
+ if (edge.second == -1)
+ query += "NULL";
+ else
+ query += target[edge.second];
+
+ //cout<<" Query "<<query<<endl;
+ return transitionProbs[query];
}
double NodeStructure :: FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents)
{
- double thisAlpha;
- double alpha = -2000;
- vector < pair < int , int> > incomingEdges;
- pair <int , int> edge;
-
-
- getIncomingEdges (TS , incomingEdges);
-
- for (int k = 0; k < incomingEdges.size(); k++)
- {
- thisAlpha = parents[incomingEdges[k]];
- getEdge (incomingEdges[k], TS , edge);
- thisAlpha += getTransitionProb(gammas , edge); // Get Transition Prob ...
- double temp = alpha;
- alpha = addLogProbs(temp , thisAlpha); // Sum of all parents * transition prob ..
- // cout<<temp<<"+"<<thisAlpha<<"="<<alpha<<endl;
- }
-
- return alpha;
+ double thisAlpha;
+ double alpha = -2000;
+ vector < pair < int , int> > incomingEdges;
+ pair <int , int> edge;
+
+
+ getIncomingEdges (TS , incomingEdges);
+
+ for (int k = 0; k < incomingEdges.size(); k++) {
+ thisAlpha = parents[incomingEdges[k]];
+ getEdge (incomingEdges[k], TS , edge);
+ thisAlpha += getTransitionProb(gammas , edge); // Get Transition Prob ...
+ double temp = alpha;
+ alpha = addLogProbs(temp , thisAlpha); // Sum of all parents * transition prob ..
+ // cout<<temp<<"+"<<thisAlpha<<"="<<alpha<<endl;
+ }
+
+ return alpha;
}
double NodeStructure :: computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents)
{
-
- pair <int , int> TS;
- double alpha;
-
- for (int i = 0; i < source.size(); i++)
- {
- TS = make_pair (i , -1);
- alpha = FwdProb (TS, gammas, parents);
- parents[TS] = alpha;
- }
-
- for (int i = 0; i < target.size(); i++)
- {
- TS = make_pair (-1 , i);
- alpha = FwdProb (TS, gammas, parents);
- parents[TS] = alpha;
- }
-
- for (int i = 0; i < source.size(); i++)
- {
- for (int j = 0; j < target.size(); j++)
- {
- TS = make_pair (i , j);
- alpha = FwdProb (TS, gammas, parents);
- parents[TS] = alpha;
- }
- }
-
- return parents[ST];
+
+ pair <int , int> TS;
+ double alpha;
+
+ for (int i = 0; i < source.size(); i++) {
+ TS = make_pair (i , -1);
+ alpha = FwdProb (TS, gammas, parents);
+ parents[TS] = alpha;
+ }
+
+ for (int i = 0; i < target.size(); i++) {
+ TS = make_pair (-1 , i);
+ alpha = FwdProb (TS, gammas, parents);
+ parents[TS] = alpha;
+ }
+
+ for (int i = 0; i < source.size(); i++) {
+ for (int j = 0; j < target.size(); j++) {
+ TS = make_pair (i , j);
+ alpha = FwdProb (TS, gammas, parents);
+ parents[TS] = alpha;
+ }
+ }
+
+ return parents[ST];
}
double NodeStructure :: BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & children)
{
- double thisBeta;
- double beta = -2000;
- vector < pair < int , int> > outgoingEdges;
- pair <int , int> edge;
-
- getOutgoingEdges (TS , outgoingEdges);
-
- for (int k = 0; k < outgoingEdges.size(); k++)
- {
- thisBeta = children[outgoingEdges[k]];
- getEdge (TS , outgoingEdges[k], edge);
- thisBeta += getTransitionProb(gammas , edge); // Get Transition Prob ...
- double temp = beta;
- beta = addLogProbs(temp , thisBeta); // Sum of all parents * transition prob ..
- // cout<<temp<<"+"<<thisAlpha<<"="<<alpha<<endl;
- }
-
- return beta;
+ double thisBeta;
+ double beta = -2000;
+ vector < pair < int , int> > outgoingEdges;
+ pair <int , int> edge;
+
+ getOutgoingEdges (TS , outgoingEdges);
+
+ for (int k = 0; k < outgoingEdges.size(); k++) {
+ thisBeta = children[outgoingEdges[k]];
+ getEdge (TS , outgoingEdges[k], edge);
+ thisBeta += getTransitionProb(gammas , edge); // Get Transition Prob ...
+ double temp = beta;
+ beta = addLogProbs(temp , thisBeta); // Sum of all parents * transition prob ..
+ // cout<<temp<<"+"<<thisAlpha<<"="<<alpha<<endl;
+ }
+
+ return beta;
}
double NodeStructure :: computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children)
{
-
- pair <int , int> TS;
- double beta;
-
- for (int i = source.size()-2; i >= -1; i--)
- {
- TS = make_pair (i , target.size()-1);
- beta = BckProb (TS, gammas, children);
- children[TS] = beta;
- }
-
- for (int i = target.size()-2; i >=-1; i--)
- {
- TS = make_pair (source.size()-1 , i);
- beta = BckProb (TS, gammas, children);
- children[TS] = beta;
- }
-
- for (int i = source.size()-2 ; i >= -1 ; i--)
- {
- for (int j = target.size()-2 ; j >= -1; j--)
- {
- TS = make_pair (i , j);
- beta = BckProb (TS, gammas, children);
- children[TS] = beta;
- }
- }
-
- return children[ST];
+
+ pair <int , int> TS;
+ double beta;
+
+ for (int i = source.size()-2; i >= -1; i--) {
+ TS = make_pair (i , target.size()-1);
+ beta = BckProb (TS, gammas, children);
+ children[TS] = beta;
+ }
+
+ for (int i = target.size()-2; i >=-1; i--) {
+ TS = make_pair (source.size()-1 , i);
+ beta = BckProb (TS, gammas, children);
+ children[TS] = beta;
+ }
+
+ for (int i = source.size()-2 ; i >= -1 ; i--) {
+ for (int j = target.size()-2 ; j >= -1; j--) {
+ TS = make_pair (i , j);
+ beta = BckProb (TS, gammas, children);
+ children[TS] = beta;
+ }
+ }
+
+ return children[ST];
}
@@ -445,204 +418,188 @@ double NodeStructure :: computeBckProbs(pair <int , int> & ST, map <string , dou
void loadInput(const char * fileName, vector <string> & input)
{
- /* This function loads a file into a vector of strings */
-
- ifstream sr (fileName);
- string line;
-
- if(sr.is_open())
- {
- while(getline(sr , line ))
- {
- input.push_back(line);
- }
-
- sr.close();
- }
- else
- {
- cout<<"Unable to read "<<fileName<<endl;
- exit(1);
- }
+ /* This function loads a file into a vector of strings */
+
+ ifstream sr (fileName);
+ string line;
+
+ if(sr.is_open()) {
+ while(getline(sr , line )) {
+ input.push_back(line);
+ }
+
+ sr.close();
+ } else {
+ cout<<"Unable to read "<<fileName<<endl;
+ exit(1);
+ }
}
void printGammas(map <string, double> & alignmentCounts)
{
- map <string , double> :: iterator aCounts;
+ map <string , double> :: iterator aCounts;
- for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++)
- {
- cout<<aCounts->first<<" "<<aCounts->second<<endl;
- }
+ for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) {
+ cout<<aCounts->first<<" "<<aCounts->second<<endl;
+ }
}
void getWords(string s, vector <string> & currInput)
{
- /* This function splits a string into vector of strings using space character as a delimiter */
+ /* This function splits a string into vector of strings using space character as a delimiter */
- istringstream iss(s);
- currInput.clear();
- do
- {
- string sub;
- iss >> sub;
- currInput.push_back(sub);
+ istringstream iss(s);
+ currInput.clear();
+ do {
+ string sub;
+ iss >> sub;
+ currInput.push_back(sub);
- } while (iss);
+ } while (iss);
- currInput.pop_back();
+ currInput.pop_back();
}
double getInitTransitionProb(int sourceToken, int targetToken)
{
- double prod = sourceToken * targetToken;
- return log10(1/prod);
+ double prod = sourceToken * targetToken;
+ return log10(1/prod);
}
void runIteration(map <int , NodeStructure> & graph , map <string , double> & gammas , int size)
{
- map <string, double> alignmentCounts;
- map <int , NodeStructure> :: iterator i;
- map <string , double> :: iterator aCounts;
- double sum = -2000.0;
- double tPPR = -2000.0;
-
- for (i = graph.begin(); i != graph.end(); i++)
- {
-
- i->second.computeFwdBckProbs(gammas , alignmentCounts);
- double temp = tPPR;
-
- tPPR = addLogProbs(graph[i->first].getPosterior() , temp);
-
- }
-
- for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++)
- {
- double temp = sum;
- sum = addLogProbs(aCounts->second, temp);
- }
-
-
- for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) // Normalizing ...
- {
- aCounts->second = aCounts->second - sum;
- }
-
- gammas.clear();
- gammas = alignmentCounts;
-
- LAMBDA = tPPR - log10(size);
+ map <string, double> alignmentCounts;
+ map <int , NodeStructure> :: iterator i;
+ map <string , double> :: iterator aCounts;
+ double sum = -2000.0;
+ double tPPR = -2000.0;
+
+ for (i = graph.begin(); i != graph.end(); i++) {
+
+ i->second.computeFwdBckProbs(gammas , alignmentCounts);
+ double temp = tPPR;
+
+ tPPR = addLogProbs(graph[i->first].getPosterior() , temp);
+
+ }
+
+ for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) {
+ double temp = sum;
+ sum = addLogProbs(aCounts->second, temp);
+ }
+
+
+ for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { // Normalizing ...
+ aCounts->second = aCounts->second - sum;
+ }
+
+ gammas.clear();
+ gammas = alignmentCounts;
+
+ LAMBDA = tPPR - log10(size);
}
void setNTRProbabilities(map <int , NodeStructure> & graph , map <string , double> & sourceTypes , map <string , double > & targetTypes, double sourceTokens, double targetTokens)
{
-
- map <string , double> :: iterator i;
- map <int , NodeStructure> :: iterator j;
-
- for (i = sourceTypes.begin(); i!= sourceTypes.end(); i++)
- {
- i->second = log10(i->second/sourceTokens);
- }
+ map <string , double> :: iterator i;
+ map <int , NodeStructure> :: iterator j;
+
+
+ for (i = sourceTypes.begin(); i!= sourceTypes.end(); i++) {
+ i->second = log10(i->second/sourceTokens);
+ }
- for (i = targetTypes.begin(); i!= targetTypes.end(); i++)
- {
- i->second = log10(i->second/targetTokens);
- }
+ for (i = targetTypes.begin(); i!= targetTypes.end(); i++) {
+ i->second = log10(i->second/targetTokens);
+ }
- for (j = graph.begin(); j != graph.end(); j++)
- {
- j->second.computeNonTransliterationProb(sourceTypes , targetTypes);
- }
+ for (j = graph.begin(); j != graph.end(); j++) {
+ j->second.computeNonTransliterationProb(sourceTypes , targetTypes);
+ }
}
void printPosterior(map <int , NodeStructure> & graph)
{
- map <int , NodeStructure> :: iterator i;
+ map <int , NodeStructure> :: iterator i;
- for (i = graph.begin(); i != graph.end(); i++)
- graph[i->first].print();
+ for (i = graph.begin(); i != graph.end(); i++)
+ graph[i->first].print();
}
int main(int argc, char * argv[])
{
- vector <string> input;
- vector <string> source;
- vector <string> target;
- map <string , double> sourceTypes;
- map <string , double> targetTypes;
- set < vector <string> > tgt;
- set < vector <string> > src;
- double sourceTokens = 0;
- double targetTokens = 0;
- map <int , NodeStructure> graph;
- map <string , double> gammas;
-
- loadInput(argv[1],input);
-
- cerr<<"Constructing Graph "<<endl;
-
- for(int i=0; i<input.size(); i+=2)
- {
-
- //cerr<<input[i]<<endl;
- //cerr<<input[i+1]<<endl;
-
-
- getWords(input[i],source);
- getWords(input[i+1],target);
-
- if (src.find(source) == src.end())
- {
- for (int j = 0; j< source.size(); j++)
- sourceTypes[source[j]]++;
- src.insert(source);
- sourceTokens += source.size();
- }
-
- if (tgt.find(target) == tgt.end())
- {
- for (int j = 0; j< target.size(); j++)
- targetTypes[target[j]]++;
-
- tgt.insert(target);
- targetTokens += target.size();
- }
-
- NodeStructure obj (source,target);
- graph[i] = obj;
-
- }
-
- setNTRProbabilities(graph, sourceTypes, targetTypes, sourceTokens, targetTokens);
- initTransitionProb = getInitTransitionProb(sourceTypes.size()+1, targetTypes.size()+1);
-
- LAMBDA = log10(0.5);
-
-
- for (int i = 0; i< 10; i++)
- {
-
- cerr<<"Computing Probs : iteration "<<i+1<<endl;
- runIteration(graph , gammas , input.size()/2);
-
- }
-
- printPosterior(graph);
- cerr<<"Finished..."<<endl;
-
- return 0;
+ vector <string> input;
+ vector <string> source;
+ vector <string> target;
+ map <string , double> sourceTypes;
+ map <string , double> targetTypes;
+ set < vector <string> > tgt;
+ set < vector <string> > src;
+ double sourceTokens = 0;
+ double targetTokens = 0;
+ map <int , NodeStructure> graph;
+ map <string , double> gammas;
+
+ loadInput(argv[1],input);
+
+ cerr<<"Constructing Graph "<<endl;
+
+ for(int i=0; i<input.size(); i+=2) {
+
+ //cerr<<input[i]<<endl;
+ //cerr<<input[i+1]<<endl;
+
+
+ getWords(input[i],source);
+ getWords(input[i+1],target);
+
+ if (src.find(source) == src.end()) {
+ for (int j = 0; j< source.size(); j++)
+ sourceTypes[source[j]]++;
+ src.insert(source);
+ sourceTokens += source.size();
+ }
+
+ if (tgt.find(target) == tgt.end()) {
+ for (int j = 0; j< target.size(); j++)
+ targetTypes[target[j]]++;
+
+ tgt.insert(target);
+ targetTokens += target.size();
+ }
+
+ NodeStructure obj (source,target);
+ graph[i] = obj;
+
+ }
+
+ setNTRProbabilities(graph, sourceTypes, targetTypes, sourceTokens, targetTokens);
+ initTransitionProb = getInitTransitionProb(sourceTypes.size()+1, targetTypes.size()+1);
+
+ LAMBDA = log10(0.5);
+
+
+ for (int i = 0; i< 10; i++) {
+
+ cerr<<"Computing Probs : iteration "<<i+1<<endl;
+ runIteration(graph , gammas , input.size()/2);
+
+ }
+
+ printPosterior(graph);
+ cerr<<"Finished..."<<endl;
+
+ return 0;
}