/* ######################################################################################## Transliteration Mining - A Program to Extract Transliteration Pairs from a bilingual word list Source Contributor: Nadir Durrani ######################################################################################## */ #include #include #include #include #include #include #include #include using namespace std; double initTransitionProb; double LAMBDA; double addLogProbs(double A , double B) // this function adds probabilities ... { if (A == B) return (A + log10(2.0)); if (A > B) { if (A - B > 6) // A is a lot bigger ... return A; else return (A + log10(1+pow(10,(B-A)))); } else { // B > A if (B - A > 6) return B; else return (B + log10(1+pow(10,(A-B)))); } } class NodeStructure { public: NodeStructure() {}; NodeStructure(vector & s , vector & t); double getPosterior() { return PPR; } void computeFwdBckProbs(map & gammas, map & alignmentCounts); void computeNonTransliterationProb (map & sourceUnigrams , map & targetUnigrams); void print(); vector source; vector target; ~NodeStructure() {}; private: double NTR; // Non-transliteration probability of a sentence pair ... double PPR; // Posterior Probability ... double ALPHA; double BETA; void computeGammaForEdges(map < pair , double > & parents, map < pair , double > & children , map & transitionProbs , map & alignmentCounts); double computeFwdProbs(pair & ST, map & gammas, map < pair , double > & parents); double FwdProb (pair & TS, map & gammas, map < pair , double > & parents); double BckProb (pair & TS, map & gammas, map < pair , double > & chidren); double computeBckProbs(pair & ST, map & gammas, map < pair , double > & children); void getIncomingEdges (pair & ST , vector < pair < int , int> > & incomingEdges); void getOutgoingEdges (pair & ST , vector < pair < int , int> > & outgoingEdges); double getTransitionProb(map & transitionProbs , pair & edge); void updateAlignmentCount(map & transitionProbs, map & alignmentCounts , pair & edge , double alpha , double beta); void computePosteriorProb(); double scaleGamma(double g); void getEdge (pair & v1 , pair & v2 , pair & v3); }; void NodeStructure :: print() { for (int i = 0; i < source.size(); i++) cout< & s , vector & t) { source = s; target = t; } void NodeStructure :: getEdge (pair & v1 , pair & v2 , pair & v3) { if (v2.first - v1.first == 0) v3.first = -1; else v3.first = v2.first; if (v2.second - v1.second == 0) v3.second = -1; else v3.second = v2.second; } void NodeStructure :: computeGammaForEdges(map < pair , double > & parents, map < pair , double > & children , map & transitionProbs , map & alignmentCounts) { vector < pair < int , int> > incomingEdges; map < pair , double > :: iterator cIter; map < pair , double > :: iterator pIter; pair ST = make_pair (-1,-1); pair edge; children.erase(ST); double tProb; double alpha; double beta; for (cIter = children.begin(); cIter != children.end(); cIter++) { ST = cIter->first; getIncomingEdges (ST , incomingEdges); beta = cIter->second; for (int i = 0; i< incomingEdges.size(); i++) { pIter = parents.find(incomingEdges[i]); alpha = pIter->second; getEdge (incomingEdges[i] , ST , edge); updateAlignmentCount(transitionProbs, alignmentCounts , edge , alpha , beta); } } } void NodeStructure :: computeNonTransliterationProb (map & sourceUnigrams , map & targetUnigrams) { NTR = 0.0; for (int i = 0; i < source.size(); i++) { NTR += sourceUnigrams[source[i]]; } for (int i = 0; i < target.size(); i++) { NTR += targetUnigrams[target[i]]; } } double NodeStructure :: scaleGamma(double g) { double translit = log10 (1 - pow (10, PPR)); return g + translit; } void NodeStructure :: computePosteriorProb() { double LAMBDA2 = log10(1 - pow(10, LAMBDA)); double transliterate = LAMBDA2 + ALPHA; // Transliteration Prob ... double translate = LAMBDA + NTR; // Translation Prob ... double trans = transliterate - translate; //cout< & gammas , map & alignmentCounts) { pair START = make_pair (source.size()-1 , target.size()-1); pair END = make_pair (-1 , -1); map < pair , double > parents; parents[make_pair(-1,-1)] = 0.0; map < pair , double > children; children[make_pair(source.size()-1,target.size()-1)] = 0.0; ALPHA = computeFwdProbs(START , gammas, parents); BETA = computeBckProbs(END , gammas, children); computePosteriorProb(); //cout<<"Alpha "< & ST , vector < pair < int , int> > & incomingEdges) { incomingEdges.clear(); if (ST.first == -1) { // Source is NULL .. incomingEdges.push_back(make_pair(ST.first , ST.second-1)); } else if (ST.second == -1) { // Target is NULL ... incomingEdges.push_back(make_pair(ST.first-1 , ST.second)); } else { incomingEdges.push_back(make_pair(ST.first , ST.second-1)); incomingEdges.push_back(make_pair(ST.first-1 , ST.second)); incomingEdges.push_back(make_pair(ST.first-1 , ST.second-1)); } } void NodeStructure :: getOutgoingEdges (pair & ST , vector < pair < int , int> > & outgoingEdges) { if (ST.first == source.size()-1) { // Source is END .. outgoingEdges.push_back(make_pair(ST.first , ST.second+1)); } else if (ST.second == target.size()-1) { // Target is END ... outgoingEdges.push_back(make_pair(ST.first+1 , ST.second)); } else { outgoingEdges.push_back(make_pair(ST.first , ST.second+1)); outgoingEdges.push_back(make_pair(ST.first+1 , ST.second)); outgoingEdges.push_back(make_pair(ST.first+1 , ST.second+1)); } } void NodeStructure :: updateAlignmentCount(map & transitionProbs, map & alignmentCounts , pair & edge , double alpha , double beta) { double tProb; double tgamma; double gamma; map :: iterator aCounts; string query; if (edge.first == -1) query = "NULL"; else query = source[edge.first]; query += "-"; if (edge.second == -1) query += "NULL"; else query += target[edge.second]; //cout<<" Query "<second; aCounts->second = addLogProbs(temp , gamma); } } double NodeStructure :: getTransitionProb(map & transitionProbs , pair & edge) { if (transitionProbs.size() == 0) return initTransitionProb; string query; if (edge.first == -1) query = "NULL"; else query = source[edge.first]; query += "-"; if (edge.second == -1) query += "NULL"; else query += target[edge.second]; //cout<<" Query "< & TS, map & gammas, map < pair , double > & parents) { double thisAlpha; double alpha = -2000; vector < pair < int , int> > incomingEdges; pair edge; getIncomingEdges (TS , incomingEdges); for (int k = 0; k < incomingEdges.size(); k++) { thisAlpha = parents[incomingEdges[k]]; getEdge (incomingEdges[k], TS , edge); thisAlpha += getTransitionProb(gammas , edge); // Get Transition Prob ... double temp = alpha; alpha = addLogProbs(temp , thisAlpha); // Sum of all parents * transition prob .. // cout< & ST, map & gammas, map < pair , double > & parents) { pair TS; double alpha; for (int i = 0; i < source.size(); i++) { TS = make_pair (i , -1); alpha = FwdProb (TS, gammas, parents); parents[TS] = alpha; } for (int i = 0; i < target.size(); i++) { TS = make_pair (-1 , i); alpha = FwdProb (TS, gammas, parents); parents[TS] = alpha; } for (int i = 0; i < source.size(); i++) { for (int j = 0; j < target.size(); j++) { TS = make_pair (i , j); alpha = FwdProb (TS, gammas, parents); parents[TS] = alpha; } } return parents[ST]; } double NodeStructure :: BckProb (pair & TS, map & gammas, map < pair , double > & children) { double thisBeta; double beta = -2000; vector < pair < int , int> > outgoingEdges; pair edge; getOutgoingEdges (TS , outgoingEdges); for (int k = 0; k < outgoingEdges.size(); k++) { thisBeta = children[outgoingEdges[k]]; getEdge (TS , outgoingEdges[k], edge); thisBeta += getTransitionProb(gammas , edge); // Get Transition Prob ... double temp = beta; beta = addLogProbs(temp , thisBeta); // Sum of all parents * transition prob .. // cout< & ST, map & gammas, map < pair , double > & children) { pair TS; double beta; for (int i = source.size()-2; i >= -1; i--) { TS = make_pair (i , target.size()-1); beta = BckProb (TS, gammas, children); children[TS] = beta; } for (int i = target.size()-2; i >=-1; i--) { TS = make_pair (source.size()-1 , i); beta = BckProb (TS, gammas, children); children[TS] = beta; } for (int i = source.size()-2 ; i >= -1 ; i--) { for (int j = target.size()-2 ; j >= -1; j--) { TS = make_pair (i , j); beta = BckProb (TS, gammas, children); children[TS] = beta; } } return children[ST]; } void loadInput(const char * fileName, vector & input) { /* This function loads a file into a vector of strings */ ifstream sr (fileName); string line; if(sr.is_open()) { while(getline(sr , line )) { input.push_back(line); } sr.close(); } else { cout<<"Unable to read "< & alignmentCounts) { map :: iterator aCounts; for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { cout<first<<" "<second< & currInput) { /* This function splits a string into vector of strings using space character as a delimiter */ istringstream iss(s); currInput.clear(); do { string sub; iss >> sub; currInput.push_back(sub); } while (iss); currInput.pop_back(); } double getInitTransitionProb(int sourceToken, int targetToken) { double prod = sourceToken * targetToken; return log10(1/prod); } void runIteration(map & graph , map & gammas , int size) { map alignmentCounts; map :: iterator i; map :: iterator aCounts; double sum = -2000.0; double tPPR = -2000.0; for (i = graph.begin(); i != graph.end(); i++) { i->second.computeFwdBckProbs(gammas , alignmentCounts); double temp = tPPR; tPPR = addLogProbs(graph[i->first].getPosterior() , temp); } for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { double temp = sum; sum = addLogProbs(aCounts->second, temp); } for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { // Normalizing ... aCounts->second = aCounts->second - sum; } gammas.clear(); gammas = alignmentCounts; LAMBDA = tPPR - log10(size); } void setNTRProbabilities(map & graph , map & sourceTypes , map & targetTypes, double sourceTokens, double targetTokens) { map :: iterator i; map :: iterator j; for (i = sourceTypes.begin(); i!= sourceTypes.end(); i++) { i->second = log10(i->second/sourceTokens); } for (i = targetTypes.begin(); i!= targetTypes.end(); i++) { i->second = log10(i->second/targetTokens); } for (j = graph.begin(); j != graph.end(); j++) { j->second.computeNonTransliterationProb(sourceTypes , targetTypes); } } void printPosterior(map & graph) { map :: iterator i; for (i = graph.begin(); i != graph.end(); i++) graph[i->first].print(); } int main(int argc, char * argv[]) { vector input; vector source; vector target; map sourceTypes; map targetTypes; set < vector > tgt; set < vector > src; double sourceTokens = 0; double targetTokens = 0; map graph; map gammas; loadInput(argv[1],input); cerr<<"Constructing Graph "<