Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/src/mkcls/KategProblemWBC.cpp')
-rw-r--r--mgizapp/src/mkcls/KategProblemWBC.cpp344
1 files changed, 344 insertions, 0 deletions
diff --git a/mgizapp/src/mkcls/KategProblemWBC.cpp b/mgizapp/src/mkcls/KategProblemWBC.cpp
new file mode 100644
index 0000000..1a0d439
--- /dev/null
+++ b/mgizapp/src/mkcls/KategProblemWBC.cpp
@@ -0,0 +1,344 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include <stdlib.h>
+#include "KategProblem.h"
+
+static int oneFreqCompareSteigend(const void *p,const void *j)
+{
+#ifdef FREQTYPE_DOUBLE
+ if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) )
+ return -1;
+ if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) )
+ return +1;
+ else
+ return 0;
+#else
+ return ((OneFreq *)p)->n - ((OneFreq *)j)->n;
+#endif
+}
+static int oneFreqCompareFallend(const void *p,const void *j)
+{
+#ifdef FREQTYPE_DOUBLE
+ if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) )
+ return -1;
+ if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) )
+ return +1;
+ else
+ return 0;
+#else
+ return -((OneFreq *)p)->n + ((OneFreq *)j)->n;
+#endif
+}
+
+
+KategProblemWBC::KategProblemWBC(int n,int minw)
+: _n1(n,0),_n2(n,0),with_h_of_words(0),afterFilled(n,0),beforeFilled(n,0),filled(0),fixedWord(n,-1),absteigend(0),nWords(n),nTranspWords(0),
+ mindestAnzahl(minw),after(n),before(n),minIndex(n,-1),maxIndex(n,-1)
+
+{
+}
+
+KategProblemWBC::~KategProblemWBC()
+
+{
+ massert( after.size()==nWords);
+ if( absteigend )
+ delete absteigend;
+}
+
+void KategProblemWBC::init(int specialFixedWord)
+{
+
+ nTranspWords=0;
+ int i;
+ for(i=0;i<_n1.size();i++)
+ {
+ if( (_n1[i]<mindestAnzahl && _n2[i]<mindestAnzahl && minIndex[i]<=1) ||i==specialFixedWord )
+ {
+
+ if(!( fixedWord[i]==1 || fixedWord[i]== -1))
+ cerr << "mkcls:KategProblemWBC::init::ERROR: " << i << " " << fixedWord[i] << endl;
+ fixedWord[i]=1;
+ }
+ else if(fixedWord[i]<0)
+ nTranspWords++;
+ }
+ if( absteigend==0 )
+ absteigend= &(getSortedList(0));
+
+
+
+
+
+ if(verboseMode && nTranspWords!=_n1.size()-1 )
+ cout << "Es sind: " <<nTranspWords<<" transportierbar.\n";
+}
+
+void KategProblemWBC::set_h_of_words(double s)
+
+{
+ with_h_of_words=1;
+ h_of_words = -s;
+}
+
+double KategProblemWBC::get_h_of_words()
+
+{
+ if( with_h_of_words )
+ return -h_of_words;
+ else
+ {
+ h_of_words=0;
+ for(int i=0;i<nWords;i++)
+ h_of_words+=0.5*(kat_h(_n2[i])+kat_h(_n1[i]));
+ with_h_of_words=1;
+ return -h_of_words;
+ }
+}
+
+
+void KategProblemWBC::setAfterWords(int w,int anzahl)
+
+{
+ OneFreq o;
+ o.w=-1;
+ o.n=0;
+ afterFilled[w]=0;
+ after[w].init(anzahl,o,1);
+}
+void KategProblemWBC::setBeforeWords(int w,int anzahl)
+
+{
+ OneFreq o;
+ o.w=-1;
+ o.n=0;
+ beforeFilled[w]=0;
+ before[w].init(anzahl,o,1);
+}
+
+
+void KategProblemWBC::setFreq(int w1,int w2,FreqType anzahl)
+
+{
+ OneFreq o;
+ o.n=anzahl;
+
+ o.w=w2;
+ after[w1][afterFilled[w1]++]=o;
+ _n1[w1]+=anzahl;
+ o.w=w1;
+ before[w2][beforeFilled[w2]++]=o;
+ _n2[w2]+=anzahl;
+}
+
+void KategProblemWBC::addFreq(int w1,int w2,FreqType anzahl)
+
+{
+ OneFreq o;
+ o.n=anzahl;
+ int pos=-1,i;
+ for(i=0;i<afterFilled[w1];i++)
+ if(after[w1][i].w==w2)
+ pos=i;
+
+ if(pos==-1)
+ {
+ o.w=w2;
+ after[w1][afterFilled[w1]++]=o;
+ }
+ else
+ after[w1][pos].n+=anzahl;
+ _n1[w1]+=anzahl;
+
+ pos=-1;
+ for(i=0;i<beforeFilled[w2];i++)
+ if(before[w2][i].w==w1)
+ pos=i;
+ if(pos==-1)
+ {
+ o.w=w1;
+ before[w2][beforeFilled[w2]++]=o;
+ }
+ else
+ before[w2][pos].n+=anzahl;
+ _n2[w2]+=anzahl;
+}
+
+
+short KategProblemWBC::testFull(int doIt)
+
+{
+ int enaNom=0;
+ int afterFilledSum=0,beforeFilledSum=0;
+ int ret=1,i;
+ for(i=0;i<nWords;i++)
+ {
+ if( n1(i)==1 && n2(i)==1 )
+ enaNom++;
+ afterFilledSum+=afterFilled[i];
+ beforeFilledSum+=beforeFilled[i];
+ if(afterFilled[i]!=after[i].size())
+ {
+ ret=0;
+ if( doIt )
+ after[i].resize(afterFilled[i]);
+ }
+ if(beforeFilled[i]!=before[i].size())
+ {
+ ret=0;
+ if( doIt )
+ before[i].resize(beforeFilled[i]);
+ }
+
+ }
+ if( ret==0 && !doIt )
+ {
+ cerr << "Error: Unfilled word bigram statistics.\n";
+ exit(1);
+ }
+ else
+ filled=1;
+ if( verboseMode>1 )
+ {
+ cout << "MEAN(|L(w)|+|R(w)|)=" << (beforeFilledSum/(float)nWords)
+ +(afterFilledSum/(float)nWords) << endl;
+ cout << "Hapaslegomena: " << enaNom << endl;
+ }
+ int symmetrisch=1;
+ for(i=0;i<nWords;i++)
+ {
+ int j;
+ massert(before[i].size()==beforeFilled[i]);
+ massert( after[i].size()== afterFilled[i]);
+ FreqType sum=0;
+ for(j=0;j<after[i].size();j++)
+ sum+=after[i][j].n;
+ massert( sum==_n1[i] );
+ sum=0;
+ for(j=0;j<before[i].size();j++)
+ sum+=before[i][j].n;
+ massert(sum==_n2[i]);
+ if(_n1[i]!=_n2[i])
+ {
+ symmetrisch=0;
+ if( verboseMode>1 )
+ cout << "Asymmetrie: " << i << " " << _n1[i] << " " << _n2[i] << endl;
+ }
+
+ }
+ if(verboseMode && symmetrisch==0)
+ cout << "Warning: word bigram statistic is not symmetric "
+ "(this is possibly an error)\n";
+ return ret;
+}
+
+Array<Word> &KategProblemWBC::getSortedList(int steigend)
+
+{
+ int siz=_n2.size(),i;
+ massert(filled);
+ Array<Word> &sortedList =*new Array<Word>(siz);
+ Array<OneFreq> list(siz);
+ int pos=0;
+ for(i=0;i<siz;i++)
+ {
+ if( fixedWord[i]<0 )
+ {
+ list[pos].w=i;
+ list[pos].n=_n1[i];
+ pos++;
+ }
+ }
+ int anzFree=pos;
+ for(i=0;i<siz;i++)
+ {
+ if( fixedWord[i]>=0 )
+ {
+ list[pos].w=i;
+ list[pos].n=_n1[i];
+ pos++;
+ }
+ }
+ massert(pos==siz);
+ if(steigend )
+ qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareSteigend);
+ else
+ qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareFallend);
+ massert( anzFree<=list.size() );
+
+ for(i=0;i<siz;i++)
+ {
+ sortedList[i]=list[i].w;
+ massert(steigend || i==0 || i>=anzFree || list[i-1].n>=list[i].n );
+ massert((!steigend) || i==0 || i>=anzFree || list[i-1].n<=list[i].n );
+ }
+ return sortedList;
+}
+
+FreqType KategProblemWBC::numberOfWords()
+
+{
+ FreqType n1=0,n2=0;
+ for(int i=0;i<_n1.size();i++)
+ {
+ n1+=_n1[i];
+ n2+=_n2[i];
+ }
+ #ifndef FREQTYPE_DOUBLE
+ massert(n1==n2);
+ #endif
+ return n1;
+}
+
+void KategProblemWBC::setDollar(int n)
+
+{
+ if( fixedWord[n]<0 )
+ nTranspWords--;
+ fixedWord[n]=0;
+}
+
+void KategProblemWBC::initializeIndex(const leda_array<string>&words,char firstChar,int unten,int oben,bool noHapas)
+{
+ int n=0;
+ int i;
+ massert(-1<unten);massert(unten<oben);
+ if( verboseMode )
+ cout << "InitializeIndex: " << firstChar << " u:" << unten << " o:" << oben << " " << noHapas << endl;
+ over_array(words,i)
+ {
+ if( words[i][0]==firstChar && (noHapas || ((short)(n1(i)+0.0001))>=mindestAnzahl || ((short)(n2(i)+0.0001))>=mindestAnzahl) )
+ {
+ minIndex[i]=unten;
+ maxIndex[i]=oben;
+ n++;
+ }
+ }
+ if( verboseMode )
+ cout << "InitializeIndex gefunden fuer " << n << " Woerter.\n";
+}
+