Welcome to mirror list, hosted at ThFree Co, Russian Federation.

NgramMatchingFreqAndNonCompositionality4Sent.cpp~ « Applications « SuffixArraySearch « SuffixArrayApplications « Src - github.com/moses-smt/salm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 294724e02fd9b23dd22ffd6afe02802251a34a04 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#include "stdio.h"
#include "stdlib.h"
#include "float.h"
#include "_SuffixArraySearchApplicationBase.h"
#include <cstring>
#include <iostream>
#include <string>
#include <vector>


using namespace std;

///Debug switch, initialised to 0. Nothing in this file reads it;
///presumably it is picked up via `extern` by the suffix-array code linked into this program -- TODO confirm.
int SHOW_DEBUG_INFO = 0;

///Inverse of the flat-table addressing: split a one-dimensional table index into
///the n-gram length and the position where that n-gram starts in the source sentence.
///The table is laid out n-major, i.e. row (n-1) holds sentLen consecutive cells.
///@param index         flat index into the table
///@param sentLen       number of words in the sentence (width of one row); must be non-zero
///@param posInSrcSent  [out] starting position inside the sentence, counted from 0
///@param n             [out] n-gram length, counted from 1
void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
{
    const unsigned int row = index / sentLen;           // zero-based row == n-1
    const unsigned int column = index - row * sentLen;  // remainder of the division

    n = row + 1;
    posInSrcSent = column;
}

///Map an n-gram, identified by its starting position and its length, to the
///index of its cell in the flat (n-major) table.
///@param posInSent  starting position inside the sentence, counted from 0
///@param n          actual n-gram length, counted from 1
///@param sentLen    number of words in the sentence (width of one row)
///@return           (n-1)*sentLen + posInSent
unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
{
    const unsigned int rowOffset = sentLen * (n - 1);   // first cell of the row for length n

    return rowOffset + posInSent;
}

/**
* Given a corpus indexed by its suffix array,
* calculate the non-compositionalities of the n-grams embedded in each testing sentence.
*
* Usage: <program> corpusFileNameStem < testing sentences
*
* For every non-empty line read from standard input the program
*   1. calls SA.displayNgramMatchingFreq4Sent() on the sentence;
*   2. builds the n-gram search table of the sentence and derives a frequency
*      table from it (one cell per <n, starting position> pair, n-major layout);
*   3. for every n-gram that was found, tries every split into a left part of
*      leftN words and a right part of (n-leftN) words and computes
*          nc = freq(n-gram) * bigN / (freq(left) * freq(right))
*      The smallest nc and the leftN producing it are printed as "nc[leftN]".
*      Unigrams are printed as "A", n-grams without a usable score as "_".
*
* Returns 0 once all input has been processed; exits with a failure status
* when the corpus argument is missing.
*
* Revision $Rev: 3665 $
* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
**/
int main(int argc, char* argv[]){
	//-----------------------------------------------------------------------------
	//check parameter

	if(argc<2){

		fprintf(stderr,"\nUsage:\n");
		fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);

		//a missing argument is an error: report failure to the caller
		exit(EXIT_FAILURE);
	}

	//-----------------------------------------------------------------------------

	C_SuffixArraySearchApplicationBase SA;

	const double bigN = 1000000;	//scaling constant applied to every score

	fprintf(stderr,"Loading data...\n");
	SA.loadData_forSearch(argv[1], false, true);

	fprintf(stderr,"Input Sentences:\n");

	std::string line;				//grows as needed, so no input line can overflow it
	std::vector<char> sentence;		//writable, NUL-terminated copy handed to the SA calls
	std::vector<double> freqTable;	//reused for every sentence

	//the loop ends on end-of-file as well as on any read error
	while(std::getline(std::cin, line)){

		sentence.assign(line.begin(), line.end());
		sentence.push_back('\0');

		if(sentence[0]=='\0'){
			continue;	//skip empty lines
		}

		char * sentText = &sentence[0];

		SA.displayNgramMatchingFreq4Sent(sentText);

		printf("\n");

		int sentLen = 0;

		S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(sentText, sentLen);

		//treat a missing table or a non-positive length as an empty table
		const unsigned int len = (matchingTable!=NULL && sentLen>0) ? static_cast<unsigned int>(sentLen) : 0;
		const unsigned int tableSize = len*len;

		//convert this to frequency table; cells of n-grams that were not found stay 0
		freqTable.assign(tableSize, 0.0);

		//cells are visited in increasing n, so by the time an n-gram is scored
		//the frequencies of all its shorter parts are already in freqTable
		for(unsigned int i=0;i<tableSize;i++){
			unsigned int startPos, n;
			local_oneDimensionTableIndexToTwoDimension(i, len, startPos, n);

			double minNc = 0;					//0 means "nothing to print"
			unsigned int leftNWithMinNc = 0;	//0 means "no usable split seen"

			if(matchingTable[i].found){
				const double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1;
				freqTable[i]=freq;

				//consider all splitting methods
				double bestNc = DBL_MAX;

				for(unsigned int leftN=1;leftN<n;leftN++){
					const unsigned int index_left = local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, len);
					const unsigned int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, len);

					const double leftFreq = freqTable[index_left];
					const double rightFreq = freqTable[index_right];

					//a part with zero frequency would make the ratio inf/NaN; ignore that split
					if(leftFreq<=0 || rightFreq<=0){
						continue;
					}

					const double nc = freq*bigN/(leftFreq*rightFreq);

					if(nc<bestNc){
						bestNc = nc;
						leftNWithMinNc = leftN;
					}
				}

				if(leftNWithMinNc>0){
					minNc = bestNc;
				}
			}

			//first cell of a row: start a new output line labelled with n
			if(startPos==0){
				printf("\n%u\t",n);
			}

			if(n==1){
				printf("A\t");	//atom word, no way to break it
			}
			else if(minNc>0){
				printf("%.1f[%u]\t", minNc, leftNWithMinNc);
			}
			else{
				printf("_\t");
			}
		}

		printf("\n");

		//released with free() exactly as before; presumably the library allocates
		//the table with malloc() -- TODO confirm against its implementation
		free(matchingTable);
	}

	return 0;
}