diff options
author | Barry Haddow <barry.haddow@gmail.com> | 2012-10-14 17:18:03 +0400 |
---|---|---|
committer | Barry Haddow <barry.haddow@gmail.com> | 2012-10-14 17:18:03 +0400 |
commit | 61ae24aa5d1db2c117c1aa90ca65ef4fe41490fa (patch) | |
tree | 60d5814cfcabe7325ff7b0a8dfbe8c0e3d1e1a7f /contrib | |
parent | 732b299ba3612094af67f4e336011fe337089e8a (diff) | |
parent | 0d9095983d70a032fb596366e2445e338301b212 (diff) |
Merge remote-tracking branch 'origin/master' into miramerge
Conflicts:
moses/src/PhraseDictionary.cpp
moses/src/TargetPhrase.cpp
moses/src/TargetPhrase.h
Diffstat (limited to 'contrib')
24 files changed, 20 insertions, 3899 deletions
diff --git a/contrib/fuzzy-match/Makefile b/contrib/fuzzy-match/Makefile deleted file mode 100644 index 5bb884a51..000000000 --- a/contrib/fuzzy-match/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -all: suffix-test fuzzy-match fuzzy-match2 - -clean: - rm -f *.o - -.cpp.o: - g++ -O6 -g -c $< - -suffix-test: Vocabulary.o SuffixArray.o suffix-test.o - g++ Vocabulary.o SuffixArray.o suffix-test.o -o suffix-test - -fuzzy-match: Vocabulary.o SuffixArray.o old/fuzzy-match.o - g++ Vocabulary.o SuffixArray.o fuzzy-match.o -o fuzzy-match - -fuzzy-match2: Vocabulary.o SuffixArray.o fuzzy-match2.o Util.o - g++ Vocabulary.o SuffixArray.o fuzzy-match2.o Util.o -o fuzzy-match2 diff --git a/contrib/fuzzy-match/Match.h b/contrib/fuzzy-match/Match.h deleted file mode 100644 index 6fc8bb42f..000000000 --- a/contrib/fuzzy-match/Match.h +++ /dev/null @@ -1,29 +0,0 @@ -// -// Match.h -// fuzzy-match -// -// Created by Hieu Hoang on 25/07/2012. -// Copyright 2012 __MyCompanyName__. All rights reserved. -// - -#ifndef fuzzy_match_Match_h -#define fuzzy_match_Match_h - -/* data structure for n-gram match between input and corpus */ - -class Match { -public: - int input_start; - int input_end; - int tm_start; - int tm_end; - int min_cost; - int max_cost; - int internal_cost; - Match( int is, int ie, int ts, int te, int min, int max, int i ) - :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i) - {} -}; - - -#endif diff --git a/contrib/fuzzy-match/SentenceAlignment.h b/contrib/fuzzy-match/SentenceAlignment.h deleted file mode 100644 index 4d92fd635..000000000 --- a/contrib/fuzzy-match/SentenceAlignment.h +++ /dev/null @@ -1,48 +0,0 @@ -// -// SentenceAlignment.h -// fuzzy-match -// -// Created by Hieu Hoang on 25/07/2012. -// Copyright 2012 __MyCompanyName__. All rights reserved. -// - -#ifndef fuzzy_match_SentenceAlignment_h -#define fuzzy_match_SentenceAlignment_h - -#include <sstream> -#include "Vocabulary.h" - -extern Vocabulary vocabulary; - -struct SentenceAlignment -{ - int count; - vector< WORD_ID > target; - vector< pair<int,int> > alignment; - - SentenceAlignment() - {} - - string getTargetString() const - { - stringstream strme; - for (size_t i = 0; i < target.size(); ++i) { - const WORD &word = vocabulary.GetWord(target[i]); - strme << word << " "; - } - return strme.str(); - } - - string getAlignmentString() const - { - stringstream strme; - for (size_t i = 0; i < alignment.size(); ++i) { - const pair<int,int> &alignPair = alignment[i]; - strme << alignPair.first << "-" << alignPair.second << " "; - } - return strme.str(); - } - -}; - -#endif diff --git a/contrib/fuzzy-match/SuffixArray.cpp b/contrib/fuzzy-match/SuffixArray.cpp deleted file mode 100644 index e0aa3da91..000000000 --- a/contrib/fuzzy-match/SuffixArray.cpp +++ /dev/null @@ -1,244 +0,0 @@ -#include "SuffixArray.h" -#include <string> -#include <stdlib.h> -#include <cstring> - -using namespace std; - -SuffixArray::SuffixArray( string fileName ) -{ - m_vcb.StoreIfNew( "<uNk>" ); - m_endOfSentence = m_vcb.StoreIfNew( "<s>" ); - - ifstream extractFile; - char line[LINE_MAX_LENGTH]; - - // count the number of words first; - extractFile.open(fileName.c_str()); - istream *fileP = &extractFile; - m_size = 0; - size_t sentenceCount = 0; - while(!fileP->eof()) { - SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); - if (fileP->eof()) break; - vector< WORD_ID > words = m_vcb.Tokenize( line ); - m_size += words.size() + 1; - sentenceCount++; - } - extractFile.close(); - cerr << m_size << " words (incl. sentence boundaries)" << endl; - - // allocate memory - m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); - m_index = (INDEX*) calloc( sizeof( INDEX ), m_size ); - m_wordInSentence = (char*) calloc( sizeof( char ), m_size ); - m_sentence = (size_t*) calloc( sizeof( size_t ), m_size ); - m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount ); - - // fill the array - int wordIndex = 0; - int sentenceId = 0; - extractFile.open(fileName.c_str()); - fileP = &extractFile; - while(!fileP->eof()) { - SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); - if (fileP->eof()) break; - vector< WORD_ID > words = m_vcb.Tokenize( line ); - vector< WORD_ID >::const_iterator i; - - for( i=words.begin(); i!=words.end(); i++) - { - m_index[ wordIndex ] = wordIndex; - m_sentence[ wordIndex ] = sentenceId; - m_wordInSentence[ wordIndex ] = i-words.begin(); - m_array[ wordIndex++ ] = *i; - } - m_index[ wordIndex ] = wordIndex; - m_array[ wordIndex++ ] = m_endOfSentence; - m_sentenceLength[ sentenceId++ ] = words.size(); - } - extractFile.close(); - cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl; - // List(0,9); - - // sort - m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size ); - Sort( 0, m_size-1 ); - free( m_buffer ); - cerr << "done sorting" << endl; -} - -// good ol' quick sort -void SuffixArray::Sort(INDEX start, INDEX end) { - if (start == end) return; - INDEX mid = (start+end+1)/2; - Sort( start, mid-1 ); - Sort( mid, end ); - - // merge - int i = start; - int j = mid; - int k = 0; - int length = end-start+1; - while( k<length ) - { - if (i == mid ) - { - m_buffer[ k++ ] = m_index[ j++ ]; - } - else if (j > end ) - { - m_buffer[ k++ ] = m_index[ i++ ]; - } - else { - if (CompareIndex( m_index[i], m_index[j] ) < 0) - { - m_buffer[ k++ ] = m_index[ i++ ]; - } - else - { - m_buffer[ k++ ] = m_index[ j++ ]; - } - } - } - - memcpy( ((char*)m_index) + sizeof( INDEX ) * start, - ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) ); -} - -SuffixArray::~SuffixArray() -{ - free(m_index); - free(m_array); -} - -int SuffixArray::CompareIndex( INDEX a, INDEX b ) const -{ - // skip over identical words - INDEX offset = 0; - while( a+offset < m_size && - b+offset < m_size && - m_array[ a+offset ] == m_array[ b+offset ] ) - { offset++; } - - if( a+offset == m_size ) return -1; - if( b+offset == m_size ) return 1; - return CompareWord( m_array[ a+offset ], m_array[ b+offset ] ); -} - -inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const -{ - // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl; - return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ); -} - -int SuffixArray::Count( const vector< WORD > &phrase ) -{ - INDEX dummy; - return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 ); -} - -bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min ) -{ - INDEX dummy; - return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min; -} - -bool SuffixArray::Exists( const vector< WORD > &phrase ) -{ - INDEX dummy; - return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1; -} - -int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end ) -{ - return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end ); -} - -int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end ) -{ - // cerr << "FindFirst\n"; - INDEX start = search_start; - INDEX end = (search_end == -1) ? (m_size-1) : search_end; - INDEX mid = FindFirst( phrase, start, end ); - // cerr << "done\n"; - if (mid == m_size) return 0; // no matches - if (min == 1) return 1; // only existance check - - int matchCount = 1; - - //cerr << "before...\n"; - firstMatch = FindLast( phrase, mid, start, -1 ); - matchCount += mid - firstMatch; - - //cerr << "after...\n"; - lastMatch = FindLast( phrase, mid, end, 1 ); - matchCount += lastMatch - mid; - - return matchCount; -} - -SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction ) -{ - end += direction; - while(true) - { - INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2; - - int match = Match( phrase, mid ); - int matchNext = Match( phrase, mid+direction ); - //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl; - - if (match == 0 && matchNext != 0) return mid; - - if (match == 0) // mid point is a match - start = mid; - else - end = mid; - } -} - -SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end ) -{ - while(true) - { - INDEX mid = ( start + end + 1 )/2; - //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n"; - int match = Match( phrase, mid ); - - if (match == 0) return mid; - if (start >= end && match != 0 ) return m_size; - - if (match > 0) - start = mid+1; - else - end = mid-1; - } -} - -int SuffixArray::Match( const vector< WORD > &phrase, INDEX index ) -{ - INDEX pos = m_index[ index ]; - for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) - { - int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] ); - // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl; - if (match != 0) - return match; - } - return 0; -} - -void SuffixArray::List(INDEX start, INDEX end) -{ - for(INDEX i=start; i<=end; i++) - { - INDEX pos = m_index[ i ]; - // cerr << i << ":" << pos << "\t"; - for(int j=0; j<5 && j+pos<m_size; j++) - { - cout << " " << m_vcb.GetWord( m_array[ pos+j ] ); - } - // cerr << "\n"; - } -} diff --git a/contrib/fuzzy-match/SuffixArray.h b/contrib/fuzzy-match/SuffixArray.h deleted file mode 100644 index 2deed4c32..000000000 --- a/contrib/fuzzy-match/SuffixArray.h +++ /dev/null @@ -1,45 +0,0 @@ -#include "Vocabulary.h" - -#pragma once - -#define LINE_MAX_LENGTH 10000 - - -class SuffixArray -{ -public: - typedef unsigned int INDEX; - -private: - WORD_ID *m_array; - INDEX *m_index; - INDEX *m_buffer; - char *m_wordInSentence; - size_t *m_sentence; - char *m_sentenceLength; - WORD_ID m_endOfSentence; - Vocabulary m_vcb; - INDEX m_size; - -public: - SuffixArray( string fileName ); - ~SuffixArray(); - - void Sort(INDEX start, INDEX end); - int CompareIndex( INDEX a, INDEX b ) const; - inline int CompareWord( WORD_ID a, WORD_ID b ) const; - int Count( const vector< WORD > &phrase ); - bool MinCount( const vector< WORD > &phrase, INDEX min ); - bool Exists( const vector< WORD > &phrase ); - int FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); - int LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); - INDEX FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end ); - INDEX FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction ); - int Match( const vector< WORD > &phrase, INDEX index ); - void List( INDEX start, INDEX end ); - inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; } - inline size_t GetSentence( INDEX position ) { return m_sentence[position]; } - inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; } - inline char GetSentenceLength( size_t sentenceId ) { return m_sentenceLength[sentenceId]; } - inline INDEX GetSize() { return m_size; } -}; diff --git a/contrib/fuzzy-match/Util.cpp b/contrib/fuzzy-match/Util.cpp deleted file mode 100644 index 4d750791e..000000000 --- a/contrib/fuzzy-match/Util.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// Util.cpp -// fuzzy-match -// -// Created by Hieu Hoang on 26/07/2012. -// Copyright 2012 __MyCompanyName__. All rights reserved. -// - -#include <iostream> -#include <stdio.h> -#include "Util.h" -#include "SentenceAlignment.h" -#include "SuffixArray.h" - -void load_corpus( const char* fileName, vector< vector< WORD_ID > > &corpus ) -{ // source - ifstream fileStream; - fileStream.open(fileName); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - cerr << "loading " << fileName << endl; - - istream *fileStreamP = &fileStream; - - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - corpus.push_back( vocabulary.Tokenize( line ) ); - } -} - -void load_target( const char* fileName, vector< vector< SentenceAlignment > > &corpus) -{ - ifstream fileStream; - fileStream.open(fileName); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - cerr << "loading " << fileName << endl; - - istream *fileStreamP = &fileStream; - - WORD_ID delimiter = vocabulary.StoreIfNew("|||"); - - int lineNum = 0; - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - - vector<WORD_ID> toks = vocabulary.Tokenize( line ); - - corpus.push_back(vector< SentenceAlignment >()); - vector< SentenceAlignment > &vec = corpus.back(); - - vec.push_back(SentenceAlignment()); - SentenceAlignment *sentence = &vec.back(); - - const WORD &countStr = vocabulary.GetWord(toks[0]); - sentence->count = atoi(countStr.c_str()); - - for (size_t i = 1; i < toks.size(); ++i) { - WORD_ID wordId = toks[i]; - - if (wordId == delimiter) { - // target and alignments can have multiple sentences. - vec.push_back(SentenceAlignment()); - sentence = &vec.back(); - - // count - ++i; - - const WORD &countStr = vocabulary.GetWord(toks[i]); - sentence->count = atoi(countStr.c_str()); - } - else { - // just a normal word, add - sentence->target.push_back(wordId); - } - } - - ++lineNum; - - } - -} - - -void load_alignment( const char* fileName, vector< vector< SentenceAlignment > > &corpus ) -{ - ifstream fileStream; - fileStream.open(fileName); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - cerr << "loading " << fileName << endl; - - istream *fileStreamP = &fileStream; - - string delimiter = "|||"; - - int lineNum = 0; - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - - vector< SentenceAlignment > &vec = corpus[lineNum]; - size_t targetInd = 0; - SentenceAlignment *sentence = &vec[targetInd]; - - vector<string> toks = Tokenize(line); - - for (size_t i = 0; i < toks.size(); ++i) { - string &tok = toks[i]; - - if (tok == delimiter) { - // target and alignments can have multiple sentences. - ++targetInd; - sentence = &vec[targetInd]; - - ++i; - } - else { - // just a normal alignment, add - vector<int> alignPoint = Tokenize<int>(tok, "-"); - assert(alignPoint.size() == 2); - sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1])); - } - } - - ++lineNum; - - } -} - - - - diff --git a/contrib/fuzzy-match/Util.h b/contrib/fuzzy-match/Util.h deleted file mode 100644 index 7bb13d032..000000000 --- a/contrib/fuzzy-match/Util.h +++ /dev/null @@ -1,87 +0,0 @@ -// -// Util.h -// fuzzy-match -// -// Created by Hieu Hoang on 25/07/2012. -// Copyright 2012 __MyCompanyName__. All rights reserved. -// - -#ifndef fuzzy_match_Util_h -#define fuzzy_match_Util_h - -#include <vector> -#include <sstream> -#include "Vocabulary.h" - -class SentenceAlignment; - -void load_corpus( const char* fileName, std::vector< std::vector< WORD_ID > > &corpus ); -void load_target( const char* fileName, std::vector< std::vector< SentenceAlignment > > &corpus); -void load_alignment( const char* fileName, std::vector< std::vector< SentenceAlignment > > &corpus ); - -/** - * Convert vector of type T to string - */ -template <typename T> -std::string Join(const std::string& delimiter, const std::vector<T>& items) -{ - std::ostringstream outstr; - if(items.size() == 0) return ""; - outstr << items[0]; - for(unsigned int i = 1; i < items.size(); i++) - outstr << delimiter << items[i]; - return outstr.str(); -} - -//! convert string to variable of type T. Used to reading floats, int etc from files -template<typename T> -inline T Scan(const std::string &input) -{ - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; -} - -//! convert vectors of string to vectors of type T variables -template<typename T> -inline std::vector<T> Scan(const std::vector< std::string > &input) -{ - std::vector<T> output(input.size()); - for (size_t i = 0 ; i < input.size() ; i++) { - output[i] = Scan<T>( input[i] ); - } - return output; -} - -inline std::vector<std::string> Tokenize(const std::string& str, - const std::string& delimiters = " \t") -{ - std::vector<std::string> tokens; - // Skip delimiters at beginning. - std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); - // Find first "non-delimiter". - std::string::size_type pos = str.find_first_of(delimiters, lastPos); - - while (std::string::npos != pos || std::string::npos != lastPos) { - // Found a token, add it to the vector. - tokens.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of(delimiters, pos); - // Find next "non-delimiter" - pos = str.find_first_of(delimiters, lastPos); - } - - return tokens; -} - -template<typename T> -inline std::vector<T> Tokenize( const std::string &input - , const std::string& delimiters = " \t") -{ - std::vector<std::string> stringVector = Tokenize(input, delimiters); - return Scan<T>( stringVector ); -} - - -#endif diff --git a/contrib/fuzzy-match/Vocabulary.cpp b/contrib/fuzzy-match/Vocabulary.cpp deleted file mode 100644 index 4492eec95..000000000 --- a/contrib/fuzzy-match/Vocabulary.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
-#include "Vocabulary.h"
-
-// as in beamdecoder/tables.cpp
-vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
- vector< WORD_ID > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- }
- else if (isSpace && !betweenWords) {
- token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
- return token;
-}
-
-WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
-
- if( i != lookup.end() )
- return i->second;
-
- WORD_ID id = vocab.size();
- vocab.push_back( word );
- lookup[ word ] = id;
- return id;
-}
-
-WORD_ID Vocabulary::GetWordID( const WORD &word ) {
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
- if( i == lookup.end() )
- return 0;
- WORD_ID w= (WORD_ID) i->second;
- return w;
-}
diff --git a/contrib/fuzzy-match/Vocabulary.h b/contrib/fuzzy-match/Vocabulary.h deleted file mode 100644 index 3e48847a7..000000000 --- a/contrib/fuzzy-match/Vocabulary.h +++ /dev/null @@ -1,40 +0,0 @@ -// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $ - -#pragma once - -#include <iostream> -#include <fstream> -#include <assert.h> -#include <stdlib.h> -#include <string> -#include <queue> -#include <map> -#include <cmath> - -using namespace std; - -#define MAX_LENGTH 10000 - -#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \ - _IS.getline(_LINE, _SIZE, _DELIM); \ - if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \ - if (_IS.gcount() == _SIZE-1) { \ - cerr << "Line too long! Buffer overflow. Delete lines >=" \ - << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \ - << endl; \ - exit(1); \ - } \ - } - -typedef string WORD; -typedef unsigned int WORD_ID; - -class Vocabulary { - public: - map<WORD, WORD_ID> lookup; - vector< WORD > vocab; - WORD_ID StoreIfNew( const WORD& ); - WORD_ID GetWordID( const WORD& ); - vector<WORD_ID> Tokenize( const char[] ); - inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; } -}; diff --git a/contrib/fuzzy-match/fuzzy-match2.cpp b/contrib/fuzzy-match/fuzzy-match2.cpp deleted file mode 100644 index c1252aa03..000000000 --- a/contrib/fuzzy-match/fuzzy-match2.cpp +++ /dev/null @@ -1,460 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <getopt.h> -#include <map> -#include <algorithm> -#include <iostream> -#include <fstream> -#include <cstring> -#include <time.h> -#include <fstream> -#include "SentenceAlignment.h" -#include "fuzzy-match2.h" -#include "SuffixArray.h" - -/** This implementation is explained in - Koehn and Senellart: "Fast Approximate String Matching - with Suffix Arrays and A* Parsing" (AMTA 2010) ***/ - -using namespace std; - -int main(int argc, char* argv[]) -{ - vector< vector< WORD_ID > > source, input; - vector< vector< SentenceAlignment > > targetAndAlignment; - - - while(1) { - static struct option long_options[] = { - {"basic", no_argument, &basic_flag, 1}, - {"word", no_argument, &lsed_flag, 0}, - {"unrefined", no_argument, &refined_flag, 0}, - {"nolengthfilter", no_argument, &length_filter_flag, 0}, - {"noparse", no_argument, &parse_flag, 0}, - {"multiple", no_argument, &multiple_flag, 1}, - {"minmatch", required_argument, 0, 'm'}, - {0, 0, 0, 0} - }; - int option_index = 0; - int c = getopt_long (argc, argv, "m:", long_options, &option_index); - if (c == -1) break; - switch (c) { - case 0: -// if (long_options[option_index].flag != 0) -// break; -// printf ("option %s", long_options[option_index].name); -// if (optarg) -// printf (" with arg %s", optarg); -// printf ("\n"); - break; - case 'm': - min_match = atoi(optarg); - if (min_match < 1 || min_match > 100) { - cerr << "error: --minmatch must have value in range 1..100\n"; - exit(1); - } - cerr << "setting min match to " << min_match << endl; - break; - default: - cerr << "usage: syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n"; - exit(1); - } - } - if (lsed_flag) { cerr << "lsed\n"; } - if (basic_flag) { cerr << "basic\n"; } - if (refined_flag) { cerr << "refined\n"; } - if (length_filter_flag) { cerr << "length filter\n"; } - if (parse_flag) { cerr << "parse\n"; } -// exit(1); - - - if (optind+4 != argc) { - cerr << "syntax: ./fuzzy-match input source target alignment [--basic] [--word] [--minmatch 1..100]\n"; - exit(1); - } - - load_corpus(argv[optind], input); - load_corpus(argv[optind+1], source); - load_target(argv[optind+2], targetAndAlignment); - load_alignment(argv[optind+3], targetAndAlignment); - - // ./fuzzy-match input corpus [-basic] - -// load_corpus("../corpus/tm.truecased.4.en", source); -// load_corpus("../corpus/tm.truecased.4.it", target); -// load_corpus("../evaluation/test.input.tc.4", input); - -// load_corpus("../../acquis-truecase/corpus/acquis.truecased.190.en", source); -// load_corpus("../../acquis-truecase/evaluation/ac-test.input.tc.190", input); - -// load_corpus("../corpus/tm.truecased.16.en", source); -// load_corpus("../evaluation/test.input.tc.16", input); - - if (basic_flag) { - cerr << "using basic method\n"; - clock_t start_main_clock2 = clock(); - basic_fuzzy_match( source, input ); - cerr << "total: " << (1000 * (clock()-start_main_clock2) / CLOCKS_PER_SEC) << endl; - exit(1); - } - - cerr << "number of input sentences " << input.size() << endl; - - cerr << "creating suffix array...\n"; -// SuffixArray suffixArray( "../corpus/tm.truecased.4.en" ); -// SuffixArray suffixArray( "../../acquis-truecase/corpus/acquis.truecased.190.en" ); - SuffixArray suffixArray( argv[optind+1] ); - - clock_t start_main_clock = clock(); - - // looping through all input sentences... - cerr << "looping...\n"; - for(unsigned int sentenceInd = 0; sentenceInd < input.size(); sentenceInd++) - { - clock_t start_clock = clock(); - // if (i % 10 == 0) cerr << "."; - - // establish some basic statistics - - // int input_length = compute_length( input[i] ); - int input_length = input[sentenceInd].size(); - int best_cost = input_length * (100-min_match) / 100 + 1; - - int match_count = 0; // how many substring matches to be considered - //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl; - - // find match ranges in suffix array - vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range; - for(size_t start=0;start<input[sentenceInd].size();start++) - { - SuffixArray::INDEX prior_first_match = 0; - SuffixArray::INDEX prior_last_match = suffixArray.GetSize()-1; - vector< string > substring; - bool stillMatched = true; - vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart; - //cerr << "start: " << start; - for(int word=start; stillMatched && word<input[sentenceInd].size(); word++) - { - substring.push_back( vocabulary.GetWord( input[sentenceInd][word] ) ); - - // only look up, if needed (i.e. no unnecessary short gram lookups) -// if (! word-start+1 <= short_match_max_length( input_length ) ) - // { - SuffixArray::INDEX first_match, last_match; - stillMatched = false; - if (suffixArray.FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) - { - stillMatched = true; - matchedAtThisStart.push_back( make_pair( first_match, last_match ) ); - //cerr << " (" << first_match << "," << last_match << ")"; - //cerr << " " << ( last_match - first_match + 1 ); - prior_first_match = first_match; - prior_last_match = last_match; - } - //} - } - //cerr << endl; - match_range.push_back( matchedAtThisStart ); - } - - clock_t clock_range = clock(); - - map< int, vector< Match > > sentence_match; - map< int, int > sentence_match_word_count; - - // go through all matches, longest first - for(int length = input[sentenceInd].size(); length >= 1; length--) - { - // do not create matches, if these are handled by the short match function - if (length <= short_match_max_length( input_length ) ) - { - continue; - } - - unsigned int count = 0; - for(int start = 0; start <= input[sentenceInd].size() - length; start++) - { - if (match_range[start].size() >= length) - { - pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1]; - // cerr << " (" << range.first << "," << range.second << ")"; - count += range.second - range.first + 1; - - for(SuffixArray::INDEX i=range.first; i<=range.second; i++) - { - int position = suffixArray.GetPosition( i ); - - // sentence length mismatch - size_t sentence_id = suffixArray.GetSentence( position ); - int sentence_length = suffixArray.GetSentenceLength( sentence_id ); - int diff = abs( (int)sentence_length - (int)input_length ); - // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length; - //if (length <= 2 && input_length>=5 && - // sentence_match.find( sentence_id ) == sentence_match.end()) - // continue; - - if (diff > best_cost) - continue; - - // compute minimal cost - int start_pos = suffixArray.GetWordInSentence( position ); - int end_pos = start_pos + length-1; - // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. " - // << start << "-" << (start+length-1) << " (" << input_length << ")"; - // different number of prior words -> cost is at least diff - int min_cost = abs( start - start_pos ); - - // same number of words, but not sent. start -> cost is at least 1 - if (start == start_pos && start>0) - min_cost++; - - // different number of remaining words -> cost is at least diff - min_cost += abs( ( sentence_length-1 - end_pos ) - - ( input_length-1 - (start+length-1) ) ); - - // same number of words, but not sent. end -> cost is at least 1 - if ( sentence_length-1 - end_pos == - input_length-1 - (start+length-1) - && end_pos != sentence_length-1 ) - min_cost++; - - // cerr << " -> min_cost " << min_cost; - if (min_cost > best_cost) - continue; - - // valid match - match_count++; - - // compute maximal cost - int max_cost = max( start, start_pos ) - + max( sentence_length-1 - end_pos, - input_length-1 - (start+length-1) ); - // cerr << ", max_cost " << max_cost; - - Match m = Match( start, start+length-1, - start_pos, start_pos+length-1, - min_cost, max_cost, 0); - sentence_match[ sentence_id ].push_back( m ); - sentence_match_word_count[ sentence_id ] += length; - - if (max_cost < best_cost) - { - best_cost = max_cost; - if (best_cost == 0) break; - } - //if (match_count >= MAX_MATCH_COUNT) break; - } - } - // cerr << endl; - if (best_cost == 0) break; - //if (match_count >= MAX_MATCH_COUNT) break; - } - // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl; - - if (best_cost == 0) break; - //if (match_count >= MAX_MATCH_COUNT) break; - } - cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl; - - clock_t clock_matches = clock(); - - // consider each sentence for which we have matches - int old_best_cost = best_cost; - int tm_count_word_match = 0; - int tm_count_word_match2 = 0; - int pruned_match_count = 0; - if (short_match_max_length( input_length )) - { - init_short_matches( input[sentenceInd] ); - } - vector< int > best_tm; - typedef map< int, vector< Match > >::iterator I; - - clock_t clock_validation_sum = 0; - - for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) - { - int tmID = tm->first; - int tm_length = suffixArray.GetSentenceLength(tmID); - vector< Match > &match = tm->second; - add_short_matches( match, source[tmID], input_length, best_cost ); - - //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl; - - // quick look: how many words are matched - int words_matched = 0; - for(int m=0;m<match.size();m++) { - - if (match[m].min_cost <= best_cost) // makes no difference - words_matched += match[m].input_end - match[m].input_start + 1; - } - if (max(input_length,tm_length) - words_matched > best_cost) - { - if (length_filter_flag) continue; - } - tm_count_word_match++; - - // prune, check again how many words are matched - vector< Match > pruned = prune_matches( match, best_cost ); - words_matched = 0; - for(int p=0;p<pruned.size();p++) { - words_matched += pruned[p].input_end - pruned[p].input_start + 1; - } - if (max(input_length,tm_length) - words_matched > best_cost) - { - if (length_filter_flag) continue; - } - tm_count_word_match2++; - - pruned_match_count += pruned.size(); - int prior_best_cost = best_cost; - int cost; - - clock_t clock_validation_start = clock(); - if (! parse_flag || - pruned.size()>=10) // to prevent worst cases - { - string path; - cost = sed( input[sentenceInd], source[tmID], path, false ); - if (cost < best_cost) - { - best_cost = cost; - } - } - - else - { - cost = parse_matches( pruned, input_length, tm_length, best_cost ); - if (prior_best_cost != best_cost) - { - best_tm.clear(); - } - } - clock_validation_sum += clock() - clock_validation_start; - if (cost == best_cost) - { - best_tm.push_back( tmID ); - } - } - cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl; - cerr << "tm considered: " << sentence_match.size() - << " word-matched: " << tm_count_word_match - << " word-matched2: " << tm_count_word_match2 - << " best: " << best_tm.size() << endl; - - cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl; - - // create xml and extract files - string inputStr, sourceStr; - for (size_t pos = 0; pos < input_length; ++pos) { - inputStr += vocabulary.GetWord(input[sentenceInd][pos]) + " "; - } - - // do not try to find the best ... report multiple matches - if (multiple_flag) { - int input_letter_length = compute_length( input[sentenceInd] ); - for(int si=0; si<best_tm.size(); si++) { - int s = best_tm[si]; - string path; - unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true ); - // do not report multiple identical sentences, but just their count - cout << sentenceInd << " "; // sentence number - cout << letter_cost << "/" << input_letter_length << " "; - cout << "(" << best_cost <<"/" << input_length <<") "; - cout << "||| " << s << " ||| " << path << endl; - - vector<WORD_ID> &sourceSentence = source[s]; - vector<SentenceAlignment> &targets = targetAndAlignment[s]; - create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path); - - } - } // if (multiple_flag) - else { - - // find the best matches according to letter sed - string best_path = ""; - int best_match = -1; - int best_letter_cost; - if (lsed_flag) { - best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1; - for(int si=0; si<best_tm.size(); si++) - { - int s = best_tm[si]; - string path; - unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true ); - if (letter_cost < best_letter_cost) - { - best_letter_cost = letter_cost; - best_path = path; - best_match = s; - } - } - } - // if letter sed turned off, just compute path for first match - else { - if (best_tm.size() > 0) { - string path; - sed( input[sentenceInd], source[best_tm[0]], path, false ); - best_path = path; - best_match = best_tm[0]; - } - } - cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC) - << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC) - << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC) - << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC) - << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")" - << " )" << endl; - if (lsed_flag) { - cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " ("; - } - cout << best_cost <<"/" << input_length; - if (lsed_flag) cout << ")"; - cout << " ||| " << best_match << " ||| " << best_path << endl; - - // creat xml & extracts - vector<WORD_ID> &sourceSentence = source[best_match]; - vector<SentenceAlignment> &targets = targetAndAlignment[best_match]; - create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path); - - } // else if (multiple_flag) - - - } - cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl; - -} - -void create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path) -{ - string sourceStr; - for (size_t pos = 0; pos < sourceSentence.size(); ++pos) { - WORD_ID wordId = sourceSentence[pos]; - sourceStr += vocabulary.GetWord(wordId) + " "; - } - - char *inputFileName = tmpnam(NULL); - ofstream inputFile(inputFileName); - - for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) { - const SentenceAlignment &sentenceAlignment = targets[targetInd]; - string targetStr = sentenceAlignment.getTargetString(); - string alignStr = sentenceAlignment.getAlignmentString(); - - inputFile - << sentenceInd << endl - << cost << endl - << sourceStr << endl - << inputStr << endl - << targetStr << endl - << alignStr << endl - << path << endl - << sentenceAlignment.count << endl; - - } - - string cmd = string("perl create_xml.perl < ") + inputFileName; - cerr << cmd << endl; - inputFile.close(); - -} diff --git a/contrib/fuzzy-match/fuzzy-match2.h b/contrib/fuzzy-match/fuzzy-match2.h deleted file mode 100644 index 614bf971f..000000000 --- a/contrib/fuzzy-match/fuzzy-match2.h +++ /dev/null @@ -1,561 +0,0 @@ -// -// fuzzy-match2.h -// fuzzy-match -// -// Created by Hieu Hoang on 25/07/2012. -// Copyright 2012 __MyCompanyName__. All rights reserved. -// - -#ifndef fuzzy_match_fuzzy_match2_h -#define fuzzy_match_fuzzy_match2_h - -#include <string> -#include <sstream> -#include <vector> -#include "Vocabulary.h" -#include "SuffixArray.h" -#include "Util.h" -#include "Match.h" - -#define MAX_MATCH_COUNT 10000000 - -Vocabulary vocabulary; - -int basic_flag = false; -int lsed_flag = true; -int refined_flag = true; -int length_filter_flag = true; -int parse_flag = true; -int min_match = 70; -int multiple_flag = false; -int multiple_slack = 0; -int multiple_max = 100; -map< WORD_ID,vector< int > > single_word_index; -// global cache for word pairs -map< pair< WORD_ID, WORD_ID >, unsigned int > lsed; - -void create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path); - - - -/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */ - -unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx ) -{ - // check if already computed -> lookup in cache - pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx ); - map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx ); - if (lookup != lsed.end()) - { - return (lookup->second); - } - - // get surface strings for word indices - const string &a = vocabulary.GetWord( aIdx ); - const string &b = vocabulary.GetWord( bIdx ); - - // initialize cost matrix - unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); - for( unsigned int i=0; i<=a.size(); i++ ) { - cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); - cost[i][0] = i; - } - for( unsigned int j=0; j<=b.size(); j++ ) { - cost[0][j] = j; - } - - // core string edit distance loop - for( unsigned int i=1; i<=a.size(); i++ ) { - for( unsigned int j=1; j<=b.size(); j++ ) { - - unsigned int ins = cost[i-1][j] + 1; - unsigned int del = cost[i][j-1] + 1; - bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0); - unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1); - - unsigned int min = (ins < del) ? ins : del; - min = (diag < min) ? diag : min; - - cost[i][j] = min; - } - } - - // clear out memory - unsigned int final = cost[a.size()][b.size()]; - for( unsigned int i=0; i<=a.size(); i++ ) { - free( cost[i] ); - } - free( cost ); - - // cache and return result - lsed[ pIdx ] = final; - return final; -} - -/* string edit distance implementation */ - -unsigned int sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) { - - // initialize cost and path matrices - unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); - char **path = (char**) calloc( sizeof( char* ), a.size()+1 ); - - for( unsigned int i=0; i<=a.size(); i++ ) { - cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); - path[i] = (char*) calloc( sizeof(char), b.size()+1 ); - if (i>0) - { - cost[i][0] = cost[i-1][0]; - if (use_letter_sed) - { - cost[i][0] += vocabulary.GetWord( a[i-1] ).size(); - } - else - { - cost[i][0]++; - } - } - else - { - cost[i][0] = 0; - } - path[i][0] = 'I'; - } - - for( unsigned int j=0; j<=b.size(); j++ ) { - if (j>0) - { - cost[0][j] = cost[0][j-1]; - if (use_letter_sed) - { - cost[0][j] += vocabulary.GetWord( b[j-1] ).size(); - } - else - { - cost[0][j]++; - } - } - else - { - cost[0][j] = 0; - } - path[0][j] = 'D'; - } - - // core string edit distance algorithm - for( unsigned int i=1; i<=a.size(); i++ ) { - for( unsigned int j=1; j<=b.size(); j++ ) { - unsigned int ins = cost[i-1][j]; - unsigned int del = cost[i][j-1]; - unsigned int match; - if (use_letter_sed) - { - ins += vocabulary.GetWord( a[i-1] ).size(); - del += vocabulary.GetWord( b[j-1] ).size(); - match = letter_sed( a[i-1], b[j-1] ); - } - else - { - ins++; - del++; - match = ( a[i-1] == b[j-1] ) ? 0 : 1; - } - unsigned int diag = cost[i-1][j-1] + match; - - char action = (ins < del) ? 'I' : 'D'; - unsigned int min = (ins < del) ? ins : del; - if (diag < min) - { - action = (match>0) ? 'S' : 'M'; - min = diag; - } - - cost[i][j] = min; - path[i][j] = action; - } - } - - // construct string for best path - unsigned int i = a.size(); - unsigned int j = b.size(); - best_path = ""; - while( i>0 || j>0 ) - { - best_path = path[i][j] + best_path; - if (path[i][j] == 'I') - { - i--; - } - else if (path[i][j] == 'D') - { - j--; - } - else - { - i--; - j--; - } - } - - - // clear out memory - unsigned int final = cost[a.size()][b.size()]; - - for( unsigned int i=0; i<=a.size(); i++ ) { - free( cost[i] ); - free( path[i] ); - } - free( cost ); - free( path ); - - // return result - return final; -} - -/* utlility function: compute length of sentence in characters - (spaces do not count) */ - -unsigned int compute_length( const vector< WORD_ID > &sentence ) -{ - unsigned int length = 0; for( unsigned int i=0; i<sentence.size(); i++ ) - { - length += vocabulary.GetWord( sentence[i] ).size(); - } - return length; -} - -/* brute force method: compare input to all corpus sentences */ - -int basic_fuzzy_match( vector< vector< WORD_ID > > source, - vector< vector< WORD_ID > > input ) -{ - // go through input set... - for(unsigned int i=0;i<input.size();i++) - { - bool use_letter_sed = false; - - // compute sentence length and worst allowed cost - unsigned int input_length; - if (use_letter_sed) - { - input_length = compute_length( input[i] ); - } - else - { - input_length = input[i].size(); - } - unsigned int best_cost = input_length * (100-min_match) / 100 + 2; - string best_path = ""; - int best_match = -1; - - // go through all corpus sentences - for(unsigned int s=0;s<source.size();s++) - { - int source_length; - if (use_letter_sed) - { - source_length = compute_length( source[s] ); - } - else - { - source_length = source[s].size(); - } - int diff = abs((int)source_length - (int)input_length); - if (length_filter_flag && (diff >= best_cost)) - { - continue; - } - - // compute string edit distance - string path; - unsigned int cost = sed( input[i], source[s], path, use_letter_sed ); - - // update if new best - if (cost < best_cost) - { - best_cost = cost; - best_path = path; - best_match = s; - } - } - cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl; - } -} - -/* definition of short matches - very short n-gram matches (1-grams) will not be looked up in - the suffix array, since there are too many matches - and for longer sentences, at least one 2-gram match must occur */ - -inline int short_match_max_length( int input_length ) -{ - if ( ! refined_flag ) - return 0; - if ( input_length >= 5 ) - return 1; - return 0; -} - -/* if we have non-short matches in a sentence, we need to - take a closer look at it. - this function creates a hash map for all input words and their positions - (to be used by the next function) - (done here, because this has be done only once for an input sentence) */ - -void init_short_matches( const vector< WORD_ID > &input ) -{ - int max_length = short_match_max_length( input.size() ); - if (max_length == 0) - return; - - single_word_index.clear(); - - // store input words and their positions in hash map - for(int i=0; i<input.size(); i++) - { - if (single_word_index.find( input[i] ) == single_word_index.end()) - { - vector< int > position_vector; - single_word_index[ input[i] ] = position_vector; - } - single_word_index[ input[i] ].push_back( i ); - } -} - -/* add all short matches to list of matches for a sentence */ - -void add_short_matches( vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost ) -{ - int max_length = short_match_max_length( input_length ); - if (max_length == 0) - return; - - int tm_length = tm.size(); - map< WORD_ID,vector< int > >::iterator input_word_hit; - for(int t_pos=0; t_pos<tm.size(); t_pos++) - { - input_word_hit = single_word_index.find( tm[t_pos] ); - if (input_word_hit != single_word_index.end()) - { - vector< int > &position_vector = input_word_hit->second; - for(int j=0; j<position_vector.size(); j++) - { - int &i_pos = position_vector[j]; - - // before match - int max_cost = max( i_pos , t_pos ); - int min_cost = abs( i_pos - t_pos ); - if ( i_pos>0 && i_pos == t_pos ) - min_cost++; - - // after match - max_cost += max( (input_length-i_pos) , (tm_length-t_pos)); - min_cost += abs( (input_length-i_pos) - (tm_length-t_pos)); - if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos)) - min_cost++; - - if (min_cost <= best_cost) - { - Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ); - match.push_back( new_match ); - } - } - } - } -} - -/* remove matches that are subsumed by a larger match */ - -vector< Match > prune_matches( const vector< Match > &match, int best_cost ) -{ - //cerr << "\tpruning"; - vector< Match > pruned; - for(int i=match.size()-1; i>=0; i--) - { - //cerr << " (" << match[i].input_start << "," << match[i].input_end - // << " ; " << match[i].tm_start << "," << match[i].tm_end - // << " * " << match[i].min_cost << ")"; - - //if (match[i].min_cost > best_cost) - // continue; - - bool subsumed = false; - for(int j=match.size()-1; j>=0; j--) - { - if (i!=j // do not compare match with itself - && ( match[i].input_end - match[i].input_start <= - match[j].input_end - match[j].input_start ) // i shorter than j - && ((match[i].input_start == match[j].input_start && - match[i].tm_start == match[j].tm_start ) || - (match[i].input_end == match[j].input_end && - match[i].tm_end == match[j].tm_end) ) ) - { - subsumed = true; - } - } - if (! subsumed && match[i].min_cost <= best_cost) - { - //cerr << "*"; - pruned.push_back( match[i] ); - } - } - //cerr << endl; - return pruned; -} - -/* A* parsing method to compute string edit distance */ - -int parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost ) -{ - // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl; - - if (match.size() == 1) - return match[0].max_cost; - if (match.size() == 0) - return input_length+tm_length; - - int this_best_cost = input_length + tm_length; - for(int i=0;i<match.size();i++) - { - this_best_cost = min( this_best_cost, match[i].max_cost ); - } - // cerr << "\tthis best cost: " << this_best_cost << endl; - - // bottom up combination of spans - vector< vector< Match > > multi_match; - multi_match.push_back( match ); - - int match_level = 1; - while(multi_match[ match_level-1 ].size()>0) - { - // init vector - vector< Match > empty; - multi_match.push_back( empty ); - - for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) - { - int second_level = match_level - first_level -1; - //cerr << "\tcombining level " << first_level << " and " << second_level << endl; - - vector< Match > &first_match = multi_match[ first_level ]; - vector< Match > &second_match = multi_match[ second_level ]; - - for(int i1 = 0; i1 < first_match.size(); i1++) { - for(int i2 = 0; i2 < second_match.size(); i2++) { - - // do not combine the same pair twice - if (first_level == second_level && i2 <= i1) - { - continue; - } - - // get sorted matches (first is before second) - Match *first, *second; - if (first_match[i1].input_start < second_match[i2].input_start ) - { - first = &first_match[i1]; - second = &second_match[i2]; - } - else - { - second = &first_match[i1]; - first = &second_match[i2]; - } - - //cerr << "\tcombining " - // << "(" << first->input_start << "," << first->input_end << "), " - // << first->tm_start << " [" << first->internal_cost << "]" - // << " with " - // << "(" << second->input_start << "," << second->input_end << "), " - // << second->tm_start<< " [" << second->internal_cost << "]" - // << endl; - - // do not process overlapping matches - if (first->input_end >= second->input_start) - { - continue; - } - - // no overlap / mismatch in tm - if (first->tm_end >= second->tm_start) - { - continue; - } - - // compute cost - int min_cost = 0; - int max_cost = 0; - - // initial - min_cost += abs( first->input_start - first->tm_start ); - max_cost += max( first->input_start, first->tm_start ); - - // same number of words, but not sent. start -> cost is at least 1 - if (first->input_start == first->tm_start && first->input_start > 0) - { - min_cost++; - } - - // in-between - int skipped_words = second->input_start - first->input_end -1; - int skipped_words_tm = second->tm_start - first->tm_end -1; - int internal_cost = max( skipped_words, skipped_words_tm ); - internal_cost += first->internal_cost + second->internal_cost; - min_cost += internal_cost; - max_cost += internal_cost; - - // final - min_cost += abs( (tm_length-1 - second->tm_end) - - (input_length-1 - second->input_end) ); - max_cost += max( (tm_length-1 - second->tm_end), - (input_length-1 - second->input_end) ); - - // same number of words, but not sent. end -> cost is at least 1 - if ( ( input_length-1 - second->input_end - == tm_length-1 - second->tm_end ) - && input_length-1 != second->input_end ) - { - min_cost++; - } - - // cerr << "\tcost: " << min_cost << "-" << max_cost << endl; - - // if worst than best cost, forget it - if (min_cost > best_cost) - { - continue; - } - - // add match - Match new_match( first->input_start, - second->input_end, - first->tm_start, - second->tm_end, - min_cost, - max_cost, - internal_cost); - multi_match[ match_level ].push_back( new_match ); - // cerr << "\tstored\n"; - - // possibly updating this_best_cost - if (max_cost < this_best_cost) - { - // cerr << "\tupdating this best cost to " << max_cost << "\n"; - this_best_cost = max_cost; - - // possibly updating best_cost - if (max_cost < best_cost) - { - // cerr << "\tupdating best cost to " << max_cost << "\n"; - best_cost = max_cost; - } - } - } - } - } - match_level++; - } - return this_best_cost; -} - -#endif diff --git a/contrib/fuzzy-match/make-xml-from-match.perl b/contrib/fuzzy-match/make-xml-from-match.perl deleted file mode 100644 index b5c213a3d..000000000 --- a/contrib/fuzzy-match/make-xml-from-match.perl +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -my $DEBUG = 1; - -my $match_file = "tm/BEST.acquis-xml-escaped.4.uniq"; -my $source_file = "data/acquis.truecased.4.en.uniq"; -my $target_file = "data/acquis.truecased.4.fr.uniq.most-frequent"; -my $alignment_file = "data/acquis.truecased.4.align.uniq.most-frequent"; -my $out_file = "data/ac-test.input.xml.4.uniq"; -my $in_file = "evaluation/ac-test.input.tc.4"; - -#my $match_file = "tm/BEST.acquis-xml-escaped.4"; -#my $source_file = "corpus/acquis.truecased.4.en"; -#my $target_file = "corpus/acquis.truecased.4.fr"; -#my $alignment_file = "model/aligned.4.grow-diag-final-and"; -#my $out_file = "data/ac-test.input.xml.4"; -#my $in_file = "evaluation/ac-test.input.tc.4"; - -#my $match_file = "tm/BEST.acquis.with"; -#my $source_file = "../acquis-truecase/corpus/acquis.truecased.190.en"; -#my $target_file = "../acquis-truecase/corpus/acquis.truecased.190.fr"; -#my $alignment_file = "../acquis-truecase/model/aligned.190.grow-diag-final-and"; -#my $out_file = "data/ac-test.input.xml"; -#my $in_file = "evaluation/ac-test.input.tc.1"; - -my @INPUT = `cat $in_file`; chop(@INPUT); -my @SOURCE = `cat $source_file`; chop(@SOURCE); -my @TARGET = `cat $target_file`; chop(@TARGET); -my @ALIGNMENT = `cat $alignment_file`; chop(@ALIGNMENT); - -open(MATCH,$match_file); -open(FRAME,">$out_file"); -for(my $i=0;$i<4107;$i++) { - - # get match data - my $match = <MATCH>; - chop($match); - my ($score,$sentence,$path) = split(/ \|\|\| /,$match); - - # construct frame - if ($sentence < 1e9 && $sentence >= 0) { - my $frame = &create_xml($SOURCE[$sentence], - $INPUT[$i], - $TARGET[$sentence], - $ALIGNMENT[$sentence], - $path); - print FRAME $frame."\n"; - } - - # no frame -> output source - else { - print FRAME $INPUT[$i]."\n"; - } -} -close(FRAME); -close(MATCH); - -sub create_xml { - my ($source,$input,$target,$alignment,$path) = @_; - - my @INPUT = split(/ /,$input); - my @SOURCE = split(/ /,$source); - my @TARGET = split(/ /,$target); - my %ALIGN = &create_alignment($alignment); - - my %FRAME_INPUT; - my @TARGET_BITMAP; - foreach (@TARGET) { push @TARGET_BITMAP,1 } - - ### STEP 1: FIND MISMATCHES - - my ($s,$i) = (0,0); - my $currently_matching = 0; - my ($start_s,$start_i) = (0,0); - - $path .= "X"; # indicate end - print "$input\n$source\n$target\n$path\n"; - for(my $p=0;$p<length($path);$p++) { - my $action = substr($path,$p,1); - - # beginning of a mismatch - if ($currently_matching && $action ne "M" && $action ne "X") { - $start_i = $i; - $start_s = $s; - $currently_matching = 0; - } - - # end of a mismatch - elsif (!$currently_matching && - ($action eq "M" || $action eq "X")) { - - # remove use of affected target words - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $TARGET_BITMAP[$tt] = 0; - } - - # also remove enclosed unaligned words? - } - - # are there input words that need to be inserted ? - print "($start_i<$i)?\n"; - if ($start_i<$i) { - - # take note of input words to be inserted - my $insertion = ""; - for(my $ii = $start_i; $ii<$i; $ii++) { - $insertion .= $INPUT[$ii]." "; - } - - # find position for inserted input words - - # find first removed target word - my $start_t = 1000; - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt < $start_t; - } - } - - # end of sentence? add to end - if ($start_t == 1000 && $i > $#INPUT) { - $start_t = $#TARGET; - } - - # backtrack to previous words if unaligned - if ($start_t == 1000) { - $start_t = -1; - for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt > $start_t; - } - } - } - $FRAME_INPUT{$start_t} .= $insertion; - } - - $currently_matching = 1; - } - - print "$action $s $i ($start_s $start_i) $currently_matching"; - if ($action ne "I") { - print " ->"; - foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) { - print " ".$tt; - } - } - print "\n"; - $s++ unless $action eq "I"; - $i++ unless $action eq "D"; - } - - - print $target."\n"; - foreach (@TARGET_BITMAP) { print $_; } print "\n"; - foreach (sort keys %FRAME_INPUT) { - print "$_: $FRAME_INPUT{$_}\n"; - } - - ### STEP 2: BUILD FRAME - - # modify frame - my $frame = ""; - $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; - - my $currently_included = 0; - my $start_t = -1; - push @TARGET_BITMAP,0; # indicate end - - for(my $t=0;$t<=scalar(@TARGET);$t++) { - - # beginning of tm target inclusion - if (!$currently_included && $TARGET_BITMAP[$t]) { - $start_t = $t; - $currently_included = 1; - } - - # end of tm target inclusion (not included word or inserted input) - elsif ($currently_included && - (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) { - # add xml (unless change is at the beginning of the sentence - if ($start_t >= 0) { - my $target = ""; - print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; - for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) { - $target .= $TARGET[$tt] . " "; - } - chop($target); - $frame .= "<xml translation=\"$target\"> x </xml> "; - } - $currently_included = 0; - } - - $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; - print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; - } - - print $frame."\n-------------------------------------\n"; - return $frame; -} - -sub create_alignment { - my ($line) = @_; - my (@ALIGNED_TO_S,@ALIGNED_TO_T); - foreach my $point (split(/ /,$line)) { - my ($s,$t) = split(/\-/,$point); - $ALIGNED_TO_S[$s]{$t}++; - $ALIGNED_TO_T[$t]{$s}++; - } - my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T ); - return %ALIGNMENT; -} diff --git a/contrib/fuzzy-match/old/fuzzy-match.cpp b/contrib/fuzzy-match/old/fuzzy-match.cpp deleted file mode 100644 index 76c69e246..000000000 --- a/contrib/fuzzy-match/old/fuzzy-match.cpp +++ /dev/null @@ -1,982 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <getopt.h> -#include <vector> -#include <map> -#include <string> -#include <algorithm> -#include <iostream> -#include <fstream> -#include <cstring> -#include <time.h> - -#include "Vocabulary.h" -#include "SuffixArray.h" - -/** This implementation is explained in - Koehn and Senellart: "Fast Approximate String Matching - with Suffix Arrays and A* Parsing" (AMTA 2010) ***/ - -using namespace std; - -Vocabulary vocabulary; - -int basic_flag = false; -int lsed_flag = true; -int refined_flag = true; -int length_filter_flag = true; -int parse_flag = true; -int min_match = 70; -int multiple_flag = false; -int multiple_slack = 0; -int multiple_max = 100; - -void load_corpus( char* fileName, vector< vector< WORD_ID > > &corpus ) -{ - ifstream fileStream; - fileStream.open(fileName); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - istream *fileStreamP = &fileStream; - - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - corpus.push_back( vocabulary.Tokenize( line ) ); - } -} - - -/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */ - -// global cache for word pairs -map< pair< WORD_ID, WORD_ID >, unsigned int > lsed; - -unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx ) -{ - // check if already computed -> lookup in cache - pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx ); - map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx ); - if (lookup != lsed.end()) - { - return (lookup->second); - } - - // get surface strings for word indices - const string &a = vocabulary.GetWord( aIdx ); - const string &b = vocabulary.GetWord( bIdx ); - - // initialize cost matrix - unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); - for( unsigned int i=0; i<=a.size(); i++ ) { - cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); - cost[i][0] = i; - } - for( unsigned int j=0; j<=b.size(); j++ ) { - cost[0][j] = j; - } - - // core string edit distance loop - for( unsigned int i=1; i<=a.size(); i++ ) { - for( unsigned int j=1; j<=b.size(); j++ ) { - - unsigned int ins = cost[i-1][j] + 1; - unsigned int del = cost[i][j-1] + 1; - bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0); - unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1); - - unsigned int min = (ins < del) ? ins : del; - min = (diag < min) ? diag : min; - - cost[i][j] = min; - } - } - - // clear out memory - unsigned int final = cost[a.size()][b.size()]; - for( unsigned int i=0; i<=a.size(); i++ ) { - free( cost[i] ); - } - free( cost ); - - // cache and return result - lsed[ pIdx ] = final; - return final; -} - -/* string edit distance implementation */ - -unsigned int sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) { - - // initialize cost and path matrices - unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); - char **path = (char**) calloc( sizeof( char* ), a.size()+1 ); - - for( unsigned int i=0; i<=a.size(); i++ ) { - cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); - path[i] = (char*) calloc( sizeof(char), b.size()+1 ); - if (i>0) - { - cost[i][0] = cost[i-1][0]; - if (use_letter_sed) - { - cost[i][0] += vocabulary.GetWord( a[i-1] ).size(); - } - else - { - cost[i][0]++; - } - } - else - { - cost[i][0] = 0; - } - path[i][0] = 'I'; - } - - for( unsigned int j=0; j<=b.size(); j++ ) { - if (j>0) - { - cost[0][j] = cost[0][j-1]; - if (use_letter_sed) - { - cost[0][j] += vocabulary.GetWord( b[j-1] ).size(); - } - else - { - cost[0][j]++; - } - } - else - { - cost[0][j] = 0; - } - path[0][j] = 'D'; - } - - // core string edit distance algorithm - for( unsigned int i=1; i<=a.size(); i++ ) { - for( unsigned int j=1; j<=b.size(); j++ ) { - unsigned int ins = cost[i-1][j]; - unsigned int del = cost[i][j-1]; - unsigned int match; - if (use_letter_sed) - { - ins += vocabulary.GetWord( a[i-1] ).size(); - del += vocabulary.GetWord( b[j-1] ).size(); - match = letter_sed( a[i-1], b[j-1] ); - } - else - { - ins++; - del++; - match = ( a[i-1] == b[j-1] ) ? 0 : 1; - } - unsigned int diag = cost[i-1][j-1] + match; - - char action = (ins < del) ? 'I' : 'D'; - unsigned int min = (ins < del) ? ins : del; - if (diag < min) - { - action = (match>0) ? 'S' : 'M'; - min = diag; - } - - cost[i][j] = min; - path[i][j] = action; - } - } - - // construct string for best path - unsigned int i = a.size(); - unsigned int j = b.size(); - best_path = ""; - while( i>0 || j>0 ) - { - best_path = path[i][j] + best_path; - if (path[i][j] == 'I') - { - i--; - } - else if (path[i][j] == 'D') - { - j--; - } - else - { - i--; - j--; - } - } - - - // clear out memory - unsigned int final = cost[a.size()][b.size()]; - - for( unsigned int i=0; i<=a.size(); i++ ) { - free( cost[i] ); - free( path[i] ); - } - free( cost ); - free( path ); - - // return result - return final; -} - -/* utlility function: compute length of sentence in characters - (spaces do not count) */ - -unsigned int compute_length( const vector< WORD_ID > &sentence ) -{ - unsigned int length = 0; for( unsigned int i=0; i<sentence.size(); i++ ) - { - length += vocabulary.GetWord( sentence[i] ).size(); - } - return length; -} - -/* brute force method: compare input to all corpus sentences */ - -int basic_fuzzy_match( vector< vector< WORD_ID > > source, - vector< vector< WORD_ID > > input ) -{ - // go through input set... - for(unsigned int i=0;i<input.size();i++) - { - bool use_letter_sed = false; - - // compute sentence length and worst allowed cost - unsigned int input_length; - if (use_letter_sed) - { - input_length = compute_length( input[i] ); - } - else - { - input_length = input[i].size(); - } - unsigned int best_cost = input_length * (100-min_match) / 100 + 2; - string best_path = ""; - int best_match = -1; - - // go through all corpus sentences - for(unsigned int s=0;s<source.size();s++) - { - int source_length; - if (use_letter_sed) - { - source_length = compute_length( source[s] ); - } - else - { - source_length = source[s].size(); - } - int diff = abs((int)source_length - (int)input_length); - if (length_filter_flag && (diff >= best_cost)) - { - continue; - } - - // compute string edit distance - string path; - unsigned int cost = sed( input[i], source[s], path, use_letter_sed ); - - // update if new best - if (cost < best_cost) - { - best_cost = cost; - best_path = path; - best_match = s; - } - } - cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl; - } -} - -#define MAX_MATCH_COUNT 10000000 - -/* data structure for n-gram match between input and corpus */ - -class Match { -public: - int input_start; - int input_end; - int tm_start; - int tm_end; - int min_cost; - int max_cost; - int internal_cost; - Match( int is, int ie, int ts, int te, int min, int max, int i ) - :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i) - {} -}; - -map< WORD_ID,vector< int > > single_word_index; - -/* definition of short matches - very short n-gram matches (1-grams) will not be looked up in - the suffix array, since there are too many matches - and for longer sentences, at least one 2-gram match must occur */ - -inline int short_match_max_length( int input_length ) -{ - if ( ! refined_flag ) - return 0; - if ( input_length >= 5 ) - return 1; - return 0; -} - -/* if we have non-short matches in a sentence, we need to - take a closer look at it. - this function creates a hash map for all input words and their positions - (to be used by the next function) - (done here, because this has be done only once for an input sentence) */ - -void init_short_matches( const vector< WORD_ID > &input ) -{ - int max_length = short_match_max_length( input.size() ); - if (max_length == 0) - return; - - single_word_index.clear(); - - // store input words and their positions in hash map - for(int i=0; i<input.size(); i++) - { - if (single_word_index.find( input[i] ) == single_word_index.end()) - { - vector< int > position_vector; - single_word_index[ input[i] ] = position_vector; - } - single_word_index[ input[i] ].push_back( i ); - } -} - -/* add all short matches to list of matches for a sentence */ - -void add_short_matches( vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost ) -{ - int max_length = short_match_max_length( input_length ); - if (max_length == 0) - return; - - int tm_length = tm.size(); - map< WORD_ID,vector< int > >::iterator input_word_hit; - for(int t_pos=0; t_pos<tm.size(); t_pos++) - { - input_word_hit = single_word_index.find( tm[t_pos] ); - if (input_word_hit != single_word_index.end()) - { - vector< int > &position_vector = input_word_hit->second; - for(int j=0; j<position_vector.size(); j++) - { - int &i_pos = position_vector[j]; - - // before match - int max_cost = max( i_pos , t_pos ); - int min_cost = abs( i_pos - t_pos ); - if ( i_pos>0 && i_pos == t_pos ) - min_cost++; - - // after match - max_cost += max( (input_length-i_pos) , (tm_length-t_pos)); - min_cost += abs( (input_length-i_pos) - (tm_length-t_pos)); - if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos)) - min_cost++; - - if (min_cost <= best_cost) - { - Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ); - match.push_back( new_match ); - } - } - } - } -} - -/* remove matches that are subsumed by a larger match */ - -vector< Match > prune_matches( const vector< Match > &match, int best_cost ) -{ - //cerr << "\tpruning"; - vector< Match > pruned; - for(int i=match.size()-1; i>=0; i--) - { - //cerr << " (" << match[i].input_start << "," << match[i].input_end - // << " ; " << match[i].tm_start << "," << match[i].tm_end - // << " * " << match[i].min_cost << ")"; - - //if (match[i].min_cost > best_cost) - // continue; - - bool subsumed = false; - for(int j=match.size()-1; j>=0; j--) - { - if (i!=j // do not compare match with itself - && ( match[i].input_end - match[i].input_start <= - match[j].input_end - match[j].input_start ) // i shorter than j - && ((match[i].input_start == match[j].input_start && - match[i].tm_start == match[j].tm_start ) || - (match[i].input_end == match[j].input_end && - match[i].tm_end == match[j].tm_end) ) ) - { - subsumed = true; - } - } - if (! subsumed && match[i].min_cost <= best_cost) - { - //cerr << "*"; - pruned.push_back( match[i] ); - } - } - //cerr << endl; - return pruned; -} - -/* A* parsing method to compute string edit distance */ - -int parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost ) -{ - // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl; - - if (match.size() == 1) - return match[0].max_cost; - if (match.size() == 0) - return input_length+tm_length; - - int this_best_cost = input_length + tm_length; - for(int i=0;i<match.size();i++) - { - this_best_cost = min( this_best_cost, match[i].max_cost ); - } - // cerr << "\tthis best cost: " << this_best_cost << endl; - - // bottom up combination of spans - vector< vector< Match > > multi_match; - multi_match.push_back( match ); - - int match_level = 1; - while(multi_match[ match_level-1 ].size()>0) - { - // init vector - vector< Match > empty; - multi_match.push_back( empty ); - - for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) - { - int second_level = match_level - first_level -1; - //cerr << "\tcombining level " << first_level << " and " << second_level << endl; - - vector< Match > &first_match = multi_match[ first_level ]; - vector< Match > &second_match = multi_match[ second_level ]; - - for(int i1 = 0; i1 < first_match.size(); i1++) { - for(int i2 = 0; i2 < second_match.size(); i2++) { - - // do not combine the same pair twice - if (first_level == second_level && i2 <= i1) - { - continue; - } - - // get sorted matches (first is before second) - Match *first, *second; - if (first_match[i1].input_start < second_match[i2].input_start ) - { - first = &first_match[i1]; - second = &second_match[i2]; - } - else - { - second = &first_match[i1]; - first = &second_match[i2]; - } - - //cerr << "\tcombining " - // << "(" << first->input_start << "," << first->input_end << "), " - // << first->tm_start << " [" << first->internal_cost << "]" - // << " with " - // << "(" << second->input_start << "," << second->input_end << "), " - // << second->tm_start<< " [" << second->internal_cost << "]" - // << endl; - - // do not process overlapping matches - if (first->input_end >= second->input_start) - { - continue; - } - - // no overlap / mismatch in tm - if (first->tm_end >= second->tm_start) - { - continue; - } - - // compute cost - int min_cost = 0; - int max_cost = 0; - - // initial - min_cost += abs( first->input_start - first->tm_start ); - max_cost += max( first->input_start, first->tm_start ); - - // same number of words, but not sent. start -> cost is at least 1 - if (first->input_start == first->tm_start && first->input_start > 0) - { - min_cost++; - } - - // in-between - int skipped_words = second->input_start - first->input_end -1; - int skipped_words_tm = second->tm_start - first->tm_end -1; - int internal_cost = max( skipped_words, skipped_words_tm ); - internal_cost += first->internal_cost + second->internal_cost; - min_cost += internal_cost; - max_cost += internal_cost; - - // final - min_cost += abs( (tm_length-1 - second->tm_end) - - (input_length-1 - second->input_end) ); - max_cost += max( (tm_length-1 - second->tm_end), - (input_length-1 - second->input_end) ); - - // same number of words, but not sent. end -> cost is at least 1 - if ( ( input_length-1 - second->input_end - == tm_length-1 - second->tm_end ) - && input_length-1 != second->input_end ) - { - min_cost++; - } - - // cerr << "\tcost: " << min_cost << "-" << max_cost << endl; - - // if worst than best cost, forget it - if (min_cost > best_cost) - { - continue; - } - - // add match - Match new_match( first->input_start, - second->input_end, - first->tm_start, - second->tm_end, - min_cost, - max_cost, - internal_cost); - multi_match[ match_level ].push_back( new_match ); - // cerr << "\tstored\n"; - - // possibly updating this_best_cost - if (max_cost < this_best_cost) - { - // cerr << "\tupdating this best cost to " << max_cost << "\n"; - this_best_cost = max_cost; - - // possibly updating best_cost - if (max_cost < best_cost) - { - // cerr << "\tupdating best cost to " << max_cost << "\n"; - best_cost = max_cost; - } - } - } - } - } - match_level++; - } - return this_best_cost; -} - -int main(int argc, char* argv[]) -{ - vector< vector< WORD_ID > > source, input; - - while(1) { - static struct option long_options[] = { - {"basic", no_argument, &basic_flag, 1}, - {"word", no_argument, &lsed_flag, 0}, - {"unrefined", no_argument, &refined_flag, 0}, - {"nolengthfilter", no_argument, &length_filter_flag, 0}, - {"noparse", no_argument, &parse_flag, 0}, - {"multiple", no_argument, &multiple_flag, 1}, - {"minmatch", required_argument, 0, 'm'}, - {0, 0, 0, 0} - }; - int option_index = 0; - int c = getopt_long (argc, argv, "m:", long_options, &option_index); - if (c == -1) break; - switch (c) { - case 0: -// if (long_options[option_index].flag != 0) -// break; -// printf ("option %s", long_options[option_index].name); -// if (optarg) -// printf (" with arg %s", optarg); -// printf ("\n"); - break; - case 'm': - min_match = atoi(optarg); - if (min_match < 1 || min_match > 100) { - cerr << "error: --minmatch must have value in range 1..100\n"; - exit(1); - } - cerr << "setting min match to " << min_match << endl; - break; - default: - cerr << "usage: syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n"; - exit(1); - } - } - if (lsed_flag) { cerr << "lsed\n"; } - if (basic_flag) { cerr << "basic\n"; } - if (refined_flag) { cerr << "refined\n"; } - if (length_filter_flag) { cerr << "length filter\n"; } - if (parse_flag) { cerr << "parse\n"; } -// exit(1); - - - if (optind+2 != argc) { - cerr << "syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n"; - exit(1); - } - - cerr << "loading corpus...\n"; - - load_corpus(argv[optind], input); - load_corpus(argv[optind+1], source); - - // ./fuzzy-match input corpus [-basic] - -// load_corpus("../corpus/tm.truecased.4.en", source); -// load_corpus("../corpus/tm.truecased.4.it", target); -// load_corpus("../evaluation/test.input.tc.4", input); - -// load_corpus("../../acquis-truecase/corpus/acquis.truecased.190.en", source); -// load_corpus("../../acquis-truecase/evaluation/ac-test.input.tc.190", input); - -// load_corpus("../corpus/tm.truecased.16.en", source); -// load_corpus("../evaluation/test.input.tc.16", input); - - if (basic_flag) { - cerr << "using basic method\n"; - clock_t start_main_clock2 = clock(); - basic_fuzzy_match( source, input ); - cerr << "total: " << (1000 * (clock()-start_main_clock2) / CLOCKS_PER_SEC) << endl; - exit(1); - } - - cerr << "number of input sentences " << input.size() << endl; - - cerr << "creating suffix array...\n"; -// SuffixArray suffixArray( "../corpus/tm.truecased.4.en" ); -// SuffixArray suffixArray( "../../acquis-truecase/corpus/acquis.truecased.190.en" ); - SuffixArray suffixArray( argv[optind+1] ); - - clock_t start_main_clock = clock(); - - // looping through all input sentences... - cerr << "looping...\n"; - for(unsigned int i=0;i<input.size();i++) - { - clock_t start_clock = clock(); - // if (i % 10 == 0) cerr << "."; - int input_id = i; // clean up this mess! - - // establish some basic statistics - - // int input_length = compute_length( input[i] ); - int input_length = input[i].size(); - int best_cost = input_length * (100-min_match) / 100 + 1; - - int match_count = 0; // how many substring matches to be considered - //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl; - - // find match ranges in suffix array - vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range; - for(size_t start=0;start<input[i].size();start++) - { - SuffixArray::INDEX prior_first_match = 0; - SuffixArray::INDEX prior_last_match = suffixArray.GetSize()-1; - vector< string > substring; - bool stillMatched = true; - vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart; - //cerr << "start: " << start; - for(int word=start; stillMatched && word<input[i].size(); word++) - { - substring.push_back( vocabulary.GetWord( input[i][word] ) ); - - // only look up, if needed (i.e. no unnecessary short gram lookups) -// if (! word-start+1 <= short_match_max_length( input_length ) ) - // { - SuffixArray::INDEX first_match, last_match; - stillMatched = false; - if (suffixArray.FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) - { - stillMatched = true; - matchedAtThisStart.push_back( make_pair( first_match, last_match ) ); - //cerr << " (" << first_match << "," << last_match << ")"; - //cerr << " " << ( last_match - first_match + 1 ); - prior_first_match = first_match; - prior_last_match = last_match; - } - //} - } - //cerr << endl; - match_range.push_back( matchedAtThisStart ); - } - - clock_t clock_range = clock(); - - map< int, vector< Match > > sentence_match; - map< int, int > sentence_match_word_count; - - // go through all matches, longest first - for(int length = input[i].size(); length >= 1; length--) - { - // do not create matches, if these are handled by the short match function - if (length <= short_match_max_length( input_length ) ) - { - continue; - } - - unsigned int count = 0; - for(int start = 0; start <= input[i].size() - length; start++) - { - if (match_range[start].size() >= length) - { - pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1]; - // cerr << " (" << range.first << "," << range.second << ")"; - count += range.second - range.first + 1; - - for(SuffixArray::INDEX i=range.first; i<=range.second; i++) - { - int position = suffixArray.GetPosition( i ); - - // sentence length mismatch - size_t sentence_id = suffixArray.GetSentence( position ); - int sentence_length = suffixArray.GetSentenceLength( sentence_id ); - int diff = abs( (int)sentence_length - (int)input_length ); - // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length; - //if (length <= 2 && input_length>=5 && - // sentence_match.find( sentence_id ) == sentence_match.end()) - // continue; - - if (diff > best_cost) - continue; - - // compute minimal cost - int start_pos = suffixArray.GetWordInSentence( position ); - int end_pos = start_pos + length-1; - // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. " - // << start << "-" << (start+length-1) << " (" << input_length << ")"; - // different number of prior words -> cost is at least diff - int min_cost = abs( start - start_pos ); - - // same number of words, but not sent. start -> cost is at least 1 - if (start == start_pos && start>0) - min_cost++; - - // different number of remaining words -> cost is at least diff - min_cost += abs( ( sentence_length-1 - end_pos ) - - ( input_length-1 - (start+length-1) ) ); - - // same number of words, but not sent. end -> cost is at least 1 - if ( sentence_length-1 - end_pos == - input_length-1 - (start+length-1) - && end_pos != sentence_length-1 ) - min_cost++; - - // cerr << " -> min_cost " << min_cost; - if (min_cost > best_cost) - continue; - - // valid match - match_count++; - - // compute maximal cost - int max_cost = max( start, start_pos ) - + max( sentence_length-1 - end_pos, - input_length-1 - (start+length-1) ); - // cerr << ", max_cost " << max_cost; - - Match m = Match( start, start+length-1, - start_pos, start_pos+length-1, - min_cost, max_cost, 0); - sentence_match[ sentence_id ].push_back( m ); - sentence_match_word_count[ sentence_id ] += length; - - if (max_cost < best_cost) - { - best_cost = max_cost; - if (best_cost == 0) break; - } - //if (match_count >= MAX_MATCH_COUNT) break; - } - } - // cerr << endl; - if (best_cost == 0) break; - //if (match_count >= MAX_MATCH_COUNT) break; - } - // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl; - - if (best_cost == 0) break; - //if (match_count >= MAX_MATCH_COUNT) break; - } - cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl; - - clock_t clock_matches = clock(); - - // consider each sentence for which we have matches - int old_best_cost = best_cost; - int tm_count_word_match = 0; - int tm_count_word_match2 = 0; - int pruned_match_count = 0; - if (short_match_max_length( input_length )) - { - init_short_matches( input[i] ); - } - vector< int > best_tm; - typedef map< int, vector< Match > >::iterator I; - - clock_t clock_validation_sum = 0; - - for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) - { - int tmID = tm->first; - int tm_length = suffixArray.GetSentenceLength(tmID); - vector< Match > &match = tm->second; - add_short_matches( match, source[tmID], input_length, best_cost ); - - //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl; - - // quick look: how many words are matched - int words_matched = 0; - for(int m=0;m<match.size();m++) { - - if (match[m].min_cost <= best_cost) // makes no difference - words_matched += match[m].input_end - match[m].input_start + 1; - } - if (max(input_length,tm_length) - words_matched > best_cost) - { - if (length_filter_flag) continue; - } - tm_count_word_match++; - - // prune, check again how many words are matched - vector< Match > pruned = prune_matches( match, best_cost ); - words_matched = 0; - for(int p=0;p<pruned.size();p++) { - words_matched += pruned[p].input_end - pruned[p].input_start + 1; - } - if (max(input_length,tm_length) - words_matched > best_cost) - { - if (length_filter_flag) continue; - } - tm_count_word_match2++; - - pruned_match_count += pruned.size(); - int prior_best_cost = best_cost; - int cost; - - clock_t clock_validation_start = clock(); - if (! parse_flag || - pruned.size()>=10) // to prevent worst cases - { - string path; - cost = sed( input[input_id], source[tmID], path, false ); - if (cost < best_cost) - { - best_cost = cost; - } - } - - else - { - cost = parse_matches( pruned, input_length, tm_length, best_cost ); - if (prior_best_cost != best_cost) - { - best_tm.clear(); - } - } - clock_validation_sum += clock() - clock_validation_start; - if (cost == best_cost) - { - best_tm.push_back( tmID ); - } - } - cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl; - cerr << "tm considered: " << sentence_match.size() - << " word-matched: " << tm_count_word_match - << " word-matched2: " << tm_count_word_match2 - << " best: " << best_tm.size() << endl; - - cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl; - - // do not try to find the best ... report multiple matches - if (multiple_flag) { - int input_letter_length = compute_length( input[input_id] ); - for(int si=0; si<best_tm.size(); si++) { - int s = best_tm[si]; - string path; - unsigned int letter_cost = sed( input[input_id], source[s], path, true ); - // do not report multiple identical sentences, but just their count - cout << i << " "; // sentence number - cout << letter_cost << "/" << input_letter_length << " "; - cout << "(" << best_cost <<"/" << input_length <<") "; - cout << "||| " << s << " ||| " << path << endl; - } - continue; - } - - // find the best matches according to letter sed - string best_path = ""; - int best_match = -1; - int best_letter_cost; - if (lsed_flag) { - best_letter_cost = compute_length( input[input_id] ) * min_match / 100 + 1; - for(int si=0; si<best_tm.size(); si++) - { - int s = best_tm[si]; - string path; - unsigned int letter_cost = sed( input[input_id], source[s], path, true ); - if (letter_cost < best_letter_cost) - { - best_letter_cost = letter_cost; - best_path = path; - best_match = s; - } - } - } - // if letter sed turned off, just compute path for first match - else { - if (best_tm.size() > 0) { - string path; - sed( input[input_id], source[best_tm[0]], path, false ); - best_path = path; - best_match = best_tm[0]; - } - } - cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC) - << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC) - << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC) - << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC) - << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")" - << " )" << endl; - if (lsed_flag) { - cout << best_letter_cost << "/" << compute_length( input[input_id] ) << " ("; - } - cout << best_cost <<"/" << input_length; - if (lsed_flag) cout << ")"; - cout << " ||| " << best_match << " ||| " << best_path << endl; - } - cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl; - - -} diff --git a/contrib/fuzzy-match/old/get-multiple-translations-for-uniq-sources.perl b/contrib/fuzzy-match/old/get-multiple-translations-for-uniq-sources.perl deleted file mode 100644 index 49e9ce1ec..000000000 --- a/contrib/fuzzy-match/old/get-multiple-translations-for-uniq-sources.perl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -my $src_in = "corpus/acquis.truecased.4.en"; -my $tgt_in = "corpus/acquis.truecased.4.fr"; -my $align_in = "model/aligned.4.grow-diag-final-and"; - -my $src_out = "data/acquis.truecased.4.en.uniq"; -my $tgt_out = "data/acquis.truecased.4.fr.uniq"; -my $tgt_mf = "data/acquis.truecased.4.fr.uniq.most-frequent"; -my $align_out = "data/acquis.truecased.4.align.uniq"; -my $align_mf = "data/acquis.truecased.4.align.uniq.most-frequent"; - -my (%TRANS,%ALIGN); - -open(SRC,$src_in); -open(TGT,$tgt_in); -open(ALIGN,$align_in); -while(my $src = <SRC>) { - my $tgt = <TGT>; - my $align = <ALIGN>; - chop($tgt); - chop($align); - $TRANS{$src}{$tgt}++; - $ALIGN{$src}{$tgt} = $align; -} -close(SRC); -close(TGT); - -open(SRC_OUT,">$src_out"); -open(TGT_OUT,">$tgt_out"); -open(TGT_MF, ">$tgt_mf"); -open(ALIGN_OUT,">$align_out"); -open(ALIGN_MF, ">$align_mf"); -foreach my $src (keys %TRANS) { - print SRC_OUT $src; - my $first = 1; - my ($max,$best) = (0); - foreach my $tgt (keys %{$TRANS{$src}}) { - print TGT_OUT " ||| " unless $first; - print TGT_OUT $TRANS{$src}{$tgt}." ".$tgt; - print ALIGN_OUT " ||| " unless $first; - print ALIGN_OUT $ALIGN{$src}{$tgt}; - if ($TRANS{$src}{$tgt} > $max) { - $max = $TRANS{$src}{$tgt}; - $best = $tgt; - } - $first = 0; - } - print TGT_OUT "\n"; - print ALIGN_OUT "\n"; - print TGT_MF $best."\n"; - print ALIGN_MF $ALIGN{$src}{$best}."\n"; -} -close(SRC_OUT); -close(TGT_OUT); - diff --git a/contrib/fuzzy-match/old/make-pt-from-tm.perl b/contrib/fuzzy-match/old/make-pt-from-tm.perl deleted file mode 100755 index 6bdb2fa93..000000000 --- a/contrib/fuzzy-match/old/make-pt-from-tm.perl +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin qw($RealBin); -use File::Basename; - -my $DEBUG = 1; -my $OUTPUT_RULES = 1; - -#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/"; -my $in_file = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4"; -my $source_file = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq"; -my $target_file = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq"; -my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq"; -my $lex_file = $ARGV[4]; #$data_root/in/lex.4; -my $pt_file = $ARGV[5]; #"$data_root/out/pt"; - -my $cmd; - -my $TMPDIR=dirname($pt_file) ."/tmp.$$"; -$cmd = "mkdir -p $TMPDIR"; -`$cmd`; - -my $match_file = "$TMPDIR/match"; - -# suffix array creation and extraction -$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file"; -print STDERR "$cmd \n"; -`$cmd`; - -# make into xml and pt -my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning"; - -my @INPUT = `cat $in_file`; chop(@INPUT); -my @ALL_SOURCE = `cat $source_file`; chop(@ALL_SOURCE); -my @ALL_TARGET = `cat $target_file`; chop(@ALL_TARGET); -my @ALL_ALIGNMENT = `cat $alignment_file`; chop(@ALL_ALIGNMENT); - -open(MATCH,$match_file); -open(FRAME,">$out_file"); -open(RULE,">$out_file.extract") if $OUTPUT_RULES; -open(RULE_INV,">$out_file.extract.inv") if $OUTPUT_RULES; -open(INFO,">$out_file.info"); -while( my $match = <MATCH> ) { - chop($match); - my ($score,$sentence,$path) = split(/ \|\|\| /,$match); - - $score =~ /^(\d+) (.+)/ || die; - my ($i,$match_score) = ($1,$2); - print STDERR "i=$i match_score=$match_score\n"; - - # construct frame - if ($sentence < 1e9 && $sentence >= 0) { - my $SOURCE = $ALL_SOURCE[$sentence]; - my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]); - my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]); - - for(my $j=0;$j<scalar(@TARGET);$j++) { - $TARGET[$j] =~ /^(\d+) (.+)$/ || die; - my ($target_count,$target) = ($1,$2); - my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = - &create_xml($SOURCE, - $INPUT[$i], - $target, - $ALIGNMENT[$j], - $path); - print FRAME $frame."\n"; - print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES; - print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES; - print INFO "$i ||| $match_score ||| $target_count\n"; - } - } -} -close(FRAME); -close(MATCH); -close(RULE) if $OUTPUT_RULES; -close(RULE_INV) if $OUTPUT_RULES; - -`LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`; -`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`; - -if ($OUTPUT_RULES) -{ - $cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file"; - print STDERR "Executing: $cmd \n"; - `$cmd`; -} - -#$cmd = "rm -rf $TMPDIR"; -#`$cmd`; - -####################################################### -sub create_xml { - my ($source,$input,$target,$alignment,$path) = @_; - - print STDERR " HIEU \n $source \n $input \n $target \n $alignment \n $path \n"; - - my @INPUT = split(/ /,$input); - my @SOURCE = split(/ /,$source); - my @TARGET = split(/ /,$target); - my %ALIGN = &create_alignment($alignment); - - my %FRAME_INPUT; - my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S); - foreach (@TARGET) { push @TARGET_BITMAP,1 } - - ### STEP 1: FIND MISMATCHES - - my ($s,$i) = (0,0); - my $currently_matching = 0; - my ($start_s,$start_i) = (0,0); - - $path .= "X"; # indicate end - print STDERR "$input\n$source\n$target\n$path\n"; - for(my $p=0;$p<length($path);$p++) { - my $action = substr($path,$p,1); - - # beginning of a mismatch - if ($currently_matching && $action ne "M" && $action ne "X") { - $start_i = $i; - $start_s = $s; - $currently_matching = 0; - } - - # end of a mismatch - elsif (!$currently_matching && - ($action eq "M" || $action eq "X")) { - - # remove use of affected target words - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $TARGET_BITMAP[$tt] = 0; - } - - # also remove enclosed unaligned words? - } - - # are there input words that need to be inserted ? - print STDERR "($start_i<$i)?\n"; - if ($start_i<$i) { - - # take note of input words to be inserted - my $insertion = ""; - for(my $ii = $start_i; $ii<$i; $ii++) { - $insertion .= $INPUT[$ii]." "; - } - - # find position for inserted input words - - # find first removed target word - my $start_t = 1000; - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt < $start_t; - } - } - - # end of sentence? add to end - if ($start_t == 1000 && $i > $#INPUT) { - $start_t = $#TARGET; - } - - # backtrack to previous words if unaligned - if ($start_t == 1000) { - $start_t = -1; - for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt > $start_t; - } - } - } - $FRAME_INPUT{$start_t} .= $insertion; - my %NT = ("start_t" => $start_t, - "start_i" => $start_i ); - push @NT,\%NT; - } - $currently_matching = 1; - } - - print STDERR "$action $s $i ($start_s $start_i) $currently_matching"; - if ($action ne "I") { - print STDERR " ->"; - foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) { - print STDERR " ".$tt; - } - } - print STDERR "\n"; - $s++ unless $action eq "I"; - $i++ unless $action eq "D"; - $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D"; - push @INPUT_BITMAP, 1 if $action eq "M"; - push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S"; - } - - - print STDERR $target."\n"; - foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n"; - foreach (sort keys %FRAME_INPUT) { - print STDERR "$_: $FRAME_INPUT{$_}\n"; - } - - ### STEP 2: BUILD RULE AND FRAME - - # hierarchical rule - my $rule_s = ""; - my $rule_pos_s = 0; - my %RULE_ALIGNMENT_S; - for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) { - if ($INPUT_BITMAP[$i]) { - $rule_s .= $INPUT[$i]." "; - $RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++; - } - foreach my $NT (@NT) { - if ($i == $$NT{"start_i"}) { - $rule_s .= "[X][X] "; - $$NT{"rule_pos_s"} = $rule_pos_s++; - } - } - } - - my $rule_t = ""; - my $rule_pos_t = 0; - my %RULE_ALIGNMENT_T; - for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) { - if ($t>=0 && $TARGET_BITMAP[$t]) { - $rule_t .= $TARGET[$t]." "; - $RULE_ALIGNMENT_T{$t} = $rule_pos_t++; - } - foreach my $NT (@NT) { - if ($t == $$NT{"start_t"}) { - $rule_t .= "[X][X] "; - $$NT{"rule_pos_t"} = $rule_pos_t++; - } - } - } - - my $rule_alignment = ""; - foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) { - foreach my $t (keys %{$ALIGN{"s"}[$s]}) { - next unless defined($RULE_ALIGNMENT_T{$t}); - $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." "; - } - } - foreach my $NT (@NT) { - $rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." "; - } - - chop($rule_s); - chop($rule_t); - chop($rule_alignment); - - my $rule_alignment_inv = ""; - foreach (split(/ /,$rule_alignment)) { - /^(\d+)\-(\d+)$/; - $rule_alignment_inv .= "$2-$1 "; - } - chop($rule_alignment_inv); - - # frame - my $frame = ""; - $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; - - my $currently_included = 0; - my $start_t = -1; - push @TARGET_BITMAP,0; # indicate end - - for(my $t=0;$t<=scalar(@TARGET);$t++) { - # beginning of tm target inclusion - if (!$currently_included && $TARGET_BITMAP[$t]) { - $start_t = $t; - $currently_included = 1; - } - - # end of tm target inclusion (not included word or inserted input) - elsif ($currently_included && - (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) { - # add xml (unless change is at the beginning of the sentence - if ($start_t >= 0) { - my $target = ""; - print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; - for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) { - $target .= $TARGET[$tt] . " "; - } - chop($target); - $frame .= "<xml translation=\"$target\"> x </xml> "; - } - $currently_included = 0; - } - - $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; - print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; - } - - print STDERR $frame."\n-------------------------------------\n"; - return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv); -} - -sub create_alignment { - my ($line) = @_; - my (@ALIGNED_TO_S,@ALIGNED_TO_T); - foreach my $point (split(/ /,$line)) { - my ($s,$t) = split(/\-/,$point); - $ALIGNED_TO_S[$s]{$t}++; - $ALIGNED_TO_T[$t]{$s}++; - } - my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T ); - return %ALIGNMENT; -} diff --git a/contrib/fuzzy-match/old/make-pt-from-tm2.perl b/contrib/fuzzy-match/old/make-pt-from-tm2.perl deleted file mode 100755 index 3a5fa4171..000000000 --- a/contrib/fuzzy-match/old/make-pt-from-tm2.perl +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/perl -w -d - -use strict; -use FindBin qw($RealBin); -use File::Basename; - -my $DEBUG = 1; -my $OUTPUT_RULES = 1; - -#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/"; -my $in_file = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4"; -my $source_file = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq"; -my $target_file = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq"; -my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq"; -my $lex_file = $ARGV[4]; #$data_root/in/lex.4; -my $pt_file = $ARGV[5]; #"$data_root/out/pt"; - -my $cmd; - -my $TMPDIR= "/tmp/tmp.$$"; -$cmd = "mkdir -p $TMPDIR"; -`$cmd`; -$TMPDIR = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/out/tmp.3196"; - -my $match_file = "$TMPDIR/match"; - -# suffix array creation and extraction -$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file"; -`$cmd`; - -# make into xml and pt -my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning"; - -open(MATCH,$match_file); -open(FRAME,">$out_file"); -open(RULE,">$out_file.extract") if $OUTPUT_RULES; -open(RULE_INV,">$out_file.extract.inv") if $OUTPUT_RULES; -open(INFO,">$out_file.info"); -while( my $match = <MATCH> ) { - chop($match); - my ($score,$sentence,$path) = split(/ \|\|\| /,$match); - - $score =~ /^(\d+) (.+)/ || die; - my ($i,$match_score) = ($1,$2); - - # construct frame - if ($sentence < 1e9 && $sentence >= 0) { - my $SOURCE = $ALL_SOURCE[$sentence]; - my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]); - my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]); - - for(my $j=0;$j<scalar(@TARGET);$j++) { - $TARGET[$j] =~ /^(\d+) (.+)$/ || die; - my ($target_count,$target) = ($1,$2); - my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = - &create_xml($SOURCE, - $INPUT[$i], - $target, - $ALIGNMENT[$j], - $path); - print FRAME $frame."\n"; - print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES; - print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES; - print INFO "$i ||| $match_score ||| $target_count\n"; - } - } -} -close(FRAME); -close(MATCH); -close(RULE) if $OUTPUT_RULES; -close(RULE_INV) if $OUTPUT_RULES; - -`LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`; -`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`; - -if ($OUTPUT_RULES) -{ - $cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file"; - print STDERR "Executing: $cmd \n"; - `$cmd`; -} - -#$cmd = "rm -rf $TMPDIR"; -#`$cmd`; - -####################################################### -sub create_xml { - my ($source,$input,$target,$alignment,$path) = @_; - - my @INPUT = split(/ /,$input); - my @SOURCE = split(/ /,$source); - my @TARGET = split(/ /,$target); - my %ALIGN = &create_alignment($alignment); - - my %FRAME_INPUT; - my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S); - foreach (@TARGET) { push @TARGET_BITMAP,1 } - - ### STEP 1: FIND MISMATCHES - - my ($s,$i) = (0,0); - my $currently_matching = 0; - my ($start_s,$start_i) = (0,0); - - $path .= "X"; # indicate end - print STDERR "$input\n$source\n$target\n$path\n"; - for(my $p=0;$p<length($path);$p++) { - my $action = substr($path,$p,1); - - # beginning of a mismatch - if ($currently_matching && $action ne "M" && $action ne "X") { - $start_i = $i; - $start_s = $s; - $currently_matching = 0; - } - - # end of a mismatch - elsif (!$currently_matching && - ($action eq "M" || $action eq "X")) { - - # remove use of affected target words - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $TARGET_BITMAP[$tt] = 0; - } - - # also remove enclosed unaligned words? - } - - # are there input words that need to be inserted ? - print STDERR "($start_i<$i)?\n"; - if ($start_i<$i) { - - # take note of input words to be inserted - my $insertion = ""; - for(my $ii = $start_i; $ii<$i; $ii++) { - $insertion .= $INPUT[$ii]." "; - } - - # find position for inserted input words - - # find first removed target word - my $start_t = 1000; - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt < $start_t; - } - } - - # end of sentence? add to end - if ($start_t == 1000 && $i > $#INPUT) { - $start_t = $#TARGET; - } - - # backtrack to previous words if unaligned - if ($start_t == 1000) { - $start_t = -1; - for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt > $start_t; - } - } - } - $FRAME_INPUT{$start_t} .= $insertion; - my %NT = ("start_t" => $start_t, - "start_i" => $start_i ); - push @NT,\%NT; - } - $currently_matching = 1; - } - - print STDERR "$action $s $i ($start_s $start_i) $currently_matching"; - if ($action ne "I") { - print STDERR " ->"; - foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) { - print STDERR " ".$tt; - } - } - print STDERR "\n"; - $s++ unless $action eq "I"; - $i++ unless $action eq "D"; - $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D"; - push @INPUT_BITMAP, 1 if $action eq "M"; - push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S"; - } - - - print STDERR $target."\n"; - foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n"; - foreach (sort keys %FRAME_INPUT) { - print STDERR "$_: $FRAME_INPUT{$_}\n"; - } - - ### STEP 2: BUILD RULE AND FRAME - - # hierarchical rule - my $rule_s = ""; - my $rule_pos_s = 0; - my %RULE_ALIGNMENT_S; - for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) { - if ($INPUT_BITMAP[$i]) { - $rule_s .= $INPUT[$i]." "; - $RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++; - } - foreach my $NT (@NT) { - if ($i == $$NT{"start_i"}) { - $rule_s .= "[X][X] "; - $$NT{"rule_pos_s"} = $rule_pos_s++; - } - } - } - - my $rule_t = ""; - my $rule_pos_t = 0; - my %RULE_ALIGNMENT_T; - for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) { - if ($t>=0 && $TARGET_BITMAP[$t]) { - $rule_t .= $TARGET[$t]." "; - $RULE_ALIGNMENT_T{$t} = $rule_pos_t++; - } - foreach my $NT (@NT) { - if ($t == $$NT{"start_t"}) { - $rule_t .= "[X][X] "; - $$NT{"rule_pos_t"} = $rule_pos_t++; - } - } - } - - my $rule_alignment = ""; - foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) { - foreach my $t (keys %{$ALIGN{"s"}[$s]}) { - next unless defined($RULE_ALIGNMENT_T{$t}); - $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." "; - } - } - foreach my $NT (@NT) { - $rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." "; - } - - chop($rule_s); - chop($rule_t); - chop($rule_alignment); - - my $rule_alignment_inv = ""; - foreach (split(/ /,$rule_alignment)) { - /^(\d+)\-(\d+)$/; - $rule_alignment_inv .= "$2-$1 "; - } - chop($rule_alignment_inv); - - # frame - my $frame = ""; - $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; - - my $currently_included = 0; - my $start_t = -1; - push @TARGET_BITMAP,0; # indicate end - - for(my $t=0;$t<=scalar(@TARGET);$t++) { - # beginning of tm target inclusion - if (!$currently_included && $TARGET_BITMAP[$t]) { - $start_t = $t; - $currently_included = 1; - } - - # end of tm target inclusion (not included word or inserted input) - elsif ($currently_included && - (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) { - # add xml (unless change is at the beginning of the sentence - if ($start_t >= 0) { - my $target = ""; - print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; - for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) { - $target .= $TARGET[$tt] . " "; - } - chop($target); - $frame .= "<xml translation=\"$target\"> x </xml> "; - } - $currently_included = 0; - } - - $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; - print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; - } - - print STDERR $frame."\n-------------------------------------\n"; - return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv); -} - -sub create_alignment { - my ($line) = @_; - my (@ALIGNED_TO_S,@ALIGNED_TO_T); - foreach my $point (split(/ /,$line)) { - my ($s,$t) = split(/\-/,$point); - $ALIGNED_TO_S[$s]{$t}++; - $ALIGNED_TO_T[$t]{$s}++; - } - my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T ); - return %ALIGNMENT; -} diff --git a/contrib/fuzzy-match/old/make-xml-from-match-multiple.perl b/contrib/fuzzy-match/old/make-xml-from-match-multiple.perl deleted file mode 100755 index e16c9de75..000000000 --- a/contrib/fuzzy-match/old/make-xml-from-match-multiple.perl +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -my $DEBUG = 1; -my $OUTPUT_RULES = 1; - -my $scripts_root_dir = "/Users/hieuhoang/workspace/github/hieuhoang/scripts"; - -my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/"; -#my $match_file = "$data_root/in/BEST.acquis-xml-escaped.4.uniq.multi.tuning"; -my $match_file = "$data_root/out/BEST"; -my $source_file = "$data_root/in/acquis.truecased.4.en.uniq"; -my $target_file = "$data_root/in/acquis.truecased.4.fr.uniq"; -my $alignment_file = "$data_root/in/acquis.truecased.4.align.uniq"; -my $out_file = "$data_root/out/ac-test.input.xml.4.uniq.multi.tuning"; -my $in_file = "$data_root/in/ac-test.input.tc.4"; - -#my $match_file = "tm/BEST.acquis-xml-escaped.4.uniq.multi"; -#my $source_file = "data/acquis.truecased.4.en.uniq"; -#my $target_file = "data/acquis.truecased.4.fr.uniq"; -#my $alignment_file = "data/acquis.truecased.4.align.uniq"; -#my $out_file = "data/ac-test.input.xml.4.uniq.multi.xxx"; -#my $in_file = "evaluation/ac-test.input.tc.4"; - -my @INPUT = `cat $in_file`; chop(@INPUT); -my @ALL_SOURCE = `cat $source_file`; chop(@ALL_SOURCE); -my @ALL_TARGET = `cat $target_file`; chop(@ALL_TARGET); -my @ALL_ALIGNMENT = `cat $alignment_file`; chop(@ALL_ALIGNMENT); - -open(MATCH,$match_file); -open(FRAME,">$out_file"); -open(RULE,">$out_file.extract") if $OUTPUT_RULES; -open(RULE_INV,">$out_file.extract.inv") if $OUTPUT_RULES; -open(INFO,">$out_file.info"); -while( my $match = <MATCH> ) { - chop($match); - my ($score,$sentence,$path) = split(/ \|\|\| /,$match); - - $score =~ /^(\d+) (.+)/ || die; - my ($i,$match_score) = ($1,$2); - - # construct frame - if ($sentence < 1e9 && $sentence >= 0) { - my $SOURCE = $ALL_SOURCE[$sentence]; - my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]); - my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]); - - for(my $j=0;$j<scalar(@TARGET);$j++) { - $TARGET[$j] =~ /^(\d+) (.+)$/ || die; - my ($target_count,$target) = ($1,$2); - my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = - &create_xml($SOURCE, - $INPUT[$i], - $target, - $ALIGNMENT[$j], - $path); - print FRAME $frame."\n"; - print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES; - print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES; - print INFO "$i ||| $match_score ||| $target_count\n"; - } - } -} -close(FRAME); -close(MATCH); -close(RULE) if $OUTPUT_RULES; -close(RULE_INV) if $OUTPUT_RULES; - -`LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`; -`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`; - -`$scripts_root_dir/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $data_root/in/lex.4 -phrase-translation-table $out_file.phrase-table` if $OUTPUT_RULES; - -sub create_xml { - my ($source,$input,$target,$alignment,$path) = @_; - - my @INPUT = split(/ /,$input); - my @SOURCE = split(/ /,$source); - my @TARGET = split(/ /,$target); - my %ALIGN = &create_alignment($alignment); - - my %FRAME_INPUT; - my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S); - foreach (@TARGET) { push @TARGET_BITMAP,1 } - - ### STEP 1: FIND MISMATCHES - - my ($s,$i) = (0,0); - my $currently_matching = 0; - my ($start_s,$start_i) = (0,0); - - $path .= "X"; # indicate end - print "$input\n$source\n$target\n$path\n"; - for(my $p=0;$p<length($path);$p++) { - my $action = substr($path,$p,1); - - # beginning of a mismatch - if ($currently_matching && $action ne "M" && $action ne "X") { - $start_i = $i; - $start_s = $s; - $currently_matching = 0; - } - - # end of a mismatch - elsif (!$currently_matching && - ($action eq "M" || $action eq "X")) { - - # remove use of affected target words - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $TARGET_BITMAP[$tt] = 0; - } - - # also remove enclosed unaligned words? - } - - # are there input words that need to be inserted ? - print "($start_i<$i)?\n"; - if ($start_i<$i) { - - # take note of input words to be inserted - my $insertion = ""; - for(my $ii = $start_i; $ii<$i; $ii++) { - $insertion .= $INPUT[$ii]." "; - } - - # find position for inserted input words - - # find first removed target word - my $start_t = 1000; - for(my $ss = $start_s; $ss<$s; $ss++) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt < $start_t; - } - } - - # end of sentence? add to end - if ($start_t == 1000 && $i > $#INPUT) { - $start_t = $#TARGET; - } - - # backtrack to previous words if unaligned - if ($start_t == 1000) { - $start_t = -1; - for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt > $start_t; - } - } - } - $FRAME_INPUT{$start_t} .= $insertion; - my %NT = ("start_t" => $start_t, - "start_i" => $start_i ); - push @NT,\%NT; - } - $currently_matching = 1; - } - - print "$action $s $i ($start_s $start_i) $currently_matching"; - if ($action ne "I") { - print " ->"; - foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) { - print " ".$tt; - } - } - print "\n"; - $s++ unless $action eq "I"; - $i++ unless $action eq "D"; - $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D"; - push @INPUT_BITMAP, 1 if $action eq "M"; - push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S"; - } - - - print $target."\n"; - foreach (@TARGET_BITMAP) { print $_; } print "\n"; - foreach (sort keys %FRAME_INPUT) { - print "$_: $FRAME_INPUT{$_}\n"; - } - - ### STEP 2: BUILD RULE AND FRAME - - # hierarchical rule - my $rule_s = ""; - my $rule_pos_s = 0; - my %RULE_ALIGNMENT_S; - for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) { - if ($INPUT_BITMAP[$i]) { - $rule_s .= $INPUT[$i]." "; - $RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++; - } - foreach my $NT (@NT) { - if ($i == $$NT{"start_i"}) { - $rule_s .= "[X][X] "; - $$NT{"rule_pos_s"} = $rule_pos_s++; - } - } - } - - my $rule_t = ""; - my $rule_pos_t = 0; - my %RULE_ALIGNMENT_T; - for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) { - if ($t>=0 && $TARGET_BITMAP[$t]) { - $rule_t .= $TARGET[$t]." "; - $RULE_ALIGNMENT_T{$t} = $rule_pos_t++; - } - foreach my $NT (@NT) { - if ($t == $$NT{"start_t"}) { - $rule_t .= "[X][X] "; - $$NT{"rule_pos_t"} = $rule_pos_t++; - } - } - } - - my $rule_alignment = ""; - foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) { - foreach my $t (keys %{$ALIGN{"s"}[$s]}) { - next unless defined($RULE_ALIGNMENT_T{$t}); - $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." "; - } - } - foreach my $NT (@NT) { - $rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." "; - } - - chop($rule_s); - chop($rule_t); - chop($rule_alignment); - - my $rule_alignment_inv = ""; - foreach (split(/ /,$rule_alignment)) { - /^(\d+)\-(\d+)$/; - $rule_alignment_inv .= "$2-$1 "; - } - chop($rule_alignment_inv); - - # frame - my $frame = ""; - $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; - - my $currently_included = 0; - my $start_t = -1; - push @TARGET_BITMAP,0; # indicate end - - for(my $t=0;$t<=scalar(@TARGET);$t++) { - # beginning of tm target inclusion - if (!$currently_included && $TARGET_BITMAP[$t]) { - $start_t = $t; - $currently_included = 1; - } - - # end of tm target inclusion (not included word or inserted input) - elsif ($currently_included && - (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) { - # add xml (unless change is at the beginning of the sentence - if ($start_t >= 0) { - my $target = ""; - print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; - for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) { - $target .= $TARGET[$tt] . " "; - } - chop($target); - $frame .= "<xml translation=\"$target\"> x </xml> "; - } - $currently_included = 0; - } - - $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; - print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; - } - - print $frame."\n-------------------------------------\n"; - return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv); -} - -sub create_alignment { - my ($line) = @_; - my (@ALIGNED_TO_S,@ALIGNED_TO_T); - foreach my $point (split(/ /,$line)) { - my ($s,$t) = split(/\-/,$point); - $ALIGNED_TO_S[$s]{$t}++; - $ALIGNED_TO_T[$t]{$s}++; - } - my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T ); - return %ALIGNMENT; -} diff --git a/contrib/fuzzy-match/suffix-test.cpp b/contrib/fuzzy-match/suffix-test.cpp deleted file mode 100644 index 01b722fb4..000000000 --- a/contrib/fuzzy-match/suffix-test.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "SuffixArray.h" - -using namespace std; - -int main(int argc, char* argv[]) -{ - SuffixArray suffixArray( "/home/pkoehn/syntax/grammars/wmt09-de-en/corpus.1k.de" ); - //suffixArray.List(10,20); - vector< string > der; - der.push_back("der"); - vector< string > inDer; - inDer.push_back("in"); - inDer.push_back("der"); - vector< string > zzz; - zzz.push_back("zzz"); - vector< string > derDer; - derDer.push_back("der"); - derDer.push_back("der"); - - cout << "count of 'der' " << suffixArray.Count( der ) << endl; - cout << "limited count of 'der' " << suffixArray.MinCount( der, 2 ) << endl; - cout << "count of 'in der' " << suffixArray.Count( inDer ) << endl; - cout << "count of 'der der' " << suffixArray.Count( derDer ) << endl; - cout << "limited count of 'der der' " << suffixArray.MinCount( derDer, 1 ) << endl; - // cout << "count of 'zzz' " << suffixArray.Count( zzz ) << endl; - // cout << "limited count of 'zzz' " << suffixArray.LimitedCount( zzz, 1 ) << endl; -} diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index 472888f48..9b25f636f 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -47,6 +47,7 @@ </option> <option id="gnu.cpp.compiler.option.preprocessor.def.1052680347" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols"> <listOptionValue builtIn="false" value="TRACE_ENABLE"/> + <listOptionValue builtIn="false" value="WITH_THREADS"/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1930757481" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> diff --git a/contrib/other-builds/lm.xcodeproj/project.pbxproj b/contrib/other-builds/lm.xcodeproj/project.pbxproj index 2488f1439..2598ae994 100644 --- a/contrib/other-builds/lm.xcodeproj/project.pbxproj +++ b/contrib/other-builds/lm.xcodeproj/project.pbxproj @@ -83,6 +83,8 @@ 1EBA459F14B97E92003CC0EA /* string_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBA455914B97E92003CC0EA /* string_piece.hh */; }; 1EBA45A014B97E92003CC0EA /* tokenize_piece_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBA455A14B97E92003CC0EA /* tokenize_piece_test.cc */; }; 1EBA45A114B97E92003CC0EA /* tokenize_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBA455B14B97E92003CC0EA /* tokenize_piece.hh */; }; + 1EC2B30916233A8C00614D71 /* usage.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EC2B30716233A8C00614D71 /* usage.cc */; }; + 1EC2B30A16233A8C00614D71 /* usage.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EC2B30816233A8C00614D71 /* usage.hh */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -185,6 +187,8 @@ 1EBA455A14B97E92003CC0EA /* tokenize_piece_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = tokenize_piece_test.cc; path = ../../util/tokenize_piece_test.cc; sourceTree = "<group>"; }; 1EBA455B14B97E92003CC0EA /* tokenize_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = tokenize_piece.hh; path = ../../util/tokenize_piece.hh; sourceTree = "<group>"; }; 1EBA455C14B97E92003CC0EA /* util.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = util.xcodeproj; path = ../../util/util.xcodeproj; sourceTree = "<group>"; }; + 1EC2B30716233A8C00614D71 /* usage.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = usage.cc; path = ../../util/usage.cc; sourceTree = "<group>"; }; + 1EC2B30816233A8C00614D71 /* usage.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = usage.hh; path = ../../util/usage.hh; sourceTree = "<group>"; }; 1EE8C2E91476A48E002496F2 /* liblm.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = liblm.a; sourceTree = BUILT_PRODUCTS_DIR; }; /* End PBXFileReference section */ @@ -261,6 +265,8 @@ 1EBA44FC14B97E81003CC0EA /* util */ = { isa = PBXGroup; children = ( + 1EC2B30716233A8C00614D71 /* usage.cc */, + 1EC2B30816233A8C00614D71 /* usage.hh */, 1EBA453614B97E92003CC0EA /* bit_packing_test.cc */, 1EBA453714B97E92003CC0EA /* bit_packing.cc */, 1EBA453814B97E92003CC0EA /* bit_packing.hh */, @@ -377,6 +383,7 @@ 1EBA45A114B97E92003CC0EA /* tokenize_piece.hh in Headers */, 1E890C72159D1B260031F9F3 /* value_build.hh in Headers */, 1E890C73159D1B260031F9F3 /* value.hh in Headers */, + 1EC2B30A16233A8C00614D71 /* usage.hh in Headers */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -479,6 +486,7 @@ 1EBA459D14B97E92003CC0EA /* sorted_uniform_test.cc in Sources */, 1EBA45A014B97E92003CC0EA /* tokenize_piece_test.cc in Sources */, 1E890C71159D1B260031F9F3 /* value_build.cc in Sources */, + 1EC2B30916233A8C00614D71 /* usage.cc in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index dfebdd577..a620366e4 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -29,6 +29,9 @@ <listOptionValue builtIn="false" value=""${workspace_loc}/../../moses/src""/> <listOptionValue builtIn="false" value=""${workspace_loc}/../..""/> </option> + <option id="gnu.cpp.compiler.option.preprocessor.def.1785368241" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols"> + <listOptionValue builtIn="false" value="WITH_THREADS"/> + </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1402496521" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.827478809" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug"> @@ -52,6 +55,7 @@ <listOptionValue builtIn="false" value="z"/> <listOptionValue builtIn="false" value="rt"/> <listOptionValue builtIn="false" value="boost_system"/> + <listOptionValue builtIn="false" value="boost_thread"/> <listOptionValue builtIn="false" value="boost_filesystem"/> <listOptionValue builtIn="false" value="lm"/> <listOptionValue builtIn="false" value="util"/> diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 1428de199..7fa7b6911 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -29,6 +29,9 @@ <listOptionValue builtIn="false" value=""${workspace_loc}/../../moses/src""/> <listOptionValue builtIn="false" value=""${workspace_loc}/../..""/> </option> + <option id="gnu.cpp.compiler.option.preprocessor.def.849384962" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols"> + <listOptionValue builtIn="false" value="WITH_THREADS"/> + </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.363379373" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.504208780" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug"> @@ -52,6 +55,7 @@ <listOptionValue builtIn="false" value="z"/> <listOptionValue builtIn="false" value="rt"/> <listOptionValue builtIn="false" value="boost_system"/> + <listOptionValue builtIn="false" value="boost_thread"/> <listOptionValue builtIn="false" value="lm"/> <listOptionValue builtIn="false" value="util"/> </option> diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index 05eb2df40..bbf020b98 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -37,6 +37,8 @@ <listOptionValue builtIn="false" value="${workspace_loc}/../../"/> </option> <option id="gnu.cpp.compiler.option.preprocessor.def.752586397" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols"> + <listOptionValue builtIn="false" value="IS_ECLIPSE"/> + <listOptionValue builtIn="false" value="WITH_THREADS"/> <listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/> <listOptionValue builtIn="false" value="TRACE_ENABLE"/> <listOptionValue builtIn="false" value="LM_IRST"/> diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject index 8ea5ab73b..0893d942f 100644 --- a/contrib/other-builds/util/.cproject +++ b/contrib/other-builds/util/.cproject @@ -46,6 +46,7 @@ </option> <option id="gnu.cpp.compiler.option.preprocessor.def.1952961175" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols"> <listOptionValue builtIn="false" value="TRACE_ENABLE"/> + <listOptionValue builtIn="false" value="WITH_THREADS"/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1420621104" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> |