/** * IndexedPhrasesPair - implementation of a single phrase pair source and target * phrases represented by numeric indices. * * (C) Ceslav Przywara, UFAL MFF UK, 2011 * * $Id$ * * TODO: * - current unordered_map implementation is terribly slow. More sophisticated * design of hash function should help. */ #ifndef INDEXEDPHRASESPAIR_H #define INDEXEDPHRASESPAIR_H #include #include #include #include #ifdef USE_UNORDERED_MAP #include #endif // Forward declaration because of friend comparison operator declaration below. template class IndexedPhrasesPair; #ifdef USE_UNORDERED_MAP template class IndexedPhrasePairHasher; // Comparison operator (is going to be declared as friend). template bool operator== (const IndexedPhrasesPair& lhs, const IndexedPhrasesPair& rhs); #else // Comparison operator (is going to be declared as friend). template bool operator< (const IndexedPhrasesPair& lhs, const IndexedPhrasesPair& rhs); #endif /** * Structure capable of holding a phrase pair consisting of: * a) source phrase * b) target phrase * c) word alignment * d) orientation info * @param OrientationIndexType * @param TokenIndexType - datatype for token indices. */ template class IndexedPhrasesPair { public: typedef TokenIndexType token_index_t; typedef OrientationIndexType orientation_info_index_t; typedef std::vector phrase_t; // A single alignment point. typedef unsigned char alignment_point_t; // A single pair of alignments points. typedef std::pair alignment_pair_t; // A single phrase alignment (eg. 0-0 0-1 1-2) typedef std::vector alignment_t; #ifdef USE_UNORDERED_MAP // Unordered map requires hashing functor object. typedef IndexedPhrasePairHasher Hash; #endif private: /** @var Source and target phrase as array of respective token indices */ token_index_t* _data; /** @var A single phrase alignment stored in array */ alignment_point_t* _alignment; /** @var Index of orientation info string */ orientation_info_index_t _orientationInfoIndex; alignment_point_t _srcPhraseLength; alignment_point_t _tgtPhraseLength; alignment_point_t _alignmentLength; public: IndexedPhrasesPair(void): _data(NULL), _alignment(NULL), _orientationInfoIndex(0), _srcPhraseLength(0), _tgtPhraseLength(0), _alignmentLength(0) {} IndexedPhrasesPair(const phrase_t& srcPhrase, const phrase_t& tgtPhrase, orientation_info_index_t orientationInfo, const alignment_t& alignment); IndexedPhrasesPair(const IndexedPhrasesPair& copy); ~IndexedPhrasesPair(void); IndexedPhrasesPair& operator=(const IndexedPhrasesPair& other); phrase_t srcPhrase(void) const { return phrase_t(_data, _data + _srcPhraseLength); } phrase_t tgtPhrase(void) const { return phrase_t(_data + _srcPhraseLength, _data + _srcPhraseLength + _tgtPhraseLength); } orientation_info_index_t orientationInfo(void) const { return _orientationInfoIndex; } alignment_t alignment(void) const; const alignment_point_t * alignmentData(void) const { return _alignment; } alignment_point_t alignmentLength(void) const { return _alignmentLength; } #ifdef USE_UNORDERED_MAP friend class IndexedPhrasePairHasher; friend bool operator== <> (const IndexedPhrasesPair& lhs, const IndexedPhrasesPair& rhs); #else friend bool operator< <> (const IndexedPhrasesPair& lhs, const IndexedPhrasesPair& rhs); #endif }; //////////////////////////////////////////////////////////////////////////////// //// IndexedPhrasesPair IMPLEMENTATION ///////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// template IndexedPhrasesPair::IndexedPhrasesPair(const phrase_t& srcPhrase, const phrase_t& tgtPhrase, orientation_info_index_t orientationInfo, const alignment_t& alignment): _data(NULL), _alignment(NULL), _orientationInfoIndex(orientationInfo), _srcPhraseLength(static_cast(srcPhrase.size())), _tgtPhraseLength(static_cast(tgtPhrase.size())), _alignmentLength(alignment.size()) { // Save alignment. _alignment = new alignment_point_t[2 * _alignmentLength]; // Note: *2 for each pair. for ( size_t i = 0; i < alignment.size(); ++i ) { _alignment[i*2] = alignment[i].first; _alignment[i*2 + 1] = alignment[i].second; } // Save data. _data = new token_index_t[_srcPhraseLength + _tgtPhraseLength]; std::copy(srcPhrase.begin(), srcPhrase.end(), _data); std::copy(tgtPhrase.begin(), tgtPhrase.end(), _data + _srcPhraseLength); } template IndexedPhrasesPair::IndexedPhrasesPair(const IndexedPhrasesPair& copy): _data(NULL), _alignment(NULL), _orientationInfoIndex(copy._orientationInfoIndex), _srcPhraseLength(copy._srcPhraseLength), _tgtPhraseLength(copy._tgtPhraseLength), _alignmentLength(copy._alignmentLength) { // Copy alignment. // alignment_point_t alignmentLength = std::max(_srcPhraseLength, _tgtPhraseLength); _alignment = new alignment_point_t[2 * _alignmentLength]; // Note: *2 for each pair. memcpy(_alignment, copy._alignment, _alignmentLength * 2 * sizeof(alignment_point_t)); // Copy data. _data = new token_index_t[_srcPhraseLength + _tgtPhraseLength]; std::copy(copy._data, copy._data + _srcPhraseLength + _tgtPhraseLength, _data); } template IndexedPhrasesPair::~IndexedPhrasesPair(void) { delete[] _alignment; delete[] _data; } template IndexedPhrasesPair& IndexedPhrasesPair::operator=(const IndexedPhrasesPair& other) { if ( this != &other ) { // Copy alignment. _alignmentLength = other._alignmentLength; // alignment_point_t alignmentLength = std::max(_srcPhraseLength, _tgtPhraseLength); alignment_point_t * alignment = new alignment_point_t[2 * _alignmentLength]; // Note: *2 for each pair. memcpy(alignment, other._alignment, _alignmentLength * 2 * sizeof(alignment_point_t)); if ( _alignment != NULL ) { delete[] _alignment; // ! } _alignment = alignment; // ! // Copy data. _srcPhraseLength = other._srcPhraseLength; _tgtPhraseLength = other._tgtPhraseLength; token_index_t * data = new token_index_t[_srcPhraseLength + _tgtPhraseLength]; std::copy(other._data, other._data + _srcPhraseLength + _tgtPhraseLength, data); if ( _data != NULL ) { delete[] _data; // ! } _data = data; // ! // _orientationInfoIndex = other._orientationInfoIndex; } return *this; } template typename IndexedPhrasesPair::alignment_t IndexedPhrasesPair::alignment(void) const { alignment_t a; //alignment_point_t alignmentLength = std::max(_srcPhraseLength, _tgtPhraseLength); for ( size_t i = 0; i < _alignmentLength; ++i ) { a.push_back(alignment_pair_t(_alignment[2*i], _alignment[2*i + 1])); } return a; } #ifdef USE_UNORDERED_MAP template class IndexedPhrasePairHasher: public std::unary_function, size_t> { typedef typename IndexedPhrasesPair::alignment_point_t alignment_point_t; std::tr1::hash _hash; public: size_t operator()(const IndexedPhrasesPair& phrasePair) const { size_t hash = 0; for ( alignment_point_t i = 0; i < phrasePair._srcPhraseLength + phrasePair._tgtPhraseLength; ++i ) { hash ^= _hash(phrasePair._data[i]); } return hash; } }; template bool operator== (const IndexedPhrasesPair& lhs, const IndexedPhrasesPair& rhs) { typedef typename IndexedPhrasesPair::alignment_point_t alignment_point_t; typedef typename IndexedPhrasesPair::token_index_t string_index_t; // Alignments comparable? if ( lhs._alignmentLength != rhs._alignmentLength ) { return false; } // Same alignment length -> compare alignments. int cmp = memcmp(lhs._alignment, rhs._alignment, lhs._alignmentLength * 2 * sizeof(alignment_point_t)); if ( cmp != 0 ) { // Alignments differ. return false; } // Alignments are equal, compare phrases (data). if ( lhs._srcPhraseLength != rhs._srcPhraseLength ) { // Source phrase lengths differs. return false; } if ( lhs._tgtPhraseLength != rhs._tgtPhraseLength ) { // Target phrase lengths differs. return false; } // Phrases have matching lengths, compare the data in the end: cmp = memcmp(lhs._data, rhs._data, (lhs._srcPhraseLength + lhs._tgtPhraseLength) * sizeof(string_index_t)); if ( cmp != 0 ) { // Data differ. return false; } // Compare orientation info in the end. return lhs._orientationInfoIndex == rhs._orientationInfoIndex; } #else template bool operator< (const IndexedPhrasesPair& lhs, const IndexedPhrasesPair& rhs) { typedef typename IndexedPhrasesPair::alignment_point_t alignment_point_t; typedef typename IndexedPhrasesPair::token_index_t string_index_t; // Alignments comparable? if ( lhs._alignmentLength != rhs._alignmentLength ) { // Shorter alignment length => lesser item. return lhs._alignmentLength < rhs._alignmentLength; } // Same alignment length -> compare alignments. int cmp = memcmp(lhs._alignment, rhs._alignment, lhs._alignmentLength * 2 * sizeof(alignment_point_t)); if ( cmp != 0 ) { // Alignments differ. return cmp < 0; } // Alignments are equal, compare phrases (data). if ( lhs._srcPhraseLength != rhs._srcPhraseLength ) { // Source phrase lengths differs. return lhs._srcPhraseLength < rhs._srcPhraseLength; } if ( lhs._tgtPhraseLength != rhs._tgtPhraseLength ) { // Target phrase lengths differs. return lhs._tgtPhraseLength < rhs._tgtPhraseLength; } // Phrases have matching lengths, compare the data in the end: cmp = memcmp(lhs._data, rhs._data, (lhs._srcPhraseLength + lhs._tgtPhraseLength) * sizeof(string_index_t)); if ( cmp != 0 ) { // Data differ. return cmp < 0; } // Compare orientation info in the end. return lhs._orientationInfoIndex < rhs._orientationInfoIndex; } #endif #endif /* INDEXEDPHRASESPAIR_H */