/** * Common lossy counting phrase extraction functionality declaration. * * Note: The bulk of this unit is based on Philipp Koehn's code from * phrase-extract/extract.cpp. * * (C) Moses: http://www.statmt.org/moses/ * (C) Ceslav Przywara, UFAL MFF UK, 2011 * * $Id$ */ #ifndef PHRASE_EXTRACT_H #define PHRASE_EXTRACT_H #include #include #include #include #include #include "../phrase-extract/SentenceAlignment.h" #include "typedefs.h" //////// Types definitions ///////////////////////////////////////////////////// // HPhraseVertex represents a point in the alignment matrix typedef std::pair HPhraseVertex; // Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix: // bottom-left and top-right typedef std::pair HPhrase; // HPhraseVector is a vector of HPhrases typedef std::vector HPhraseVector; // SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning // The key of the map is the English index and the value is a set of the source ones typedef std::map > HSentenceVertices; // typedef std::pair params_pair_t; // typedef std::vector PhrasePairsLossyCountersVector; // MSD - monotone, swap, discontinuous. enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; struct LossyCounterInstance { // Statistics not provided by the lossy counter must be computed during // phrases flushing (ie. when input processing is done): size_t outputMass; // unique * freq size_t outputSize; // unique // PhrasePairsLossyCounter lossyCounter; LossyCounterInstance(PhrasePairsLossyCounter::error_t error, PhrasePairsLossyCounter::support_t support): outputMass(0), outputSize(0), lossyCounter(error, support) {} }; // typedef std::vector LossyCountersVector; struct OutputProcessor { virtual void operator() (const std::string& srcPhrase, const std::string& tgtPhrase, const std::string& orientationInfo, const alignment_t& alignment, const size_t frequency, int mode) = 0; }; //////// Functions declarations //////////////////////////////////////////////// //// Untouched //// REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int)); REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &); REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &, const HSentenceVertices &, const HSentenceVertices &, REO_POS); void insertVertex(HSentenceVertices &, int, int); void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, int, int, int, int); std::string getOrientString(REO_POS, REO_MODEL_TYPE); bool ge(int, int); bool le(int, int); bool lt(int, int); bool isAligned (SentenceAlignment &, int, int); void extract(SentenceAlignment &); //// Modified //// void addPhrase(SentenceAlignment &, int, int, int, int, std::string &); //// Added //// void readInput(std::istream& eFile, std::istream& fFile, std::istream& aFile); void processOutput(OutputProcessor& processor); void printStats(void); //////// Extern variables ////////////////////////////////////////////////////// extern bool allModelsOutputFlag; // Some default setting, I guess... extern bool wordModel; // IBM word model. extern REO_MODEL_TYPE wordType; extern bool phraseModel; // Std phrase-based model. extern REO_MODEL_TYPE phraseType; extern bool hierModel; // Hierarchical model. extern REO_MODEL_TYPE hierType; extern int maxPhraseLength; // Eg. 7 extern bool translationFlag; // Generate extract and extract.inv extern bool orientationFlag; // Ordering info needed? extern bool sortedOutput; // Sort output? extern LossyCountersVector lossyCounters; #ifdef GET_COUNTS_ONLY extern std::vector phrasePairsCounters; #endif #endif /* PHRASE_EXTRACT_H */