Welcome to mirror list, hosted at ThFree Co, Russian Federation.

phrase-extract.h « eppex « contrib - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: c01dd3ee20f22dde55ece09a691831581fdb6fd7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
 * Common lossy counting phrase extraction functionality declaration.
 *
 * Note: The bulk of this unit is based on Philipp Koehn's code from
 * phrase-extract/extract.cpp.
 *
 * (C) Moses: http://www.statmt.org/moses/
 * (C) Ceslav Przywara, UFAL MFF UK, 2011
 *
 * $Id$
 */

#ifndef PHRASE_EXTRACT_H
#define	PHRASE_EXTRACT_H

#include <fstream>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "../phrase-extract/SentenceAlignment.h"

#include "typedefs.h"


//////// Types definitions /////////////////////////////////////////////////////

// HPhraseVertex represents a point in the alignment matrix, stored as a pair
// of integer indices.
typedef std::pair<int, int> HPhraseVertex;

// HPhrase represents a bi-phrase; each bi-phrase is defined by two points in
// the alignment matrix: bottom-left and top-right.
typedef std::pair<HPhraseVertex, HPhraseVertex> HPhrase;

// HPhraseVector is a vector of HPhrases.
typedef std::vector<HPhrase> HPhraseVector;

// HSentenceVertices represents, from all extracted phrases, all vertices that
// have the same positioning. The key of the map is the English index and the
// value is the set of the source indices.
typedef std::map<int, std::set<int> > HSentenceVertices;

// Pair of lossy counter parameters (error, support), expressed with the
// parameter types declared by PhrasePairsLossyCounter.
typedef std::pair<PhrasePairsLossyCounter::error_t, PhrasePairsLossyCounter::support_t> params_pair_t;
// Vector of pointers to lossy counters.
// NOTE(review): ownership of the pointed-to counters is not visible from this
// header - confirm against the code that fills the vector.
typedef std::vector<PhrasePairsLossyCounter *> PhrasePairsLossyCountersVector;

// Reordering model types. MSD - monotone, swap, discontinuous.
// NOTE(review): REO_MSLR presumably splits "discontinuous" into left/right and
// REO_MONO presumably distinguishes monotone from everything else - confirm
// against getOrientString().
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
// Orientation values; this is the return type of the getOrient*Model()
// functions declared in this header.
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};

struct LossyCounterInstance {
    // Statistics not provided by the lossy counter must be computed during
    // phrases flushing (ie. when input processing is done):
    size_t outputMass; // unique * freq
    size_t outputSize; // unique
    //
    PhrasePairsLossyCounter lossyCounter;

    LossyCounterInstance(PhrasePairsLossyCounter::error_t error, PhrasePairsLossyCounter::support_t support): outputMass(0), outputSize(0), lossyCounter(error, support) {}
};

// Vector of pointers to LossyCounterInstance objects.
// NOTE(review): who allocates and frees the instances is not visible from
// this header - confirm against the code that fills the vector.
typedef std::vector<LossyCounterInstance *> LossyCountersVector;

// Abstract callback interface accepted by processOutput().
// Implementations override operator() to consume the reported values.
struct OutputProcessor {
    // A polymorphic base needs a virtual destructor: without it, destroying
    // an implementation through an OutputProcessor pointer is undefined
    // behaviour (and compilers warn with -Wnon-virtual-dtor).
    virtual ~OutputProcessor() {}

    // Called with the data of a single output record:
    //   srcPhrase, tgtPhrase - the two sides of the phrase pair,
    //   orientationInfo      - orientation information as a string,
    //   alignment            - alignment data (alignment_t),
    //   frequency            - frequency value of the record,
    //   mode                 - integer mode selector.
    // NOTE(review): the exact meaning and value range of 'mode', and the
    // format of 'orientationInfo', are defined by the caller - confirm there.
    virtual void operator() (const std::string& srcPhrase, const std::string& tgtPhrase, const std::string& orientationInfo, const alignment_t& alignment, const size_t frequency, int mode) = 0;
};


//////// Functions declarations ////////////////////////////////////////////////

//// Untouched ////
// NOTE(review): "untouched" presumably means these declarations are unchanged
// relative to phrase-extract/extract.cpp, which this unit is based on - confirm.
// Parameters are unnamed in this header; see the definitions for the meaning
// of the individual bool/int arguments.

// Returns a REO_POS orientation for the given sentence alignment and model
// type; the last two arguments are bool(int, int) predicates.
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                           int, int, int, int, int, int, int,
                           bool (*)(int, int), bool (*)(int, int));

// Same leading parameters as getOrientWordModel(), plus two read-only
// HSentenceVertices maps.
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                             int, int, int, int, int, int, int,
                             bool (*)(int, int), bool (*)(int, int),
                             const HSentenceVertices &, const HSentenceVertices &);

// Same leading parameters as getOrientWordModel(), plus four read-only
// HSentenceVertices maps and a REO_POS value.
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                           int, int, int, int, int, int, int,
                           bool (*)(int, int), bool (*)(int, int),
                           const HSentenceVertices &, const HSentenceVertices &,
                           const HSentenceVertices &, const HSentenceVertices &,
                           REO_POS);

// Takes one modifiable HSentenceVertices map and two ints.
// NOTE(review): presumably inserts the (int, int) vertex into the map - confirm.
void insertVertex(HSentenceVertices &, int, int);
// Takes four modifiable HSentenceVertices maps and four ints.
// NOTE(review): presumably registers the corner vertices of one phrase - confirm.
void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, int, int, int, int);

// Returns a string for the given orientation and reordering model type.
std::string getOrientString(REO_POS, REO_MODEL_TYPE);

// bool(int, int) predicates; their signature matches the function pointer
// parameters of the getOrient*Model() functions.
// NOTE(review): names suggest >=, <= and < comparisons - confirm in the definitions.
bool ge(int, int);
bool le(int, int);
bool lt(int, int);
// Predicate over a sentence alignment and two ints.
// NOTE(review): presumably tests whether a position is aligned - confirm.
bool isAligned (SentenceAlignment &, int, int);
// Operates on one sentence alignment.
// NOTE(review): presumably runs phrase extraction for that sentence - confirm.
void extract(SentenceAlignment &);

//// Modified ////
// NOTE(review): "modified" presumably refers to changes relative to
// phrase-extract/extract.cpp, which this unit is based on - confirm.
// Takes a sentence alignment, four ints and a modifiable string; parameters
// are unnamed here, see the definition for their meaning.
void addPhrase(SentenceAlignment &, int, int, int, int, std::string &);

//// Added ////
// NOTE(review): "added" presumably means these functions have no counterpart
// in phrase-extract/extract.cpp - confirm.

// Takes three input streams: eFile, fFile and aFile.
// NOTE(review): the e/f/a prefixes suggest the two language sides and the
// alignment input - confirm in the definition.
void readInput(std::istream& eFile, std::istream& fFile, std::istream& aFile);
// Takes the OutputProcessor callback object to be used for output.
void processOutput(OutputProcessor& processor);
// Takes no arguments and returns nothing.
// NOTE(review): where the statistics are printed is defined in the
// implementation - confirm.
void printStats(void);


//////// Extern variables //////////////////////////////////////////////////////

// NOTE(review): presumably requests output for all reordering models - confirm
// against the definition.
extern bool allModelsOutputFlag;

// Reordering model switches, each followed by the REO_MODEL_TYPE used for it.
// These are declarations only; the default values are set where the
// variables are defined.
extern bool wordModel; // IBM word model.
extern REO_MODEL_TYPE wordType;
extern bool phraseModel; // Std phrase-based model.
extern REO_MODEL_TYPE phraseType;
extern bool hierModel; // Hierarchical model.
extern REO_MODEL_TYPE hierType;

extern int maxPhraseLength; // Eg. 7
extern bool translationFlag; // Generate extract and extract.inv
extern bool orientationFlag; // Ordering info needed?
extern bool sortedOutput; // Sort output?

// Lossy counter instances (vector of LossyCounterInstance pointers).
// NOTE(review): what the vector index stands for is not visible here - confirm.
extern LossyCountersVector lossyCounters;

// Declared only when GET_COUNTS_ONLY is defined.
#ifdef GET_COUNTS_ONLY
extern std::vector<size_t> phrasePairsCounters;
#endif

#endif	/* PHRASE_EXTRACT_H */