Welcome to mirror list, hosted at ThFree Co, Russian Federation.

Util.h « moses - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 24a4e2c28c2e95deae95733ed4024e3b86f81fb4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
// $Id$

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
***********************************************************************/

#ifndef moses_Util_h
#define moses_Util_h

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <cmath>
#include <cassert>
#include <limits>
#include <map>
#include <cstdlib>
#include <cstring>
#include "util/exception.hh"
#include "TypeDef.h"

namespace Moses
{

/** Outputting debugging/verbose information to stderr.
 * Use TRACE_ENABLE flag to redirect tracing output into oblivion
 * so that you can output your own ad-hoc debugging info.
 * However, if you use stderr directly, please delete calls to it once
 * you finished debugging so that it won't clutter up.
 * Also use TRACE_ENABLE to turn off output of any debugging info
 * when compiling for a gui front-end so that running gui won't generate
 * output on command line
 * */
#ifdef TRACE_ENABLE
#define TRACE_ERR(str) do { std::cerr << str; } while (false)
#else
#define TRACE_ERR(str) do {} while (false)
#endif

/** verbose macros
 * */

#define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } }
#define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level)
#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] ");TRACE_ERR(str); } }
#define HERE __FILE__ << ":" << __LINE__


#if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
// gcc nth_element() bug
#define NTH_ELEMENT3(begin, middle, end) std::sort(begin, end)
#define NTH_ELEMENT4(begin, middle, end, orderer) std::sort(begin, end, orderer)
#else
#define NTH_ELEMENT3(begin, middle, end) std::nth_element(begin, middle, end)
#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
#endif

//! delete white spaces at beginning and end of string
const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
const std::string ToLower(const std::string& str);

//! get string representation of any object/variable, as long as it can pipe to a stream
template<typename T>
inline std::string SPrint(const T &input)
{
  std::stringstream stream("");
  stream << input;
  return stream.str();
}

//! convert string to variable of type T. Used to reading floats, int etc from files
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  T ret;
  stream >> ret;
  return ret;
}

//! just return input
template<>
inline std::string Scan<std::string>(const std::string &input)
{
  return input;
}

//! Specialisation to understand yes/no y/n true/false 0/1
template<>
bool Scan<bool>(const std::string &input);

//! convert vectors of string to vectors of type T variables
template<typename T>
inline std::vector<T> Scan(const std::vector< std::string > &input)
{
  std::vector<T> output(input.size());
  for (size_t i = 0 ; i < input.size() ; i++) {
    output[i] = Scan<T>( input[i] );
  }
  return output;
}

//! speeded up version of above
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
  output.resize(input.size());
  for (size_t i = 0 ; i < input.size() ; i++) {
    output[i] = Scan<T>( input[i] );
  }
}

/** replace all occurrences of todelStr in str with the string toaddStr */
inline std::string Replace(const std::string& str,
                           const std::string& todelStr,
                           const std::string& toaddStr)
{
  size_t pos=0;
  std::string newStr=str;
  while ((pos=newStr.find(todelStr,pos))!=std::string::npos) {
    newStr.replace(pos++,todelStr.size(),toaddStr);
  }
  return newStr;
}

/** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument.
		The separator can only be 1 character long. The default delimiters are space or tab
*/
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // Find first "non-delimiter".
  std::string::size_type pos     = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token, add it to the vector.
    tokens.push_back(str.substr(lastPos, pos - lastPos));
    // Skip delimiters.  Note the "not_of"
    lastPos = str.find_first_not_of(delimiters, pos);
    // Find next "non-delimiter"
    pos = str.find_first_of(delimiters, lastPos);
  }

  return tokens;
}

// speeded up version of above
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // Find first "non-delimiter".
  std::string::size_type pos     = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token, add it to the vector.
    output.push_back(str.substr(lastPos, pos - lastPos));
    // Skip delimiters.  Note the "not_of"
    lastPos = str.find_first_not_of(delimiters, pos);
    // Find next "non-delimiter"
    pos = str.find_first_of(delimiters, lastPos);
  }
}

//! tokenise input string to vector of type T
template<typename T>
inline std::vector<T> Tokenize( const std::string &input
                                , const std::string& delimiters = " \t")
{
  std::vector<std::string> stringVector = Tokenize(input, delimiters);
  return Scan<T>( stringVector );
}

// speeded up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
                      , const std::string &input
                      , const std::string& delimiters = " \t")
{
  std::vector<std::string> stringVector;
  Tokenize(stringVector, input, delimiters);
  return Scan<T>(output, stringVector );
}

inline std::vector<std::string> TokenizeMultiCharSeparator(
  const std::string& str,
  const std::string& separator)
{
  std::vector<std::string> tokens;

  size_t pos = 0;
  // Find first "non-delimiter".
  std::string::size_type nextPos     = str.find(separator, pos);

  while (nextPos != std::string::npos) {
    // Found a token, add it to the vector.
    tokens.push_back(str.substr(pos, nextPos - pos));
    // Skip delimiters.  Note the "not_of"
    pos = nextPos + separator.size();
    // Find next "non-delimiter"
    nextPos	= str.find(separator, pos);
  }
  tokens.push_back(str.substr(pos, nextPos - pos));

  return tokens;
}

// speeded up version of above
inline void TokenizeMultiCharSeparator(std::vector<std::string> &output
                                       ,const std::string& str
                                       ,const std::string& separator)
{
  size_t pos = 0;
  // Find first "non-delimiter".
  std::string::size_type nextPos     = str.find(separator, pos);

  while (nextPos != std::string::npos) {
    // Found a token, add it to the vector.
    output.push_back(Trim(str.substr(pos, nextPos - pos)));
    // Skip delimiters.  Note the "not_of"
    pos = nextPos + separator.size();
    // Find next "non-delimiter"
    nextPos	= str.find(separator, pos);
  }
  output.push_back(Trim(str.substr(pos, nextPos - pos)));
}

/** only split of the first delimiter. Used by class FeatureFunction for parse key=value pair.
 * Value may have = character
*/
inline std::vector<std::string> TokenizeFirstOnly(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  std::string::size_type pos     = str.find_first_of(delimiters);

  if (std::string::npos != pos) {
    // Found a token, add it to the vector.
    tokens.push_back(str.substr(0, pos));
    tokens.push_back(str.substr(pos + 1, str.size() - pos  - 1));
  } else {
    tokens.push_back(str);
  }

  return tokens;
}


/**
 * Convert vector of type T to string
 */
template <typename T>
std::string Join(const std::string& delimiter, const std::vector<T>& items)
{
  std::ostringstream outstr;
  if(items.size() == 0) return "";
  outstr << items[0];
  for(unsigned int i = 1; i < items.size(); i++)
    outstr << delimiter << items[i];
  return outstr.str();
}

//! transform prob to natural log score
inline float TransformScore(float prob)
{
  return log(prob);
}

//! transform natural log score to prob. Not currently used
inline float UntransformScore(float score)
{
  return exp(score);
}

//! irst number are in log 10, transform to natural log
inline float TransformLMScore(float irstScore)
{
  return irstScore * 2.30258509299405f;
}

inline float UntransformLMScore(float logNScore)
{
  // opposite of above
  return logNScore / 2.30258509299405f;
}

//! make sure score doesn't fall below LOWEST_SCORE
inline float FloorScore(float logScore)
{
  return (std::max)(logScore , LOWEST_SCORE);
}

/** convert prob vector to log prob and calc inner product with weight vector.
 * At least, that's what I think it does, fn is only 9 lines but can't figure out what it does.
 * Not sure whether give zens a medal for being a genius, or shoot him for writing unreadable code. Mabe both...
 */
inline float CalcTranslationScore(const std::vector<float> &probVector,
                                  const std::vector<float> &weightT)
{
  UTIL_THROW_IF2(weightT.size() != probVector.size(),
		  "Weight and score vector sizes not the same");
  float rv=0.0;
  for(float const *sb=&probVector[0],*se=sb+probVector.size(),*wb=&weightT[0];
      sb!=se; ++sb, ++wb)
    rv += TransformScore(*sb) * (*wb);
  return rv;
}

/** declaration of ToString() function to go in header for each class.
 *	This function, as well as the operator<< fn for each class, is
 *	for debugging purposes only. The output format is likely to change from
 *	time-to-time as classes are updated so shouldn't be relied upon
 *	for any decoding algorithm
*/
#define TO_STRING()	 std::string ToString() const;

//! definition of ToString() function to go in .cpp file. Can be used for any class that can be piped to a stream
#define TO_STRING_BODY(CLASS) 	\
	std::string CLASS::ToString() const	\
	{															\
		std::stringstream out;			\
		out << *this;								\
		return out.str();						\
	}															\
 
//! delete and remove every element of a collection object such as set, list etc
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
  for (typename COLL::const_iterator iter = coll.begin() ; iter != coll.end() ; ++iter) {
    delete (*iter);
  }
  coll.clear();
}

//! delete and remove every element of map
template<class COLL>
void RemoveAllInMap(COLL &coll)
{
  for (typename COLL::const_iterator iter = coll.begin() ; iter != coll.end() ; ++iter) {
    delete (iter->second);
  }
  coll.clear();
}


//! x-platform reference to temp folder
std::string GetTempFolder();
//! MD5 hash of a file
std::string GetMD5Hash(const std::string &filePath);

//! save memory by getting rid of spare, unused elements in a collection
template<typename T>
inline void ShrinkToFit(T& v)
{
  if(v.capacity()>v.size())
    T(v).swap(v);
  assert(v.capacity()==v.size());
}

bool FileExists(const std::string& filePath);

// A couple of utilities to measure decoding time
void ResetUserTime();
void PrintUserTime(const std::string &message);
double GetUserTime();

// dump SGML parser for <seg> tags
std::map<std::string, std::string> ProcessAndStripSGML(std::string &line);

std::string PassthroughSGML(std::string &line, const std::string tagName,const std::string& lbrackStr="<", const std::string& rbrackStr=">");

/**
 * Returns the first string bounded by the delimiters (default delimiters are " " and "\t")i starting from position first_pos
 * and and stores the starting position of the next string (in first_str)
 */
inline std::string GetFirstString(const std::string& str, int& first_pos,  const std::string& delimiters = " \t")
{

  std::string first_str;
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos);

  // Find first "non-delimiter".
  std::string::size_type pos     = str.find_first_of(delimiters, lastPos);

  if (std::string::npos != pos || std::string::npos != lastPos) {

    first_str = str.substr(lastPos, pos - lastPos);

    // Skip delimiters.  Note the "not_of"
    lastPos = str.find_first_not_of(delimiters, pos);

  }

  first_pos = lastPos;
  return first_str;
}

template<class T>
T log_sum (T log_a, T log_b)
{
  T v;
  if (log_a < log_b) {
    v = log_b+log ( 1 + exp ( log_a-log_b ));
  } else {
    v = log_a+log ( 1 + exp ( log_b-log_a ));
  }
  return ( v );
}


}

#endif