src/common/utils.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432

#include "common/utils.h"
#include "common/logging.h"
#include "CLI/StringTools.hpp"

#include <stdio.h>
#include <array>
#include <iostream>
#include <sstream>
#include <string>
#include <set>
#if defined(__unix__) || defined(__APPLE__)
#include <unistd.h>
#endif
#include <codecvt>
#include <cwctype>

// MACOS lacks HOST_NAME_MAX
#ifndef HOST_NAME_MAX
# if defined(_POSIX_HOST_NAME_MAX)
#  define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
# elif defined(MAXHOSTNAMELEN)
#  define HOST_NAME_MAX MAXHOSTNAMELEN
# endif
#endif


namespace marian {
namespace utils {

void trim(std::string& s) {
  CLI::detail::trim(s, " \t\n");
}

void trimRight(std::string& s) {
  CLI::detail::rtrim(s, " \t\n");
}

void trimLeft(std::string& s) {
  CLI::detail::ltrim(s, " \t\n");
}

void split(const std::string& line,
           /*out*/ std::vector<std::string>& pieces,
           const std::string& del /*= " "*/,
           bool keepEmpty /*= false*/,
           bool anyOf /*= false*/) {
  pieces.clear();
  size_t begin = 0;
  size_t pos = 0;
  std::string token;
  size_t delSize = anyOf ? 1 : del.size();
  while(true) {
    pos = anyOf ? line.find_first_of(del, begin) : line.find(del, begin);
    if(pos == std::string::npos)
      break;
    if(pos >= begin) {
      token = line.substr(begin, pos - begin);
      if(token.size() > 0 || keepEmpty)
        pieces.push_back(token);
    }
    begin = pos + delSize;
  }
  if(pos >= begin) {
    token = line.substr(begin, pos - begin);
    if(token.size() > 0 || keepEmpty)
      pieces.push_back(token);
  }
}

// the function guarantees that the output has as many elements as requested
void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
  fields.clear();
  fields.resize(numFields); // make sure there is as many elements as requested

  size_t begin = 0;
  size_t pos = 0;
  for(size_t i = 0; i < numFields; ++i) {
    pos = line.find('\t', begin);
    if(pos == std::string::npos) {
      fields[i] = line.substr(begin);
      break;
    }
    fields[i] = line.substr(begin, pos - begin);
    begin = pos + 1;
  }

  ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
}

std::vector<std::string> split(const std::string& line,
                               const std::string& del /*= " "*/,
                               bool keepEmpty /*= false*/,
                               bool anyOf /*= false*/) {
  std::vector<std::string> pieces;
  split(line, pieces, del, keepEmpty, anyOf);
  return pieces;
}

void splitAny(const std::string& line,
              /*out*/ std::vector<std::string>& pieces,
              const std::string& del /*= " "*/,
              bool keepEmpty /*= false*/) {
  split(line, pieces, del, keepEmpty, /*anyOf =*/true);
}

std::vector<std::string> splitAny(const std::string& line,
                                  const std::string& del /*= " "*/,
                                  bool keepEmpty /*= false*/) {
  return split(line, del, keepEmpty, /*anyOf =*/true);
}

std::string join(const std::vector<std::string>& words, const std::string& del /*= " "*/) {
  if(words.empty())
    return "";

  std::stringstream ss;
  ss << words[0];
  for(size_t i = 1; i < words.size(); ++i)
    ss << del << words[i];

  return ss.str();
}

std::string join(const std::vector<size_t>& nums, const std::string& del /*= " "*/) {
  std::vector<std::string> words(nums.size());
  std::transform(nums.begin(), nums.end(), words.begin(), [](size_t i) { return std::to_string(i); });
  return join(words, del);
}

std::string join(const std::vector<float>& nums, const std::string& del /*= " "*/, size_t prec /*= 5*/) {
  if(nums.empty())
    return "";

  std::stringstream ss;
  ss << std::fixed << std::setprecision(prec) << nums[0];
  for(size_t i = 1; i < nums.size(); ++i)
    ss << del << nums[i];

  return ss.str();
}

// escapes a string for passing to popen, which uses /bin/sh to parse its argument string
static std::string escapeForPOpen(const std::string& arg) {
  // e.g. abc -> 'abc'; my file.txt -> 'my file.txt'; $10 -> '$10'; it's -> 'it'\''s'
  return arg;
  // @BUGBUG: This sometimes fails with "sh: 1: Syntax error: Unterminated quoted string",
  // so since this is not super-critical, we will disable it for now.
  //return "'" + findReplace(arg, "'", "'\\''", /*all=*/ true) + "'";
}

// execute an external command
// The command is composed of three pieces:
//  - the executable path, e.g. --valid-script-path
//  - an optional array of arguments. Meant for options. E.g. --valid-script-args. Options with leading  - can only be passed via Yaml/Json.
//  - one more optional single argument. Meant as the main filename argument.
// Each item will be escaped for shell syntax.
std::string exec(const std::string& cmd, const std::vector<std::string>& args /*= {}*/, const std::string& arg /*= ""*/) {
  std::array<char, 128> buffer;
  std::string result;
#ifdef _WIN32
#define popen _popen
#define pclose _pclose
#endif
  auto cmdLine = escapeForPOpen(cmd);
  for (const auto& a : args) // @TODO: proper escaping
    cmdLine += " " + escapeForPOpen(a);
  if (!arg.empty())
    cmdLine += " " + escapeForPOpen(arg);
  //std::cerr << "###" << cmdLine << "###" << std::endl;
  std::shared_ptr<std::FILE> pipe(popen(cmdLine.c_str(), "r"), pclose);
  if(!pipe)
    ABORT("popen() failed!");

  while(!std::feof(pipe.get())) {
    if(std::fgets(buffer.data(), (int)buffer.size(), pipe.get()) != NULL)
      result += buffer.data();
  }
  return result;
}

std::pair<std::string, int> hostnameAndProcessId() {  // helper to get hostname:pid
#ifdef _WIN32
  std::string hostname = getenv("COMPUTERNAME");
  auto processId = (int)GetCurrentProcessId();
#else
  static std::string hostname = []() {  // not sure if gethostname() is expensive. This way we call it only once.
    char hostnamebuf[HOST_NAME_MAX + 1] = {0};
    gethostname(hostnamebuf, sizeof(hostnamebuf));
    return std::string(hostnamebuf);
  }();
  auto processId = (int)getpid();
#endif
  return {hostname, processId};
}

// format a long number with comma separators
std::string withCommas(size_t n) {
  std::string res = std::to_string(n);
  for(int i = (int)res.size() - 3; i > 0; i -= 3)
    res.insert(i, ",");
  return res;
}

bool beginsWith(const std::string& text, const std::string& prefix) {
  return text.size() >= prefix.size()
         && !text.compare(0, prefix.size(), prefix);
}

bool endsWith(const std::string& text, const std::string& suffix) {
  return text.size() >= suffix.size()
         && !text.compare(text.size() - suffix.size(), suffix.size(), suffix);
}

// @TODO: sort these functions into a separate header.
std::u32string utf8ToUnicodeString(std::string const& s) {
#ifdef _MSC_VER // workaround for a known bug in VS CRT
  std::wstring_convert<std::codecvt_utf8<unsigned int/*char32_t*/>, unsigned int/*char32_t*/> converter;
  auto res = converter.from_bytes(s);
  return std::u32string(res.begin(), res.end());
#else
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
  return converter.from_bytes(s);
#endif
}

std::string utf8FromUnicodeString(const std::u32string& s) {
#ifdef _MSC_VER // workaround for a known bug in VS CRT
  std::wstring_convert<std::codecvt_utf8<unsigned int/*char32_t*/>, unsigned int/*char32_t*/> converter;
  std::basic_string<unsigned int> si(s.begin(), s.end());
  return converter.to_bytes(si);
#else
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
  return converter.to_bytes(s);
#endif
}

std::u16string utf8ToUtf16String(std::string const& s) {
#ifdef _MSC_VER // workaround for a known bug in VS CRT
  std::wstring_convert<std::codecvt_utf8<wchar_t/*char16_t*/>, wchar_t/*char16_t*/> converter;
  auto res = converter.from_bytes(s);
  return std::u16string(res.begin(), res.end());
#else
  std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> converter;
  return converter.from_bytes(s);
#endif
}

std::string utf8FromUtf16String(const std::u16string& s) {
#ifdef _MSC_VER // workaround for a known bug in VS CRT
  std::wstring_convert<std::codecvt_utf8<wchar_t/*char16_t*/>, wchar_t/*char16_t*/> converter;
  std::basic_string<wchar_t> si(s.begin(), s.end());
  return converter.to_bytes(si);
#else
  std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> converter;
  return converter.to_bytes(s);
#endif
}

// test whether a Unicode code point is in continuous script (e.g. Chinese or Thai)
// This is used for detok bleu scoring where we have CJT characters.
bool isContinuousScript(char32_t c) {
  // currently, this table is hand-coded, and may need to be extended when the standard grows
  auto in = [c](char32_t minVal, char32_t maxVal) { return c >= minVal && c <= maxVal; };
  bool isHan = in(0x2E80, 0x2E99) || in(0x2E9B, 0x2EF3) || in(0x2F00, 0x2FD5) || in(0x3005, 0x3005) ||
               in(0x3007, 0x3007) || in(0x3021, 0x3029) || in(0x3038, 0x303A) || in(0x303B, 0x303b) ||
               in(0x3200, 0x32FF) || // Enclosed CJK Letters and Months, https://en.wikipedia.org/wiki/Enclosed_CJK_Letters_and_Months
               in(0x3400, 0x4DB5) || // up to here, we have a few gaps compared to sacrebleu
               in(0x4E00, 0x9FEF) || // sacrebleu: only up to 0x9fbb
               in(0xF900, 0xFA6D) || in(0xFA70, 0xFAD9) || // similar to sacrebleu
               in(0x20000, 0x2A6D6) ||
               in(0x2A700, 0x2B734) || in(0x2B740, 0x2B81D) || in(0x2B820, 0x2CEA1) || in(0x2CEB0, 0x2EBE0) || // not in sacrebleu
               in(0x2F800, 0x2FA1D);
  bool isKana = in(0x3040, 0x30FF) ||   // Hiragana, Katakana
                in(0x1B000, 0x1B0FF) || // Kana supplement, https://en.wikipedia.org/wiki/Kana_Supplement
                in(0x1B130, 0x1B16F);   // small Kana, https://en.wikipedia.org/wiki/Small_Kana_Extension
  bool isThai = in(0x0E00, 0x0E7F); // https://en.wikipedia.org/wiki/Thai_(Unicode_block)
  return isHan || isKana || isThai;
  // Korean characters (Hangul syllables): 0xac00..0xd7a3
  // Korean subwords (Hangul Jamo): 0x1100..0x11ff [https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block)]
  // Sacrebleu uses characters units for Chinese characters; specifically, these ranges:
  /* (ranges as used in sacrebleuy.py)
        uchar >= u'\u2600' and uchar <= u'\u27bf'  ## missing above
        uchar >= u'\u2e80'                         # CJK Radicals Supplement
        uchar >= u'\u2f00' and uchar <= u'\u2fdf'  # Kangxi Radicals
        uchar >= u'\u2ff0'                         # Chinese character structure
        uchar >= u'\u3000' and uchar <= u'\u303f'  # CJK punctuation mark           ## 3040..30ff = Kana
        uchar >= u'\u3100' and uchar <= u'\u312f'  # Phonetic symbols
        uchar >= u'\u31a0'                         # Phonetic symbols (Taiwanese and Hakka expansion)
        uchar >= u'\u31c0' and uchar <= u'\u31ef'  # CJK stroke
        uchar >= u'\u3200' and uchar <= u'\u4db5'  # CJK Unified Ideographs Extension A, release 3.0
        uchar >= u'\u4e00'                         # CJK Unified Ideographs, release 1.1
        uchar >= u'\u9fa6' and uchar <= u'\u9fbb'  # CJK Unified Ideographs, release 4.1
        uchar >= u'\uf900' and uchar <= u'\ufa2d'  # CJK Compatibility Ideographs, release 1.1
        uchar >= u'\ufa30' and uchar <= u'\ufa6a'  # CJK Compatibility Ideographs, release 3.2
        uchar >= u'\ufa70' and uchar <= u'\ufad9'  # CJK Compatibility Ideographs, release 4.1
        uchar >= u'\ufe10' and uchar <= u'\ufe1f'  ## missing above
        uchar >= u'\ufe30' and uchar <= u'\ufe4f'  ## missing above
        uchar >= u'\uff00' and uchar <= u'\uffef'  # Full width ASCII, full width of English punctuation, half width Katakana, half wide half width kana, Korean alphabet
        uchar >= u'\u20000' and uchar <= u'\u2a6d6'  # CJK Unified Ideographs Extension B, release 3.1
        uchar >= u'\u2f800' and uchar <= u'\u2fa1d'  # CJK Compatibility Supplement, release 3.1
  */
}

// convert UTF-8 characters to lower or upper case
struct UTF8Mapper { // can't use the standard lib functions because MS-internal Philly servers do not have UTF-8 locale installed
  std::map<char32_t, char32_t> toUpperMap, toLowerMap;
  UTF8Mapper() {
    /*
      env LC_ALL=en_US.UTF-8 sed 's/\(.\)/\1\n/g'   TEXT_FILE_CONTAINING_ALL_CHARS > l
      env LC_ALL=en_US.UTF-8 sed 's/\(.\)/\U\1\n/g' TEXT_FILE_CONTAINING_ALL_CHARS > u
      paste l u | env LC_ALL=en_US.UTF-8 sort -u > x
      cat x | awk '{if($1 != $2){print}}' > y
      cat y | tr -d '\r' \
        | od -w10000 -t x1 \
        | head -1 \
        | sed -e 's/^0000000 /{{".x/g' -e 's/ 09 /",".x/g' -e 's/ 0a /"},{".x/g' -e 's/ 0a$/"}/' -e 's/ /.x/g' \
        | tr '.' '\\' \
        | xclip
    */
    std::vector<std::pair<std::string, std::string>> map8{ {"\xc9\x92","\xe2\xb1\xb0"},{"\x61","\x41"},{"\xc3\xa1","\xc3\x81"},{"\xc3\xa0","\xc3\x80"},{"\xe1\xba\xaf","\xe1\xba\xae"},{"\xe1\xba\xb1","\xe1\xba\xb0"},{"\xe1\xba\xb5","\xe1\xba\xb4"},{"\xe1\xba\xb3","\xe1\xba\xb2"},{"\xe1\xba\xb7","\xe1\xba\xb6"},{"\xc4\x83","\xc4\x82"},{"\xe1\xba\xa5","\xe1\xba\xa4"},{"\xe1\xba\xa7","\xe1\xba\xa6"},{"\xe1\xba\xab","\xe1\xba\xaa"},{"\xe1\xba\xa9","\xe1\xba\xa8"},{"\xe1\xba\xad","\xe1\xba\xac"},{"\xc3\xa2","\xc3\x82"},{"\xc7\x8e","\xc7\x8d"},{"\xc7\xbb","\xc7\xba"},{"\xc3\xa5","\xc3\x85"},{"\xc7\x9f","\xc7\x9e"},{"\xc3\xa4","\xc3\x84"},{"\xc3\xa3","\xc3\x83"},{"\xc4\x85","\xc4\x84"},{"\xc4\x81","\xc4\x80"},{"\xe1\xba\xa3","\xe1\xba\xa2"},{"\xc8\x83","\xc8\x82"},{"\xe1\xba\xa1","\xe1\xba\xa0"},{"\xc7\xa3","\xc7\xa2"},{"\xc3\xa6","\xc3\x86"},{"\x62","\x42"},{"\xe1\xb8\x87","\xe1\xb8\x86"},{"\x63","\x43"},{"\xc4\x87","\xc4\x86"},{"\xc4\x89","\xc4\x88"},{"\xc4\x8d","\xc4\x8c"},{"\xc4\x8b","\xc4\x8a"},{"\xc3\xa7","\xc3\x87"},{"\x64","\x44"},{"\xc4\x8f","\xc4\x8e"},{"\xc4\x91","\xc4\x90"},{"\xe1\xb8\x91","\xe1\xb8\x90"},{"\xe1\xb8\x8d","\xe1\xb8\x8c"},{"\xe1\xb8\x8f","\xe1\xb8\x8e"},{"\xc3\xb0","\xc3\x90"},{"\x65","\x45"},{"\xc3\xa9","\xc3\x89"},{"\xc3\xa8","\xc3\x88"},{"\xc4\x95","\xc4\x94"},{"\xe1\xba\xbf","\xe1\xba\xbe"},{"\xe1\xbb\x81","\xe1\xbb\x80"},{"\xe1\xbb\x85","\xe1\xbb\x84"},{"\xe1\xbb\x83","\xe1\xbb\x82"},{"\xe1\xbb\x87","\xe1\xbb\x86"},{"\xc3\xaa","\xc3\x8a"},{"\xc4\x9b","\xc4\x9a"},{"\xc3\xab","\xc3\x8b"},{"\xe1\xba\xbd","\xe1\xba\xbc"},{"\xc4\x97","\xc4\x96"},{"\xc4\x99","\xc4\x98"},{"\xe1\xb8\x97","\xe1\xb8\x96"},{"\xc4\x93","\xc4\x92"},{"\xe1\xba\xbb","\xe1\xba\xba"},{"\xc8\x87","\xc8\x86"},{"\xe1\xba\xb9","\xe1\xba\xb8"},{"\xc7\x9d","\xc6\x8e"},{"\x66","\x46"},{"\x67","\x47"},{"\xc7\xb5","\xc7\xb4"},{"\xc4\x9f","\xc4\x9e"},{"\xc4\x9d","\xc4\x9c"},{"\xc7\xa7","\xc7\xa6"},{"\xc4\xa1","\xc4\xa0"},{"\xc4\xa3","\xc4\xa2"},{"\xc9\xa0","\xc6\x93"},{"\x68","\x48"},{"\xc4\xa5","\xc4\xa4"},{"\xc4\xa7","\xc4\xa6"},{"\xe1\xb8\xa9","\xe1\xb8\xa8"},{"\xe1\xb8\xa5","\xe1\xb8\xa4"},{"\xe1\xb8\xab","\xe1\xb8\xaa"},{"\x69","\x49"},{"\xc4\xb1","\x49"},{"\xc3\xad","\xc3\x8d"},{"\xc3\xac","\xc3\x8c"},{"\xc4\xad","\xc4\xac"},{"\xc3\xae","\xc3\x8e"},{"\xc7\x90","\xc7\x8f"},{"\xc3\xaf","\xc3\x8f"},{"\xc4\xa9","\xc4\xa8"},{"\xc4\xaf","\xc4\xae"},{"\xc4\xab","\xc4\xaa"},{"\xe1\xbb\x89","\xe1\xbb\x88"},{"\xc8\x8b","\xc8\x8a"},{"\xe1\xbb\x8b","\xe1\xbb\x8a"},{"\x6a","\x4a"},{"\xc4\xb5","\xc4\xb4"},{"\x6b","\x4b"},{"\xe1\xb8\xb1","\xe1\xb8\xb0"},{"\xc4\xb7","\xc4\xb6"},{"\xe1\xb8\xb3","\xe1\xb8\xb2"},{"\xc6\x99","\xc6\x98"},{"\x6c","\x4c"},{"\xc4\xba","\xc4\xb9"},{"\xc4\xbe","\xc4\xbd"},{"\xc5\x82","\xc5\x81"},{"\xc4\xbc","\xc4\xbb"},{"\xe1\xb8\xb7","\xe1\xb8\xb6"},{"\x6d","\x4d"},{"\xe1\xb8\xbf","\xe1\xb8\xbe"},{"\xe1\xb9\x83","\xe1\xb9\x82"},{"\xc5\x8b","\xc5\x8a"},{"\x6e","\x4e"},{"\xc5\x84","\xc5\x83"},{"\xc5\x88","\xc5\x87"},{"\xc3\xb1","\xc3\x91"},{"\xe1\xb9\x85","\xe1\xb9\x84"},{"\xc5\x86","\xc5\x85"},{"\xe1\xb9\x87","\xe1\xb9\x86"},{"\xe1\xb9\x89","\xe1\xb9\x88"},{"\xc5\x93","\xc5\x92"},{"\x6f","\x4f"},{"\xc3\xb3","\xc3\x93"},{"\xc3\xb2","\xc3\x92"},{"\xc5\x8f","\xc5\x8e"},{"\xe1\xbb\x91","\xe1\xbb\x90"},{"\xe1\xbb\x93","\xe1\xbb\x92"},{"\xe1\xbb\x95","\xe1\xbb\x94"},{"\xe1\xbb\x99","\xe1\xbb\x98"},{"\xc3\xb4","\xc3\x94"},{"\xc7\x92","\xc7\x91"},{"\xc3\xb6","\xc3\x96"},{"\xc5\x91","\xc5\x90"},{"\xc3\xb5","\xc3\x95"},{"\xc3\xb8","\xc3\x98"},{"\xc7\xab","\xc7\xaa"},{"\xc5\x8d","\xc5\x8c"},{"\xe1\xbb\x8f","\xe1\xbb\x8e"},{"\xc8\x8f","\xc8\x8e"},{"\xe1\xbb\x8d","\xe1\xbb\x8c"},{"\xe1\xbb\x9b","\xe1\xbb\x9a"},{"\xe1\xbb\x9d","\xe1\xbb\x9c"},{"\xe1\xbb\xa1","\xe1\xbb\xa0"},{"\xe1\xbb\x9f","\xe1\xbb\x9e"},{"\xe1\xbb\xa3","\xe1\xbb\xa2"},{"\xc6\xa1","\xc6\xa0"},{"\xc9\x94","\xc6\x86"},{"\x70","\x50"},{"\xe1\xb9\x95","\xe1\xb9\x94"},{"\x71","\x51"},{"\x72","\x52"},{"\xc5\x95","\xc5\x94"},{"\xc5\x99","\xc5\x98"},{"\xc5\x97","\xc5\x96"},{"\xe1\xb9\x9b","\xe1\xb9\x9a"},{"\xe1\xb9\x9f","\xe1\xb9\x9e"},{"\x73","\x53"},{"\xc5\x9b","\xc5\x9a"},{"\xc5\x9d","\xc5\x9c"},{"\xc5\xa1","\xc5\xa0"},{"\xc5\x9f","\xc5\x9e"},{"\xe1\xb9\xa3","\xe1\xb9\xa2"},{"\x74","\x54"},{"\xc5\xa5","\xc5\xa4"},{"\xc5\xa3","\xc5\xa2"},{"\xe1\xb9\xad","\xe1\xb9\xac"},{"\xe1\xb9\xaf","\xe1\xb9\xae"},{"\xc8\x95","\xc8\x94"},{"\x75","\x55"},{"\xc3\xba","\xc3\x9a"},{"\xc3\xb9","\xc3\x99"},{"\xc5\xad","\xc5\xac"},{"\xc3\xbb","\xc3\x9b"},{"\xc7\x94","\xc7\x93"},{"\xc5\xaf","\xc5\xae"},{"\xc7\x98","\xc7\x97"},{"\xc7\x9c","\xc7\x9b"},{"\xc3\xbc","\xc3\x9c"},{"\xc5\xb1","\xc5\xb0"},{"\xc5\xa9","\xc5\xa8"},{"\xc5\xb3","\xc5\xb2"},{"\xc5\xab","\xc5\xaa"},{"\xe1\xbb\xa7","\xe1\xbb\xa6"},{"\xe1\xbb\xa5","\xe1\xbb\xa4"},{"\xe1\xb9\xb3","\xe1\xb9\xb2"},{"\xe1\xbb\xa9","\xe1\xbb\xa8"},{"\xe1\xbb\xab","\xe1\xbb\xaa"},{"\xe1\xbb\xaf","\xe1\xbb\xae"},{"\xe1\xbb\xad","\xe1\xbb\xac"},{"\xe1\xbb\xb1","\xe1\xbb\xb0"},{"\xc6\xb0","\xc6\xaf"},{"\x76","\x56"},{"\x77","\x57"},{"\xc5\xb5","\xc5\xb4"},{"\x78","\x58"},{"\xe1\xba\x8b","\xe1\xba\x8a"},{"\x79","\x59"},{"\xc3\xbd","\xc3\x9d"},{"\xe1\xbb\xb3","\xe1\xbb\xb2"},{"\xc5\xb7","\xc5\xb6"},{"\xc3\xbf","\xc5\xb8"},{"\xe1\xbb\xb9","\xe1\xbb\xb8"},{"\x7a","\x5a"},{"\xc5\xba","\xc5\xb9"},{"\xc5\xbe","\xc5\xbd"},{"\xc5\xbc","\xc5\xbb"},{"\xc6\xb6","\xc6\xb5"},{"\xe1\xba\x93","\xe1\xba\x92"},{"\xe1\xba\x95","\xe1\xba\x94"},{"\xc8\xa5","\xc8\xa4"},{"\xc3\xbe","\xc3\x9e"},{"\xca\x92","\xc6\xb7"},{"\xce\xb1","\xce\x91"},{"\xce\xac","\xce\x86"},{"\xce\xb2","\xce\x92"},{"\xce\xb3","\xce\x93"},{"\xce\xb4","\xce\x94"},{"\xce\xb5","\xce\x95"},{"\xce\xad","\xce\x88"},{"\xce\xb6","\xce\x96"},{"\xce\xb7","\xce\x97"},{"\xce\xae","\xce\x89"},{"\xce\xb8","\xce\x98"},{"\xce\xb9","\xce\x99"},{"\xce\xaf","\xce\x8a"},{"\xcf\x8a","\xce\xaa"},{"\xce\xba","\xce\x9a"},{"\xce\xbb","\xce\x9b"},{"\xce\xbc","\xce\x9c"},{"\xce\xbd","\xce\x9d"},{"\xce\xbe","\xce\x9e"},{"\xce\xbf","\xce\x9f"},{"\xcf\x8c","\xce\x8c"},{"\xcf\x80","\xce\xa0"},{"\xcf\x83","\xce\xa3"},{"\xcf\x82","\xce\xa3"},{"\xcf\x84","\xce\xa4"},{"\xcf\x85","\xce\xa5"},{"\xcf\x8d","\xce\x8e"},{"\xcf\x8b","\xce\xab"},{"\xcf\x86","\xce\xa6"},{"\xcf\x87","\xce\xa7"},{"\xcf\x88","\xce\xa8"},{"\xcf\x89","\xce\xa9"},{"\xcf\x8e","\xce\x8f"},{"\xd0\xb0","\xd0\x90"},{"\xd3\x93","\xd3\x92"},{"\xd3\x95","\xd3\x94"},{"\xd0\xb1","\xd0\x91"},{"\xd0\xb2","\xd0\x92"},{"\xd0\xb3","\xd0\x93"},{"\xd2\x93","\xd2\x92"},{"\xd2\x91","\xd2\x90"},{"\xd0\xb4","\xd0\x94"},{"\xd1\x93","\xd0\x83"},{"\xd1\x92","\xd0\x82"},{"\xd0\xb5","\xd0\x95"},{"\xd1\x90","\xd0\x80"},{"\xd3\x99","\xd3\x98"},{"\xd1\x94","\xd0\x84"},{"\xd1\x91","\xd0\x81"},{"\xd0\xb6","\xd0\x96"},{"\xd0\xb7","\xd0\x97"},{"\xd2\x99","\xd2\x98"},{"\xd1\x95","\xd0\x85"},{"\xd0\xb8","\xd0\x98"},{"\xd3\xa3","\xd3\xa2"},{"\xd1\x96","\xd0\x86"},{"\xd1\x97","\xd0\x87"},{"\xd0\xb9","\xd0\x99"},{"\xd1\x98","\xd0\x88"},{"\xd0\xba","\xd0\x9a"},{"\xd2\x9b","\xd2\x9a"},{"\xd3\x84","\xd3\x83"},{"\xd2\xa1","\xd2\xa0"},{"\xd0\xbb","\xd0\x9b"},{"\xd1\x99","\xd0\x89"},{"\xd0\xbc","\xd0\x9c"},{"\xd0\xbd","\xd0\x9d"},{"\xd2\xa3","\xd2\xa2"},{"\xd1\x9a","\xd0\x8a"},{"\xd0\xbe","\xd0\x9e"},{"\xd3\xa7","\xd3\xa6"},{"\xd3\xa9","\xd3\xa8"},{"\xd0\xbf","\xd0\x9f"},{"\xd1\x80","\xd0\xa0"},{"\xd1\x81","\xd0\xa1"},{"\xd2\xab","\xd2\xaa"},{"\xd1\x82","\xd0\xa2"},{"\xd1\x9c","\xd0\x8c"},{"\xd1\x9b","\xd0\x8b"},{"\xd1\x83","\xd0\xa3"},{"\xd3\xb1","\xd3\xb0"},{"\xd2\xb1","\xd2\xb0"},{"\xd2\xaf","\xd2\xae"},{"\xd1\x9e","\xd0\x8e"},{"\xd1\x84","\xd0\xa4"},{"\xd1\x85","\xd0\xa5"},{"\xd2\xb3","\xd2\xb2"},{"\xd2\xbb","\xd2\xba"},{"\xd1\x86","\xd0\xa6"},{"\xd1\x87","\xd0\xa7"},{"\xd1\x9f","\xd0\x8f"},{"\xd1\x88","\xd0\xa8"},{"\xd1\x89","\xd0\xa9"},{"\xd1\x8a","\xd0\xaa"},{"\xd1\x8b","\xd0\xab"},{"\xd1\x8c","\xd0\xac"},{"\xd1\x8d","\xd0\xad"},{"\xd1\x8e","\xd0\xae"},{"\xd1\x8f","\xd0\xaf"},{"\xd5\xa1","\xd4\xb1"},{"\xd5\xa3","\xd4\xb3"},{"\xd5\xa5","\xd4\xb5"},{"\xd5\xab","\xd4\xbb"},{"\xd5\xac","\xd4\xbc"},{"\xd5\xb2","\xd5\x82"},{"\xd5\xb8","\xd5\x88"},{"\xd5\xbd","\xd5\x8d"},{"\xd5\xbe","\xd5\x8e"},{"\xd5\xbf","\xd5\x8f"},{"\xd6\x80","\xd5\x90"},{"\xd6\x81","\xd5\x91"} };
    for (auto p8 : map8) {
      auto from = utf8ToUnicodeString(p8.first);
      auto to   = utf8ToUnicodeString(p8.second);
      ABORT_IF(from.size() != 1 || to.size() != 1, "Incorrect character encoding??");
      toUpperMap.insert(std::make_pair(from.front(), to.front()));
      toLowerMap.insert(std::make_pair(to.front(), from.front()));
    }
  }
  char32_t toUpperOrLower(char32_t c, bool toLower) const { return mapChar(toLower ? toLowerMap : toUpperMap, c); }
private:
  static char32_t mapChar(const std::map<char32_t, char32_t>& map, char32_t c) {
    auto iter = map.find(c);
    if (iter == map.end())
      return c;
    else
      return iter->second;
  }
};

// shared implementation of toUpper, toLower, and toCapitalized
static std::string utf8ToUpperOrLower(const std::string& s, bool toLower, bool toInitCap) {
  static UTF8Mapper utf8Mapper;
  auto ws = utf8ToUnicodeString(s);
  for (auto& c : ws) {
    c = utf8Mapper.toUpperOrLower(c, toLower);
    if (toInitCap)
      toLower = true;
  }
  return utf8FromUnicodeString(ws);
}

std::string utf8ToUpper(const std::string& s)     { return utf8ToUpperOrLower(s, /*toLower=*/false, /*toInitCap=*/false); }
std::string utf8ToLower(const std::string& s)     { return utf8ToUpperOrLower(s, /*toLower=*/true , /*toInitCap=*/false); }
std::string utf8Capitalized(const std::string& s) { return utf8ToUpperOrLower(s, /*toLower=*/false, /*toInitCap=*/true ); }

// convert an English sentence to title case
// Since title case is an English thing, we only consider ASCII characters.
std::string toEnglishTitleCase(const std::string& s) {
  auto res = s;
  // process token by token
  const std::string wordStartChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  const std::string wordInternalChars = wordStartChars + "'"; // don't title-case letters after word-internal apostrophe
  const std::set<std::string> exceptions = { // from moses-scripts/scripts/recaser/detruecase.perl
    "a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with"
  };
  const std::set<char> wordPredChars = {' ', '"', '\'', '-'}; // only capitalize words if following these characters (to avoid upper-casing word-internal SPM units)
  // These are tokenization heuristics, which may be incomplete.
  size_t epos = 0;
  for(size_t pos = epos; pos < res.size(); pos = epos) {
    // locate the next word
    pos = res.find_first_of(wordStartChars, pos); // find first letter
    if (pos == std::string::npos)
      break;
    epos = res.find_first_not_of(wordInternalChars, pos + 1); // find first non-letter
    if (epos == std::string::npos)
      epos = res.size();
    auto word = res.substr(pos, epos - pos);
    // further checks of the word
    if (res[pos] < 'a' || res[pos] > 'z') // skip if already upper-case
      continue;
    if (pos > 0 && wordPredChars.find(res[pos-1]) == wordPredChars.end()) // skip if unexpected char before the word
      continue;
    if (exceptions.find(word) != exceptions.end()) // skip if in the exception list
      continue;
    // upper-case it
    res[pos] -= 'a' - 'A';
  }
  return res;
}

std::string findReplace(const std::string& in, const std::string& what, const std::string& withWhat, bool all /*= false*/) {
  std::string res = in;
  for(size_t pos = res.find(what); pos != std::string::npos; pos = res.find(what, pos + withWhat.length())) {
    res.replace(pos, what.length(), withWhat);
    if (!all)
      break;
  }
  return res;
}

double parseDouble(std::string s) {
  double res;
  char c;  // dummy char -- if we succeed to parse this, then there were extraneous characters after the number
  auto rc = sscanf(s.c_str(), "%lf%c", &res, &c);
  ABORT_IF(rc != 1, "Mal-formed number: {}", s);
  return res;
}

// parses a user-friendly number that can have commas and (some) units
double parseNumber(std::string param) {
  // get unit prefix
  double factor = 1.;
  if(!param.empty() && param.back() >= 'A') {
    switch(param.back()) {
      case 'k': factor = 1.e3;  break;
      case 'K': factor = 1.e3;  break; // not technically correct but often used for k
      case 'M': factor = 1.e6;  break;
      case 'G': factor = 1.e9;  break;
      case 'T': factor = 1.e12; break;
      default: ABORT("Invalid or unsupported unit prefix '{}' in {}", param.back(), param);
    }
    param.pop_back();
  }
  // we allow users to place commas in numbers (note: we are not actually verifying that they are in
  // the right place)
  auto it = std::remove_if(param.begin(), param.end(), [](char c) { return c == ','; }); // use return value for future-proofing against nodiscard warning
  param.erase(it, param.end()); // since we have that iterator now, we might as well shrink to fit
  return factor * parseDouble(param);
}

}  // namespace utils
}  // namespace marian