Welcome to mirror list, hosted at ThFree Co, Russian Federation.

replace-unicode-punctuation.perl « tokenizer « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: cda69ddf7f490626d022d30a7ab1b68b8bcca573 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env perl

use warnings;
use strict;

#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");

# Replace CJK / full-width punctuation and digits with ASCII equivalents.
#
# Reads lines from STDIN and prints the rewritten lines to STDOUT.
#
# The characters to replace are listed as numeric Unicode code points rather
# than as literal characters, so this file stays pure ASCII and the patterns
# cannot be silently folded into their ASCII look-alikes by an editor or a
# normalising tool (a folded pattern such as s/?/\?/ or s/(/\(/ does not even
# compile, and s/. */. / would rewrite every character of the input).
#
# No I/O layers are set on STDIN/STDOUT, so input is handled as raw bytes:
# each code point is encoded to its UTF-8 byte sequence once at start-up and
# matched byte-wise.  Bytes that are not part of a listed sequence pass
# through untouched.

# [ code point, ASCII replacement ] for every one-to-one replacement.
my @PUNCTUATION_MAP = (
  [ 0xFF0C, ','   ],    # FULLWIDTH COMMA
  [ 0x3001, ','   ],    # IDEOGRAPHIC COMMA
  [ 0x201D, '"'   ],    # RIGHT DOUBLE QUOTATION MARK
  [ 0x201C, '"'   ],    # LEFT DOUBLE QUOTATION MARK
  [ 0x2236, ':'   ],    # RATIO
  [ 0xFF1A, ':'   ],    # FULLWIDTH COLON
  [ 0xFF1F, '?'   ],    # FULLWIDTH QUESTION MARK
  [ 0x300A, '"'   ],    # LEFT DOUBLE ANGLE BRACKET
  [ 0x300B, '"'   ],    # RIGHT DOUBLE ANGLE BRACKET
  [ 0xFF09, ')'   ],    # FULLWIDTH RIGHT PARENTHESIS
  [ 0xFF01, '!'   ],    # FULLWIDTH EXCLAMATION MARK
  [ 0xFF08, '('   ],    # FULLWIDTH LEFT PARENTHESIS
  [ 0xFF1B, ';'   ],    # FULLWIDTH SEMICOLON
  [ 0x300D, '"'   ],    # RIGHT CORNER BRACKET
  [ 0x300C, '"'   ],    # LEFT CORNER BRACKET
  [ 0xFF5E, '~'   ],    # FULLWIDTH TILDE
  [ 0x2019, "'"   ],    # RIGHT SINGLE QUOTATION MARK
  [ 0x2026, '...' ],    # HORIZONTAL ELLIPSIS
  [ 0x2501, '-'   ],    # BOX DRAWINGS HEAVY HORIZONTAL
  [ 0x3008, '<'   ],    # LEFT ANGLE BRACKET
  [ 0x3009, '>'   ],    # RIGHT ANGLE BRACKET
  [ 0x3010, '['   ],    # LEFT BLACK LENTICULAR BRACKET
  [ 0x3011, ']'   ],    # RIGHT BLACK LENTICULAR BRACKET
  [ 0xFF05, '%'   ],    # FULLWIDTH PERCENT SIGN
  # FULLWIDTH DIGIT ZERO .. NINE (U+FF10 .. U+FF19) map to '0' .. '9'.
  # Digit one is mapped to '1' like its siblings, not to a quote character.
  ( map { [ 0xFF10 + $_, "$_" ] } 0 .. 9 ),
);

# Full stops are special: any run of spaces after them is collapsed so the
# result is always ". " (period plus exactly one space).
my @FULL_STOPS = (
  0x3002,               # IDEOGRAPHIC FULL STOP
  0xFF0E,               # FULLWIDTH FULL STOP
);

# Return the UTF-8 byte sequence that encodes the given code point.
sub _utf8_bytes {
  my ($codepoint) = @_;
  my $bytes = chr($codepoint);
  utf8::encode($bytes);
  return $bytes;
}

# UTF-8 byte sequence => ASCII replacement, derived from @PUNCTUATION_MAP.
my %ASCII_FOR_BYTES;
for my $pair (@PUNCTUATION_MAP) {
  $ASCII_FOR_BYTES{ _utf8_bytes($pair->[0]) } = $pair->[1];
}

# Precompiled byte-level patterns (built once, reused for every line).
my $PUNCTUATION_ALTERNATION
  = join '|', map { quotemeta($_) } sort keys %ASCII_FOR_BYTES;
my $PUNCTUATION_RE = qr/($PUNCTUATION_ALTERNATION)/;

my $FULL_STOP_ALTERNATION
  = join '|', map { quotemeta(_utf8_bytes($_)) } @FULL_STOPS;
my $FULL_STOP_RE = qr/(?:$FULL_STOP_ALTERNATION) */;

# Return a copy of $line (a UTF-8 encoded byte string) with every listed
# character replaced by its ASCII equivalent.  The argument is not modified.
sub replace_unicode_punctuation {
  my ($line) = @_;
  $line =~ s/$FULL_STOP_RE/. /g;
  $line =~ s/$PUNCTUATION_RE/$ASCII_FOR_BYTES{$1}/g;
  return $line;
}

while (my $line = <STDIN>) {
  print replace_unicode_punctuation($line);
}