Welcome to mirror list, hosted at ThFree Co, Russian Federation.

pre-tokenizer.perl « tokenizer « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
blob: 499671b44de5197122049d437ea1407a253f9f61 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env perl 

# script for preprocessing language data prior to tokenization
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.

use warnings;
use strict;
use Getopt::Std;

# Put both standard streams into UTF-8 mode so the substitutions further down
# operate on decoded characters rather than on raw bytes.
for my $stream (\*STDIN, \*STDOUT)
{
  binmode($stream, ":utf8");
}

# Print a short help text (purpose, invocation, the -b switch) to the
# currently selected output handle.
# Returns the result of print, i.e. a true value on success.
sub usage
{
  return print
    "Script for preprocessing of raw language data prior to tokenization\n",
    "Usage: $0 -l <language tag> [-b]\n",
    "       -b: no buffering\n";
}

my %args;
# Parse the command line with getopts(): in the spec 'l:hb' the colon marks -l
# as taking a value, while -h and -b are plain boolean flags.  (getopt() treats
# every character of its spec as a value-taking switch, so -h and -b would
# swallow the following argument and could end up undefined.)
unless (getopts('l:hb', \%args))
  {
    # getopts() has already warned about the unrecognised switch on STDERR.
    usage();
    exit(1);
  }
if ($args{'h'})
  {
    usage();
    exit(0);
  }
$| = 1 if $args{'b'};   # -b: flush the selected output handle after every print

# A missing -l falls through to the pass-through branch; normalising it to ''
# keeps the string comparisons below free of "uninitialized value" warnings.
my $lang = defined $args{'l'} ? $args{'l'} : '';

# NOTE(review): <> also reads files named on the command line; confirm that
# such files get decoded as UTF-8 the same way STDIN is.
if ($lang eq "en")
  {
    # English: re-attach an "s" that was split off after an apostrophe,
    # e.g. "it' s" -> "it's".
    while (<>)
      {
        s/([[:alpha:]]\') s\b/$1s/g;
        print;
      }
  }
elsif ($lang eq "fr")
  {
    # French: remove whitespace between a single elided letter plus apostrophe
    # and the following word, e.g. "l' homme" -> "l'homme".
    while (<>)
      {
        s/\b([[:alpha:]]\')\s+(?=[[:alpha:]])/$1/g;
        print;
      }
  }
else
  {
    # Any other (or no) language tag: copy the input through unchanged.
    print while <>;
  }