Welcome to mirror list, hosted at ThFree Co, Russian Federation.

detruecase.perl « recaser « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 4a1b3beae92d64cfa58d500d1fc2057887f48592 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/perl -w

use strict;
use Getopt::Long "GetOptions";

# Read and write the standard streams as UTF-8 text.
binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";


# Command-line options:
#   --headline FILE   file scanned for <hl> and <seg markup to find headline lines
#   --in FILE         read the text from FILE instead of STDIN
my ($SRC, $INFILE);
GetOptions(
    'headline=s' => \$SRC,
    'in=s'       => \$INFILE,
) or die("detruecase.perl < in > out");

# Tokens after which the next token starts a new sentence.
my %SENTENCE_END = map { ($_ => 1) } (".", ":", "?", "!");

# Opening brackets and quotes: they do not consume a pending sentence start.
my %DELAYED_SENTENCE_START = map { ($_ => 1) } ("(", "[", "\"", "'");

# lowercase even in headline
# (the "al-.+" and "el-.+" entries are written as regular-expression patterns)
my %ALWAYS_LOWER = map { ($_ => 1) } qw(
    a after against al-.+ and any as at be because between by during el-.+
    for from his in is its last not of off on than the their this to was
    were which will with
);

# find out about the headlines
#
# When --headline FILE is given, scan FILE and record in @HEADLINE one flag
# per line that starts with "<seg": 1 if the most recent headline marker seen
# so far was an opening <hl>, 0 otherwise.
my @HEADLINE;
if (defined($SRC)) {
    # Fail loudly if the file cannot be read; otherwise every line would
    # silently be treated as a non-headline.
    open(my $src_fh, '<', $SRC)
        || die("ERROR: could not open headline file '$SRC': $!");
    my $headline_flag = 0;
    while (my $src_line = <$src_fh>) {
        $headline_flag = 1 if $src_line =~ /<hl>/;
        # "." matches any single character, so this covers "</hl>".
        $headline_flag = 0 if $src_line =~ /<.hl>/;
        next unless $src_line =~ /^<seg/;
        push @HEADLINE, $headline_flag;
    }
    close($src_fh);
}

# Main loop: feed every input line to process() together with its zero-based
# line number. Input comes from the --in file when one was given, otherwise
# from STDIN (already switched to UTF-8 at the top of the script).
my $sentence = 0;
my $input = \*STDIN;
# Test for a non-empty name rather than truth so that a file called "0" works.
my $from_file = defined($INFILE) && $INFILE ne '';
if ($from_file) {
    # Three-argument open: the file name is never interpreted as a mode.
    open(my $in_fh, '<', $INFILE)
        || die("ERROR: could not open file '$INFILE': $!");
    binmode($in_fh, ":utf8");
    $input = $in_fh;
}
while (defined(my $line = <$input>)) {
    process($line, $sentence++);
}
close($input) if $from_file;

# Detruecase one line of text and print the result to STDOUT.
#
# Arguments:
#   $_[0]  the line to process; the trailing newline and surrounding
#          whitespace are removed and the rest is split on whitespace
#   $_[1]  zero-based number of this line, used as the index into @HEADLINE
#
# The first character of a token is uppercased when the token is first on the
# line or follows a token listed in %SENTENCE_END. Tokens listed in
# %DELAYED_SENTENCE_START leave a pending sentence start untouched, so the
# token after them is uppercased instead. When --headline was given and the
# line is flagged in @HEADLINE, every token is additionally uppercased unless
# it matches an entry of %ALWAYS_LOWER.
sub process {
    my ($line, $line_index) = @_;
    chomp($line);
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my @WORD = split(/\s+/, $line);

    # uppercase at sentence start
    my $sentence_start = 1;
    for my $word (@WORD) {
        uppercase(\$word) if $sentence_start;
        if (defined($SENTENCE_END{$word})) { $sentence_start = 1; }
        elsif (!defined($DELAYED_SENTENCE_START{$word})) { $sentence_start = 0; }
    }

    # uppercase headlines
    # Index @HEADLINE with the line number passed in by the caller. The global
    # $sentence counter has already been advanced by the caller's post-increment
    # when this code runs, so using it here would read the wrong entry.
    if (defined($SRC) && defined($line_index) && $HEADLINE[$line_index]) {
        # Some %ALWAYS_LOWER keys are patterns ("al-.+", "el-.+"), which an
        # exact hash lookup can never match; treat every key as a regular
        # expression that has to match the whole token.
        my $alternatives = join('|', map { "(?:$_)" } sort keys %ALWAYS_LOWER);
        my $always_lower = qr/\A(?:$alternatives)\z/;
        for my $word (@WORD) {
            uppercase(\$word) unless $word =~ $always_lower;
        }
    }

    # output: tokens separated by single spaces, newline-terminated
    print join(" ", @WORD), "\n";
}

# Uppercase the first character of a string in place.
#
#   $_[0]  reference to the scalar holding the string to modify
#
# The rest of the string is left unchanged. An undefined or empty string is
# left alone; taking substr(..., 1) of an empty string would be out of range
# and trigger warnings under -w.
sub uppercase {
    my ($word_ref) = @_;
    return unless defined($$word_ref) && length($$word_ref);
    $$word_ref = uc(substr($$word_ref, 0, 1)) . substr($$word_ref, 1);
    return;
}