scripts/recaser/truecase.perl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116

#!/usr/bin/env perl 

# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
use strict;
use Getopt::Long "GetOptions";

# Text is read from STDIN and written to STDOUT as UTF-8.
binmode($_, ":utf8") for (\*STDIN, \*STDOUT);

# Command-line switches:
#   --model MODEL       truecasing model file (required)
#   -b / --unbuffered   flush output after every write
my ($MODEL, $UNBUFFERED);
my $switches_ok = GetOptions(
    'model=s'      => \$MODEL,
    'b|unbuffered' => \$UNBUFFERED,
);
# Print the usage line and stop when parsing failed or no model was given.
die("truecase.perl --model MODEL [-b] < in > out")
    unless $switches_ok && defined($MODEL);
$| = 1 if $UNBUFFERED;

# Load the truecasing model named by $MODEL into two lookup tables:
#   %BEST  - lowercased form of a line's first field => that first field
#   %KNOWN - every casing recorded from the model     => 1
# NOTE(review): each line presumably looks like
#   "best (count/total) alt1 (count) alt2 (count) ..."
# i.e. the alternatives sit at the odd positions after the first field, each
# followed by a count -- confirm against the script that writes the model.
# Dies if the model file cannot be opened.
my (%BEST,%KNOWN);
# Three-argument open with a lexical handle: the file name is never
# interpreted as an open mode or a pipe, whatever characters it contains.
open(my $model_fh, '<:utf8', $MODEL)
    or die("ERROR: could not open '$MODEL': $!");
while(my $entry = <$model_fh>) {
  my ($word,@OPTIONS) = split(' ', $entry);
  next unless defined($word);   # blank line: nothing to record
  $BEST{ lc($word) } = $word;
  $KNOWN{ $word } = 1;
  # odd positions below the last index are the alternative casings
  for(my $i=1;$i<$#OPTIONS;$i+=2) {
    $KNOWN{ $OPTIONS[$i] } = 1;
  }
}
close($model_fh);

# Tokens after which the next token is treated as the start of a sentence.
my %SENTENCE_END = map { $_ => 1 } (".", ":", "?", "!");
# Tokens that leave the "sentence start" state unchanged, so that the token
# following them is still handled as a sentence start.
my %DELAYED_SENTENCE_START = map { $_ => 1 }
    ("(", "[", "\"", "'", "&apos;", "&quot;", "&#91;", "&#93;");

# Truecase STDIN line by line and print the result to STDOUT.
#
# Each line is split into words and XML markup by split_xml(). For every word
# only the part before the first "|" is recased; the remaining factors and all
# markup are printed unchanged. Casing rules, in order:
#   1. at a sentence start, use the model's best casing if the word is in %BEST
#   2. otherwise keep a word whose exact casing is in %KNOWN
#   3. otherwise use the model's best casing if there is one
#   4. otherwise print the word as it is
while(my $line = <STDIN>) {
  # chomp, not chop: a final line without a trailing newline must not lose
  # its last character.
  chomp($line);
  my ($WORD,$MARKUP) = split_xml($line);
  my $sentence_start = 1;
  for my $i (0 .. $#$WORD) {
    # a space separates words unless markup is printed in front of this word
    print " " if $i && $$MARKUP[$i] eq '';
    print $$MARKUP[$i];

    # split off the surface form from the other factors ("word|factor|...")
    my ($word,$otherfactors);
    if ($$WORD[$i] =~ /^([^\|]+)(.*)/) {
      ($word,$otherfactors) = ($1,$2);
    }
    else {
      # token starts with "|": nothing to split off
      ($word,$otherfactors) = ($$WORD[$i],"");
    }

    my $best = $BEST{lc($word)};
    if ($sentence_start && defined($best)) {
      print $best; # truecase sentence start
    }
    elsif (defined($KNOWN{$word})) {
      print $word; # don't change known words
    }
    elsif (defined($best)) {
      print $best; # truecase otherwise unknown words
    }
    else {
      print $word; # unknown, nothing to do
    }
    print $otherfactors;

    # sentence-final punctuation starts a new sentence; opening brackets and
    # quotes leave the current state untouched; any other token ends it
    if    ( defined($SENTENCE_END{ $word }))           { $sentence_start = 1; }
    elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; }
  }
  # markup that follows the last word of the line
  print $$MARKUP[$#$MARKUP];
  print "\n";
}

# Separate one line of text into word tokens and the XML-style markup
# between them.
#
# Parameters:
#   $line - the text to split (the caller passes a line without its newline)
# Returns a two-element list (\@WORD, \@MARKUP):
#   @WORD   - the tokens in order; a token keeps its "|"-separated factors
#   @MARKUP - always one element more than @WORD; $MARKUP[$i] holds the tags
#             found in front of $WORD[$i], each followed by one space, and the
#             last element holds the tags after the final token with its
#             trailing space removed
# Dies with "ERROR: huh? ..." if a non-blank remainder matches no pattern.
sub split_xml {
  my ($line) = @_;
  my @WORD;
  my @MARKUP = ("");   # markup in front of the first word
  while($line =~ /\S/) {
    # XML tag
    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
      my ($potential_xml,$line_next) = ($1,$2);
      # exception for factor that is an XML tag: the tag directly follows
      # (no whitespace) a word ending in "|", so it belongs to that word
      if ($line =~ /^\S/ && @WORD && $WORD[-1] =~ /\|$/) {
        $WORD[-1] .= $potential_xml;
        # Re-attach the rest of the factored token as well: the separating
        # pipes and any plain factor text after them. Taking only the pipes
        # would turn the remaining factor into a separate word.
        if ($line_next =~ /^(\|+[^\s<>]*)(.*)$/) {
          $WORD[-1] .= $1;
          $line_next = $2;
        }
      }
      else {
        $MARKUP[-1] .= $potential_xml." ";
      }
      $line = $line_next;
    }
    # non-XML text up to the next '<', '>' or whitespace; failing that,
    # '<' or '>' occurs in the word, but it's not an XML tag
    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
      push @WORD, $1;
      push @MARKUP, "";   # slot for markup in front of the next word
      $line = $2;
    }
    else {
      die("ERROR: huh? $line\n");
    }
  }
  # drop the space appended after the last trailing tag (no-op if none)
  chop($MARKUP[-1]);
  return (\@WORD,\@MARKUP);
}