diff options
author | Philipp Koehn <phi@jhu.edu> | 2015-05-02 00:46:14 +0300 |
---|---|---|
committer | Philipp Koehn <phi@jhu.edu> | 2015-05-02 00:46:14 +0300 |
commit | de6a9bd1b3ed1ecacf4e76f68ee1ef37f21d90b4 (patch) | |
tree | d561af03c9982420da1c83e81404ffb6eec6e1d2 /scripts | |
parent | b369699661952e8c4d53568a6deddee7756e2c87 (diff) |
minor updates to factor scripts; brown-cluster may now run other scripts (e.g., truecaser) before assigning classes
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/training/wrappers/make-factor-de-lemma.perl | 33 | ||||
-rwxr-xr-x | scripts/training/wrappers/make-factor-en-porter.perl | 10 |
2 files changed, 43 insertions, 0 deletions
diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl new file mode 100755 index 000000000..db978317e --- /dev/null +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -0,0 +1,33 @@ +#!/usr/bin/perl -w
+
+use strict;
+use Encode;
+use FindBin qw($RealBin);
+
+# Build a lemma factor for a German corpus.
+#
+# Usage: make-factor-de-lemma.perl IN OUT TMPDIR
+#   IN     - input corpus; it is de-escaped and passed through
+#            unicode2latin1.perl before tagging
+#   OUT    - receives the space-separated lemmas, passed through
+#            escape-special-chars.perl, one line per line of tagger output
+#   TMPDIR - directory for the intermediate files (created if missing)
+#
+# Dies if an argument is missing, if any helper command reports failure,
+# if the tagger output cannot be read, or if a token is not of the form
+# word_POS_lemma.
+my ($in,$out,$tmpdir) = @ARGV;
+die("ERROR: syntax: $0 IN OUT TMPDIR\n") unless defined($tmpdir);
+
+my $tok_file   = "$tmpdir/tok.$$";
+my $lopar_file = "$tmpdir/lopar.$$";
+
+# Backticks are kept (instead of system) so that whatever the helpers write
+# to stdout is still swallowed; only the exit status checks are new.
+# For a shell pipeline, $? reflects the last stage only.
+`mkdir -p $tmpdir`;
+die("ERROR: could not create directory '$tmpdir' (status $?)") if $?;
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tok_file`;
+die("ERROR: preparing tagger input '$tok_file' failed (status $?)") if $?;
+`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tok_file $lopar_file`;
+die("ERROR: run-lopar-tagger.perl failed (status $?)") if $?;
+
+open(my $lopar_fh, '<', $lopar_file)
+  or die("ERROR: cannot read tagger output '$lopar_file': $!");
+my $escape_cmd = "$RealBin/../../tokenizer/escape-special-chars.perl > $out";
+open(my $out_fh, '|-', $escape_cmd)
+  or die("ERROR: cannot start '$escape_cmd': $!");
+
+while(my $line = <$lopar_fh>) {
+  chomp($line);
+  my @lemmas;
+  # split ' ' skips leading whitespace and collapses runs of whitespace,
+  # so no separate normalisation of the line is needed.
+  foreach my $token (split(' ', $line)) {
+    # Tokens look like word_POS_lemma. The POS part contains no underscore;
+    # the first group is greedy, so extra underscores end up in the word.
+    die("ERROR: choked on token '$token'") unless $token =~ /^(.+)_([^_]+)_(.+)$/;
+    my ($word,$pos,$lemma) = ($1,$2,$3);
+    # Keep only the first alternative when several are joined with '|'.
+    $lemma =~ s/\|.+$//;
+    # A lemma of the form <...> is replaced by the surface word
+    # (presumably the tagger's placeholder for an unknown lemma - confirm).
+    $lemma = $word if $lemma =~ /^\<.+\>$/;
+    # The tagger input was converted by unicode2latin1.perl, so its output
+    # is treated as ISO-8859-1 and converted back to UTF-8.
+    push(@lemmas, encode('utf8', decode('iso-8859-1', $lemma)));
+  }
+  print {$out_fh} join(" ", @lemmas), "\n";
+}
+close($lopar_fh);
+# Closing a pipe waits for the child; a false result means a write error
+# or a non-zero exit of escape-special-chars.perl.
+close($out_fh)
+  or die("ERROR: '$escape_cmd' failed (status $?): $!");
diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl new file mode 100755 index 000000000..749dc1318 --- /dev/null +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+
+# Build a Porter-stem factor for an English corpus.
+#
+# Usage: make-factor-en-porter.perl IN OUT TMPDIR
+#   IN     - input corpus; it is de-escaped before stemming
+#   OUT    - receives the stemmer output, passed through
+#            escape-special-chars.perl
+#   TMPDIR - directory for the intermediate file (created if missing)
+#
+# Dies if an argument is missing or if a helper command reports failure.
+my ($in,$out,$tmpdir) = @ARGV;
+die("ERROR: syntax: $0 IN OUT TMPDIR\n") unless defined($tmpdir);
+
+# Create the working directory first, as make-factor-de-lemma.perl does;
+# otherwise the redirect into $porter_in fails when it does not exist yet.
+`mkdir -p $tmpdir`;
+die("ERROR: could not create directory '$tmpdir' (status $?)") if $?;
+
+my $porter_in = "$tmpdir/porter-in.$$";
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in > $porter_in`;
+die("ERROR: preparing stemmer input '$porter_in' failed (status $?)") if $?;
+# For a shell pipeline, $? reflects the last stage only.
+`/home/pkoehn/statmt/bin/porter-stemmer $porter_in | $RealBin/../../tokenizer/escape-special-chars.perl > $out`;
+die("ERROR: stemming into '$out' failed (status $?)") if $?;
|