Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorphikoehn <pkoehn@inf.ed.ac.uk>2012-05-30 03:58:18 +0400
committerphikoehn <pkoehn@inf.ed.ac.uk>2012-05-30 03:58:18 +0400
commit2e370ed11b0cd8989118891dc4385619837dd39f (patch)
treee0b8ab423399453a90bc4cacabc323289620042c /scripts/training
parentfd577d7a65cab923b9102d61873a032654d573a1 (diff)
more escaping in tokenizer; wrapper for berkeley parser (german)
Diffstat (limited to 'scripts/training')
-rwxr-xr-xscripts/training/wrappers/berkeleyparsed2mosesxml.perl36
-rwxr-xr-xscripts/training/wrappers/mosesxml2berkeleyparsed.perl44
-rwxr-xr-xscripts/training/wrappers/parse-de-berkeley.perl48
-rwxr-xr-xscripts/training/wrappers/syntax-hyphen-splitting.perl43
4 files changed, 171 insertions, 0 deletions
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
new file mode 100755
index 000000000..6a4ed731e
--- /dev/null
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -w
+
+# Reads bracketed parser output from STDIN, one parse per line, and writes
+# the same trees to STDOUT as <tree label="..."> ... </tree> markup.
+
+use strict;
+
+# Literal replacements applied to each line before the brackets are turned
+# into markup. The order matters: '&' has to be escaped first, otherwise the
+# ampersands of the entities added afterwards would be escaped again.
+my @ESCAPES = (
+    [ qr/\&/, '&amp;'  ],    # escape escape
+    [ qr/\|/, '&bar;'  ],    # factor separator
+    [ qr/\</, '&lt;'   ],    # xml
+    [ qr/\>/, '&gt;'   ],    # xml
+    [ qr/\'/, '&apos;' ],    # xml
+    [ qr/\"/, '&quot;' ],    # xml
+    [ qr/\[/, '&#91;'  ],    # syntax non-terminal
+    [ qr/\]/, '&#93;'  ],    # syntax non-terminal
+);
+
+while (defined(my $parse = <STDIN>)) {
+    # a line starting with "(())" is treated as a parse failure:
+    # emit an empty line in its place
+    if ($parse =~ /^\(\(\)\)/) {
+        print "\n";
+        next;
+    }
+
+    # give the unlabeled outermost bracket the label TOP
+    $parse =~ s/^\( /\(TOP /;
+
+    # escape special characters
+    foreach my $rule (@ESCAPES) {
+        my ($pattern, $entity) = @$rule;
+        $parse =~ s/$pattern/$entity/g;
+    }
+
+    # brackets become opening/closing tree tags
+    $parse =~ s/\((\S+) /<tree label=\"$1\"> /g;
+    $parse =~ s/\)/ <\/tree> /g;
+
+    # bracket placeholders: as labels they lose the dashes ...
+    $parse =~ s/\"\-LRB\-\"/\"LRB\"/g;
+    $parse =~ s/\"\-RRB\-\"/\"RRB\"/g;
+    # ... as tokens they turn back into real parentheses
+    $parse =~ s/\-LRB\-/\(/g;
+    $parse =~ s/\-RRB\-/\)/g;
+
+    # whitespace cleanup
+    $parse =~ s/ +/ /g;
+    $parse =~ s/ $//g;
+
+    print $parse;
+}
diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
new file mode 100755
index 000000000..ef6e66024
--- /dev/null
+++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl -w
+
+# Reads <tree label="..."> ... </tree> markup from STDIN, one sentence per
+# line, and writes bracketed trees to STDOUT. Examples of the bracketed form:
+#
+#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
+#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )
+
+use strict;
+
+# Entities turned back into the characters they stand for. The order
+# matters: '&amp;' is restored last so that a de-escaped ampersand cannot
+# combine with following text into one of the other entities.
+my @UNESCAPES = (
+    [ qr/\&bar;/,  '|' ],    # factor separator
+    [ qr/\&lt;/,   '<' ],    # xml
+    [ qr/\&gt;/,   '>' ],    # xml
+    [ qr/\&bra;/,  '[' ],    # syntax non-terminal (legacy)
+    [ qr/\&ket;/,  ']' ],    # syntax non-terminal (legacy)
+    [ qr/\&quot;/, '"' ],    # xml
+    [ qr/\&apos;/, "'" ],    # xml
+    [ qr/\&#91;/,  '[' ],    # syntax non-terminal
+    [ qr/\&#93;/,  ']' ],    # syntax non-terminal
+    [ qr/\&amp;/,  '&' ],    # escape escape
+);
+
+while (defined(my $sentence = <STDIN>)) {
+    # an empty line marks a parse failure and is passed through as such
+    if ($sentence =~ /^$/) {
+        print "\n";
+        next;
+    }
+
+    # parentheses: literal ones in tokens become placeholders ...
+    $sentence =~ s/\(/\-LRB\-/g;
+    $sentence =~ s/\)/\-RRB\-/g;
+    # ... and the bracket labels get their dashes back
+    $sentence =~ s/\"LRB\"/\"\-LRB\-\"/g;
+    $sentence =~ s/\"RRB\"/\"\-RRB\-\"/g;
+
+    # tree tags become brackets; the TOP label is dropped again
+    $sentence =~ s/<tree label=\"([^\"]+)\">/\($1/g;
+    $sentence =~ s/ *<\/tree>/\)/g;
+    $sentence =~ s/^\(TOP/\(/;
+
+    # de-escape
+    foreach my $rule (@UNESCAPES) {
+        my ($entity, $char) = @$rule;
+        $sentence =~ s/$entity/$char/g;
+    }
+
+    # cleanup: squeeze spaces, then set off the final closing bracket
+    $sentence =~ s/ +/ /g;
+    $sentence =~ s/ $//g;
+    $sentence =~ s/\)$/ \)/g;
+
+    print $sentence;
+}
diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl
new file mode 100755
index 000000000..6482d11f3
--- /dev/null
+++ b/scripts/training/wrappers/parse-de-berkeley.perl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+
+# Wrapper around a parser jar: reads sentences from STDIN and writes tree
+# markup to STDOUT.
+#
+# Steps:
+#   1. STDIN is piped through deescape-special-chars.perl into a temp file;
+#      parentheses are replaced by *LRB* / *RRB* on the way and, with
+#      -split-hyphen, " @-@ " is joined back into "-".
+#   2. The temp file is fed to "java -jar <jar> -gr <grammar>", whose output
+#      goes through berkeleyparsed2mosesxml.perl and, with -split-hyphen,
+#      through syntax-hyphen-splitting.perl.
+#
+# Options: -jar FILE (required), -gr FILE (required),
+#          -split-hyphen, -mark-split, -binarize
+
+use strict;
+use Getopt::Long "GetOptions";
+use FindBin qw($Bin);
+
+my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$MARK_SPLIT,$BINARIZE);
+
+die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n")
+    unless GetOptions
+        ('jar=s' => \$JAR,
+         'gr=s' => \$GRAMMAR,
+         'split-hyphen' => \$SPLIT_HYPHEN,
+         'mark-split' => \$MARK_SPLIT,
+         'binarize' => \$BINARIZE)
+    && defined($JAR) && defined($GRAMMAR);
+
+die("ERROR: could not find jar file '$JAR'\n") unless -e $JAR;
+die("ERROR: could not find grammar file '$GRAMMAR'\n") unless -e $GRAMMAR;
+
+# From here on the option variables hold command line fragments:
+# $BINARIZE is passed to both java and the hyphen splitter, and
+# $SPLIT_HYPHEN is the optional extra pipeline stage (empty if not requested).
+$BINARIZE = $BINARIZE ? "-binarize" : "";
+$SPLIT_HYPHEN = $SPLIT_HYPHEN ? "| $Bin/syntax-hyphen-splitting.perl $BINARIZE" : "";
+$SPLIT_HYPHEN .= " -mark-split" if $SPLIT_HYPHEN && $MARK_SPLIT;
+
+my $tmp = "/tmp/parse-de-berkeley.$$";
+
+# Remove the temp file before giving up, so failed runs leave nothing behind.
+sub fail {
+    my ($message) = @_;
+    unlink($tmp);
+    die($message);
+}
+
+# step 1: preprocess the input into the temp file
+open(my $tmp_fh, '|-', "$Bin/../../tokenizer/deescape-special-chars.perl > $tmp")
+    or fail("ERROR: could not run deescape-special-chars.perl: $!\n");
+while(my $line = <STDIN>) {
+    # unsplit hyphens
+    $line =~ s/ \@-\@ /-/g if $SPLIT_HYPHEN;
+
+    # handle parentheses
+    $line =~ s/\(/*LRB*/g;
+    $line =~ s/\)/*RRB*/g;
+
+    print $tmp_fh $line;
+}
+# closing a pipe waits for the child and reports its exit status
+close($tmp_fh)
+    or fail("ERROR: deescape-special-chars.perl failed (status $?)\n");
+
+# step 2: parse and convert
+my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $Bin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN";
+print STDERR $cmd."\n";
+
+open(my $parse_fh, '-|', $cmd)
+    or fail("ERROR: could not run parser pipeline: $!\n");
+while(my $line = <$parse_fh>) {
+    print $line;
+}
+# the status seen here is that of the shell running the pipeline,
+# i.e. of its last stage
+close($parse_fh)
+    or fail("ERROR: parser pipeline failed (status $?)\n");
+
+unlink($tmp) or warn("WARNING: could not remove '$tmp': $!\n");
diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl
new file mode 100755
index 000000000..69290e51d
--- /dev/null
+++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+
+# Splits hyphenated words inside tree markup read from STDIN.
+#
+# A word such as "a-b" that directly follows a tag carrying label="POS" is
+# replaced by one <tree> element per part: the parts keep the label POS and
+# each hyphen becomes the token "@-@" with label HYP.
+#
+# Options:
+#   -mark-split  prefix the label of the enclosing tag (and of the nodes
+#                added by -binarize) with "HYP-"
+#   -binarize    wrap the parts in nested "@POS" nodes so that every node
+#                has at most two children
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my $MARK_HYP = 0;
+my $BINARIZE = 0;
+
+die("ERROR: syntax is: syntax-hyphen-splitting.perl [-binarize] [-mark-split] < in > out\n")
+    unless GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP);
+
+# A hyphen between two alphanumeric characters. Lookarounds are used so that
+# the neighbouring characters are not consumed: with capturing groups the
+# middle character of "a-b-c" is eaten by the first match and the second
+# hyphen is never split.
+my $HYPHEN = qr/(?<=[\p{IsAlnum}])\-(?=[\p{IsAlnum}])/;
+
+while(my $line = <STDIN>) {
+    # chomp, not chop: a last line without newline must keep its last character
+    chomp($line);
+    my @OUT = ();
+    foreach my $token (split(' ', $line)) {
+        # markup pieces and words without an inner hyphen pass through
+        if ($token =~ /^</ || $token =~ />$/ || $token !~ $HYPHEN) {
+            push @OUT, $token;
+            next;
+        }
+
+        # The label is taken from the item emitted just before the word.
+        # If there is none, leave the word alone instead of using a stale
+        # capture as its label.
+        my $pos;
+        if (@OUT && $OUT[$#OUT] =~ /label=\"([^\"]+)\"/) {
+            $pos = $1;
+        }
+        if (!defined($pos)) {
+            push @OUT, $token;
+            next;
+        }
+
+        (my $spaced = $token) =~ s/$HYPHEN/ \@-\@ /g;
+        my @WORD = split(' ', $spaced);
+
+        if ($MARK_HYP) {
+            $OUT[$#OUT] =~ s/label=\"/label=\"HYP-/;
+        }
+        # open one intermediate node for every part beyond the first two ...
+        if ($BINARIZE) {
+            for(my $i=0;$i<scalar(@WORD)-2;$i++) {
+                push @OUT,"<tree label=\"\@".($MARK_HYP ? "HYP-" : "")."$pos\">";
+            }
+        }
+        for(my $i=0;$i<scalar(@WORD);$i++) {
+            # ... and close one before each of those parts
+            if ($BINARIZE && $i>=2) {
+                push @OUT, "</tree>";
+            }
+            push @OUT,"<tree label=\"".(($WORD[$i] eq "\@-\@") ? "HYP" : $pos)."\"> $WORD[$i] </tree>";
+        }
+    }
+    print join(" ",@OUT)."\n";
+}