more escaping in tokenizer; wrapper for berkeley parser (german)

author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-30 03:58:18 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-30 03:58:18 +0400
commit: 2e370ed11b0cd8989118891dc4385619837dd39f (patch)
tree: e0b8ab423399453a90bc4cacabc323289620042c /scripts
parent: fd577d7a65cab923b9102d61873a032654d573a1 (diff)
8 files changed, 207 insertions, 28 deletions
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index 55035ae6d..345555990 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -3,13 +3,15 @@
 use strict;
 
 while(<STDIN>) {
-  s/\&bar;/\|/g;
-  s/\&lt;/\</g;
-  s/\&gt;/\>/g;
-  s/\&bra;/\[/g;
-  s/\&ket;/\]/g;
-  s/\&#91;/\[/g;
-  s/\&#93;/\]/g;
-  s/\&amp;/\&/g;
+  s/\&bar;/\|/g;   # factor separator
+  s/\&lt;/\</g;    # xml
+  s/\&gt;/\>/g;    # xml
+  s/\&bra;/\[/g;   # syntax non-terminal (legacy)
+  s/\&ket;/\]/g;   # syntax non-terminal (legacy)
+  s/\&quot;/\"/g;  # xml
+  s/\&apos;/\'/g;  # xml
+  s/\&#91;/\[/g;   # syntax non-terminal
+  s/\&#93;/\]/g;   # syntax non-terminal
+  s/\&amp;/\&/g;   # escape escape
   print $_;
 }
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index e55a1a26e..8233b419c 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -66,14 +66,16 @@ sub detokenize {
 	$text = " $text ";
   $text =~ s/ \@\-\@ /-/g;
   # de-escape special chars
-  $text =~ s/\&bar;/\|/g;
-  $text =~ s/\&lt;/\</g;
-  $text =~ s/\&gt;/\>/g;
-  $text =~ s/\&bra;/\[/g;
-  $text =~ s/\&ket;/\]/g;
-  $text =~ s/\&#91;/\[/g;
-  $text =~ s/\&#93;/\]/g;
-  $text =~ s/\&amp;/\&/g;
+  $text =~ s/\&bar;/\|/g;   # factor separator
+  $text =~ s/\&lt;/\</g;    # xml
+  $text =~ s/\&gt;/\>/g;    # xml
+  $text =~ s/\&bra;/\[/g;   # syntax non-terminal (legacy)
+  $text =~ s/\&ket;/\]/g;   # syntax non-terminal (legacy)
+  $text =~ s/\&quot;/\"/g;  # xml
+  $text =~ s/\&apos;/\'/g;  # xml
+  $text =~ s/\&#91;/\[/g;   # syntax non-terminal
+  $text =~ s/\&#93;/\]/g;   # syntax non-terminal
+  $text =~ s/\&amp;/\&/g;   # escape escape
 
 	my $word;
 	my $i;
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index f4c1b4dd5..5d9690c04 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -12,12 +12,14 @@ while(<STDIN>) {
 	s/ $//g;
 
   # special characters in moses
-  s/\&/\&amp;/g;
-  s/\|/\&bar;/g;
-  s/\</\&lt;/g;
-  s/\>/\&gt;/g;
-  s/\[/\&#91;/g;
-  s/\]/\&#93;/g;
+  s/\&/\&amp;/g;   # escape escape
+  s/\|/\&bar;/g;   # factor separator
+  s/\</\&lt;/g;    # xml
+  s/\>/\&gt;/g;    # xml
+  s/\'/\&apos;/g;  # xml
+  s/\"/\&quot;/g;  # xml
+  s/\[/\&#91;/g;   # syntax non-terminal
+  s/\]/\&#93;/g;   # syntax non-terminal
   
   # restore xml instructions
   s/\&lt;(\S+) translation="([^\"]+)"&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 70bb318f7..0cb713740 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -149,12 +149,14 @@ sub tokenize {
 	$text =~ s/DOTMULTI/./g;
 
   #escape special chars
-  $text =~ s/\&/\&amp;/g;
-  $text =~ s/\|/\&bar;/g;
-  $text =~ s/\</\&lt;/g;
-  $text =~ s/\>/\&gt;/g;
-  $text =~ s/\[/\&#91;/g;
-  $text =~ s/\]/\&#93;/g;
+  $text =~ s/\&/\&amp;/g;   # escape escape
+  $text =~ s/\|/\&bar;/g;   # factor separator
+  $text =~ s/\</\&lt;/g;    # xml
+  $text =~ s/\>/\&gt;/g;    # xml
+  $text =~ s/\'/\&apos;/g;  # xml
+  $text =~ s/\"/\&quot;/g;  # xml
+  $text =~ s/\[/\&#91;/g;   # syntax non-terminal
+  $text =~ s/\]/\&#93;/g;   # syntax non-terminal
 
 	#ensure final line break
 	$text .= "\n" unless $text =~ /\n$/;
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
new file mode 100755
index 000000000..6a4ed731e
--- /dev/null
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -w
+# Filter: reads bracketed parse trees, one per line, on STDIN and prints each as a line of <tree label="..."> XML.
+use strict;
+
+while(<STDIN>) {
+  if (/^\(\(\)\)/) {
+    print "\n"; # parse failures: a line starting with "(())" becomes an empty output line
+    next;
+  }
+
+  # prep: give the outermost, unlabeled bracket "( " the label TOP
+  s/^\( /\(TOP /;
+
+  # escape words (applied to the whole line, before any tag is generated, so the tags' own <, > and " stay literal)
+  s/\&/\&amp;/g;   # escape escape (runs first, so the & of the entities written below is not escaped again)
+  s/\|/\&bar;/g;   # factor separator
+  s/\</\&lt;/g;    # xml
+  s/\>/\&gt;/g;    # xml
+  s/\'/\&apos;/g;  # xml
+  s/\"/\&quot;/g;  # xml
+  s/\[/\&#91;/g;   # syntax non-terminal
+  s/\]/\&#93;/g;   # syntax non-terminal
+  
+  # convert into tree
+  s/\((\S+) /<tree label=\"$1\"> /g;   # "(LABEL " opens a node
+  s/\)/ <\/tree> /g;                   # every ")" closes a node
+  s/\"\-LRB\-\"/\"LRB\"/g; # labels: a quoted -LRB-/-RRB- can only be a label here, since input quotes were escaped above
+  s/\"\-RRB\-\"/\"RRB\"/g;
+  s/\-LRB\-/\(/g; # tokens: any remaining -LRB-/-RRB- is turned back into a parenthesis
+  s/\-RRB\-/\)/g;
+  s/ +/ /g;   # collapse runs of spaces
+  s/ $//g;    # drop a space left at the end of the line
+
+  # output (no word replacement happens here: the line is printed as converted above)
+  print $_;
+}
diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
new file mode 100755
index 000000000..ef6e66024
--- /dev/null
+++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl -w
+# Filter: reads lines of <tree label="..."> XML on STDIN and prints each as a bracketed parse tree.
+use strict;
+# Sample trees in the bracketed format (outer bracket unlabeled, space before the final ")"):
+#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
+#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )
+
+while(<STDIN>) {
+  if (/^$/) {
+    print "\n"; # parse failures: an empty input line is passed through as an empty line
+    next;
+  }
+
+  # parentheses (handled first: the structure is still XML here, so every literal parenthesis belongs to a word)
+  s/\(/\-LRB\-/g; # tokens
+  s/\)/\-RRB\-/g;
+  s/\"LRB\"/\"\-LRB\-\"/g; # labels
+  s/\"RRB\"/\"\-RRB\-\"/g;
+
+  # main: turn the XML tags into brackets
+  s/<tree label=\"([^\"]+)\">/\($1/g;   # opening tag becomes "(LABEL"
+  s/ *<\/tree>/\)/g;                    # closing tag, with any spaces before it, becomes ")"
+  s/^\(TOP/\(/;                         # the root loses its TOP label
+
+  # de-escape
+  s/\&bar;/\|/g;   # factor separator
+  s/\&lt;/\</g;    # xml
+  s/\&gt;/\>/g;    # xml
+  s/\&bra;/\[/g;   # syntax non-terminal (legacy)
+  s/\&ket;/\]/g;   # syntax non-terminal (legacy)
+  s/\&quot;/\"/g;  # xml
+  s/\&apos;/\'/g;  # xml
+  s/\&#91;/\[/g;   # syntax non-terminal
+  s/\&#93;/\]/g;   # syntax non-terminal
+  s/\&amp;/\&/g;   # escape escape (runs last, so an escaped "&amp;lt;" ends up as "&lt;" and not "<")
+
+  # cleanup
+  s/ +/ /g;      # collapse runs of spaces
+  s/ $//g;       # drop a space left at the end of the line
+  s/\)$/ \)/g;   # put a space before the final closing bracket
+
+  # output
+  print $_;
+}
diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl
new file mode 100755
index 000000000..6482d11f3
--- /dev/null
+++ b/scripts/training/wrappers/parse-de-berkeley.perl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+# Wrapper: runs the Berkeley parser (German, per the script name) on text from STDIN and prints <tree> XML on STDOUT.
+use strict;
+use Getopt::Long "GetOptions";
+use FindBin qw($Bin);
+
+my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$MARK_SPLIT,$BINARIZE);
+
+die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n") 
+  unless &GetOptions
+  ('jar=s' => \$JAR,
+   'gr=s' => \$GRAMMAR,
+   'split-hyphen' => \$SPLIT_HYPHEN,
+   'mark-split' => \$MARK_SPLIT,
+   'binarize' => \$BINARIZE)
+  && defined($JAR) && defined($GRAMMAR);
+
+die("ERROR: could not find jar file '$JAR'\n") unless -e $JAR;
+die("ERROR: could not find grammar file '$GRAMMAR'\n") unless -e $GRAMMAR;
+
+$BINARIZE = $BINARIZE ? "-binarize" : "";   # from here on: the literal switch handed to the parser and to the splitter
+$SPLIT_HYPHEN = $SPLIT_HYPHEN ? "| $Bin/syntax-hyphen-splitting.perl $BINARIZE" : "";   # from here on: optional extra pipeline stage
+$SPLIT_HYPHEN .= " -mark-split" if $SPLIT_HYPHEN && $MARK_SPLIT;
+
+my $tmp = "/tmp/parse-de-berkeley.$$";   # scratch file for the parser input, named after this process id
+
+open(TMP,"| $Bin/../../tokenizer/deescape-special-chars.perl > $tmp") or die("ERROR: could not start deescape-special-chars.perl: $!\n");
+while(<STDIN>) {
+  # unsplit hyphens (they are split again after parsing by the syntax-hyphen-splitting.perl stage)
+  s/ \@-\@ /-/g if $SPLIT_HYPHEN;
+
+  # handle parentheses; NOTE(review): berkeleyparsed2mosesxml.perl maps -LRB-/-RRB- back, not *LRB*/*RRB* - confirm the parser rewrites these
+  s/\(/*LRB*/g;
+  s/\)/*RRB*/g;
+
+  print TMP $_;
+}
+close(TMP) or die("ERROR: deescape-special-chars.perl failed, '$tmp' is incomplete (status $?)\n");   # a failed child only shows up here
+
+my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $Bin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN";
+print STDERR $cmd."\n";
+
+open(PARSE,"$cmd|") or die("ERROR: could not start parser pipeline: $!\n");
+while(<PARSE>) {
+  print $_;
+}
+close(PARSE);
+unlink($tmp) or warn("WARNING: could not remove temporary file '$tmp': $!\n");   # no shell needed to delete one file
diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl
new file mode 100755
index 000000000..69290e51d
--- /dev/null
+++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+# Filter: splits hyphenated words in lines of <tree label="..."> XML (STDIN to STDOUT) into parts joined by "@-@" nodes.
+use strict;
+use Getopt::Long "GetOptions";
+
+my $MARK_HYP = 0;   # -mark-split: prefix the label of the node around a split word (and of -binarize nodes) with "HYP-"
+my $BINARIZE = 0;   # -binarize: group the parts of a split word under extra nodes labelled "@" plus the word's label
+
+die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP);
+
+while(<STDIN>) {
+  chomp;   # strips only a line terminator; chop would eat a real character from a final line that has no newline
+  my @OUT = ();
+  foreach (split) {
+    if (/^</ || />$/) {   # starts with "<" or ends with ">": treated as (part of) a tag and copied unchanged
+      push @OUT, $_;
+    }
+    elsif(/([\p{IsAlnum}])\-([\p{IsAlnum}])/) {   # word with a hyphen between two alphanumeric characters
+      s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+      my @WORD = split;   # the parts of the word, with "@-@" items between them
+      $OUT[$#OUT] =~ /label=\"([^\"]+)\"/;   # assumes the previous item is the word's <tree label="..."> tag; otherwise $1 keeps the capture from the substitution above
+      my $pos = $1;
+      if ($MARK_HYP) {
+        $OUT[$#OUT] =~ s/label=\"/label=\"HYP-/;
+      }
+      if ($BINARIZE) {
+        for(my $i=0;$i<scalar(@WORD)-2;$i++) {   # one intermediate node per item beyond the first two
+          push @OUT,"<tree label=\"\@".($MARK_HYP ? "HYP-" : "")."$pos\">";
+        }
+      }
+      for(my $i=0;$i<scalar(@WORD);$i++) {
+        if ($BINARIZE && $i>=2) {
+          push @OUT, "</tree>";   # close one intermediate node before each item from the third on
+        }
+        push @OUT,"<tree label=\"".(($WORD[$i] eq "\@-\@") ? "HYP" : $pos)."\"> $WORD[$i] </tree>";   # each item gets its own leaf node
+      }
+    }
+    else {
+      push @OUT, $_;
+    }
+  }
+  print join(" ",@OUT)."\n";
+}
author	phikoehn <pkoehn@inf.ed.ac.uk>	2012-05-30 03:58:18 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2012-05-30 03:58:18 +0400
commit	2e370ed11b0cd8989118891dc4385619837dd39f (patch)
tree	e0b8ab423399453a90bc4cacabc323289620042c /scripts
parent	fd577d7a65cab923b9102d61873a032654d573a1 (diff)