Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk>, 2012-05-30 03:58:18 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk>, 2012-05-30 03:58:18 +0400
commit: 2e370ed11b0cd8989118891dc4385619837dd39f (patch)
tree: e0b8ab423399453a90bc4cacabc323289620042c /scripts
parent: fd577d7a65cab923b9102d61873a032654d573a1 (diff)
more escaping in tokenizer; wrapper for berkeley parser (german)
Diffstat (limited to 'scripts')
-rwxr-xr-x  scripts/tokenizer/deescape-special-chars.perl  18
-rwxr-xr-x  scripts/tokenizer/detokenizer.perl  18
-rwxr-xr-x  scripts/tokenizer/escape-special-chars.perl  14
-rwxr-xr-x  scripts/tokenizer/tokenizer.perl  14
-rwxr-xr-x  scripts/training/wrappers/berkeleyparsed2mosesxml.perl  36
-rwxr-xr-x  scripts/training/wrappers/mosesxml2berkeleyparsed.perl  44
-rwxr-xr-x  scripts/training/wrappers/parse-de-berkeley.perl  48
-rwxr-xr-x  scripts/training/wrappers/syntax-hyphen-splitting.perl  43
8 files changed, 207 insertions, 28 deletions
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index 55035ae6d..345555990 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -3,13 +3,15 @@
use strict;
while(<STDIN>) { # filter: every line read from STDIN is de-escaped and printed
- s/\&bar;/\|/g;
- s/\&lt;/\</g;
- s/\&gt;/\>/g;
- s/\&bra;/\[/g;
- s/\&ket;/\]/g;
- s/\&#91;/\[/g;
- s/\&#93;/\]/g;
- s/\&amp;/\&/g;
+ s/\&bar;/\|/g; # factor separator (vertical bar)
+ s/\&lt;/\</g; # xml
+ s/\&gt;/\>/g; # xml
+ s/\&bra;/\[/g; # syntax non-terminal (legacy entity name, still accepted on input)
+ s/\&ket;/\]/g; # syntax non-terminal (legacy entity name, still accepted on input)
+ s/\&quot;/\"/g; # xml
+ s/\&apos;/\'/g; # xml
+ s/\&#91;/\[/g; # syntax non-terminal
+ s/\&#93;/\]/g; # syntax non-terminal
+ s/\&amp;/\&/g; # escape escape: done last, so an "&" restored here is never read as the start of another entity
print $_;
}
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index e55a1a26e..8233b419c 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -66,14 +66,16 @@ sub detokenize {
$text = " $text ";
$text =~ s/ \@\-\@ /-/g;
# de-escape special chars
- $text =~ s/\&bar;/\|/g;
- $text =~ s/\&lt;/\</g;
- $text =~ s/\&gt;/\>/g;
- $text =~ s/\&bra;/\[/g;
- $text =~ s/\&ket;/\]/g;
- $text =~ s/\&#91;/\[/g;
- $text =~ s/\&#93;/\]/g;
- $text =~ s/\&amp;/\&/g;
+ $text =~ s/\&bar;/\|/g; # factor separator
+ $text =~ s/\&lt;/\</g; # xml
+ $text =~ s/\&gt;/\>/g; # xml
+ $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy)
+ $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy)
+ $text =~ s/\&quot;/\"/g; # xml
+ $text =~ s/\&apos;/\'/g; # xml
+ $text =~ s/\&#91;/\[/g; # syntax non-terminal
+ $text =~ s/\&#93;/\]/g; # syntax non-terminal
+ $text =~ s/\&amp;/\&/g; # escape escape
my $word;
my $i;
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index f4c1b4dd5..5d9690c04 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -12,12 +12,14 @@ while(<STDIN>) {
s/ $//g;
# special characters in moses
- s/\&/\&amp;/g;
- s/\|/\&bar;/g;
- s/\</\&lt;/g;
- s/\>/\&gt;/g;
- s/\[/\&#91;/g;
- s/\]/\&#93;/g;
+ s/\&/\&amp;/g; # escape escape
+ s/\|/\&bar;/g; # factor separator
+ s/\</\&lt;/g; # xml
+ s/\>/\&gt;/g; # xml
+ s/\'/\&apos;/g; # xml
+ s/\"/\&quot;/g; # xml
+ s/\[/\&#91;/g; # syntax non-terminal
+ s/\]/\&#93;/g; # syntax non-terminal
# restore xml instructions
s/\&lt;(\S+) translation="([^\"]+)"&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 70bb318f7..0cb713740 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -149,12 +149,14 @@ sub tokenize {
$text =~ s/DOTMULTI/./g;
#escape special chars
- $text =~ s/\&/\&amp;/g;
- $text =~ s/\|/\&bar;/g;
- $text =~ s/\</\&lt;/g;
- $text =~ s/\>/\&gt;/g;
- $text =~ s/\[/\&#91;/g;
- $text =~ s/\]/\&#93;/g;
+ $text =~ s/\&/\&amp;/g; # escape escape
+ $text =~ s/\|/\&bar;/g; # factor separator
+ $text =~ s/\</\&lt;/g; # xml
+ $text =~ s/\>/\&gt;/g; # xml
+ $text =~ s/\'/\&apos;/g; # xml
+ $text =~ s/\"/\&quot;/g; # xml
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
new file mode 100755
index 000000000..6a4ed731e
--- /dev/null
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
@@ -0,0 +1,51 @@
+#!/usr/bin/perl -w
+
+# Convert Berkeley parser output into Moses XML trees.
+#
+# Reads one bracketed parse per line from STDIN and writes the same tree as
+# <tree label="..."> ... </tree> markup to STDOUT. A line starting with
+# "(())" (parse failure) becomes an empty line.
+
+use strict;
+
+# characters that are special to Moses and the entity each is replaced with
+my %ENTITY_FOR = (
+    '&' => '&amp;',     # escape escape
+    '|' => '&bar;',     # factor separator
+    '<' => '&lt;',      # xml
+    '>' => '&gt;',      # xml
+    "'" => '&apos;',    # xml
+    '"' => '&quot;',    # xml
+    '[' => '&#91;',     # syntax non-terminal
+    ']' => '&#93;',     # syntax non-terminal
+);
+
+while (my $line = <STDIN>) {
+    if ($line =~ /^\(\(\)\)/) {
+        print "\n"; # parse failures
+        next;
+    }
+
+    # prep: give the unlabelled outermost bracket the label TOP
+    $line =~ s/^\( /\(TOP /;
+
+    # escape words: one pass over the line, so the "&" of an entity that was
+    # just inserted is never escaped a second time
+    $line =~ s/([&|<>'"\[\]])/$ENTITY_FOR{$1}/g;
+
+    # convert into tree: "(LABEL " opens an element, ")" closes it
+    $line =~ s/\((\S+) /<tree label=\"$1\"> /g;
+    $line =~ s/\)/ <\/tree> /g;
+
+    # bracket placeholders: plain names as labels, real parentheses as tokens
+    $line =~ s/\"\-LRB\-\"/\"LRB\"/g;
+    $line =~ s/\"\-RRB\-\"/\"RRB\"/g;
+    $line =~ s/\-LRB\-/\(/g;
+    $line =~ s/\-RRB\-/\)/g;
+
+    # whitespace cleanup
+    $line =~ s/ +/ /g;
+    $line =~ s/ $//g;
+
+    print $line;
+}
diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
new file mode 100755
index 000000000..ef6e66024
--- /dev/null
+++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl -w
+
+# Convert Moses XML trees back into the Berkeley parser's bracketed format.
+#
+# Reads one <tree label="..."> ... </tree> sentence per line from STDIN and
+# writes the bracketed tree to STDOUT; an empty line stays an empty line.
+#
+# Examples of the target format:
+#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
+#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )
+
+use strict;
+
+# entity => character, applied in this order; "&amp;" stays last so that a
+# restored "&" cannot be taken for the start of another entity
+my @DEESCAPE = (
+    ['&bar;',  '|'],    # factor separator
+    ['&lt;',   '<'],    # xml
+    ['&gt;',   '>'],    # xml
+    ['&bra;',  '['],    # syntax non-terminal (legacy)
+    ['&ket;',  ']'],    # syntax non-terminal (legacy)
+    ['&quot;', '"'],    # xml
+    ['&apos;', "'"],    # xml
+    ['&#91;',  '['],    # syntax non-terminal
+    ['&#93;',  ']'],    # syntax non-terminal
+    ['&amp;',  '&'],    # escape escape
+);
+
+while (my $line = <STDIN>) {
+    if ($line =~ /^$/) {
+        print "\n"; # parse failures
+        next;
+    }
+
+    # parentheses: hide them before brackets are used for the tree structure
+    $line =~ s/\(/\-LRB\-/g; # tokens
+    $line =~ s/\)/\-RRB\-/g;
+    $line =~ s/\"LRB\"/\"\-LRB\-\"/g; # labels
+    $line =~ s/\"RRB\"/\"\-RRB\-\"/g;
+
+    # main: elements become brackets, the TOP label is dropped again
+    $line =~ s/<tree label=\"([^\"]+)\">/\($1/g;
+    $line =~ s/ *<\/tree>/\)/g;
+    $line =~ s/^\(TOP/\(/;
+
+    # de-escape
+    foreach my $pair (@DEESCAPE) {
+        my ($entity, $char) = @$pair;
+        $line =~ s/\Q$entity\E/$char/g;
+    }
+
+    # cleanup: single spaces, and a space before the final closing bracket
+    $line =~ s/ +/ /g;
+    $line =~ s/ $//g;
+    $line =~ s/\)$/ \)/g;
+
+    # output
+    print $line;
+}
diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl
new file mode 100755
index 000000000..6482d11f3
--- /dev/null
+++ b/scripts/training/wrappers/parse-de-berkeley.perl
@@ -0,0 +1,87 @@
+#!/usr/bin/perl -w
+
+# Parse text with the Berkeley parser and print the result as Moses XML trees.
+#
+# STDIN is de-escaped (deescape-special-chars.perl) into a temporary file,
+# which is fed to the parser jar; the parser output is converted by
+# berkeleyparsed2mosesxml.perl and written to STDOUT.
+#
+#   -jar FILE      parser jar (required)
+#   -gr FILE       grammar file (required)
+#   -binarize      passed to the parser and to the hyphen splitter
+#   -split-hyphen  join " @-@ " back into "-" before parsing and pipe the
+#                  trees through syntax-hyphen-splitting.perl afterwards
+#   -mark-split    passed to the hyphen splitter (needs -split-hyphen)
+
+use strict;
+use Getopt::Long "GetOptions";
+use FindBin qw($Bin);
+use File::Temp ();
+
+my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$MARK_SPLIT,$BINARIZE);
+
+die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n")
+    unless GetOptions(
+        'jar=s' => \$JAR,
+        'gr=s' => \$GRAMMAR,
+        'split-hyphen' => \$SPLIT_HYPHEN,
+        'mark-split' => \$MARK_SPLIT,
+        'binarize' => \$BINARIZE)
+    && defined($JAR) && defined($GRAMMAR);
+
+die("ERROR: could not find jar file '$JAR'\n") unless -e $JAR;
+die("ERROR: could not find grammar file '$GRAMMAR'\n") unless -e $GRAMMAR;
+
+# Quote a string for the shell, so paths with spaces or metacharacters survive.
+sub shell_quote {
+    my ($text) = @_;
+    $text =~ s/'/'\\''/g;
+    return "'$text'";
+}
+
+# Close a pipe and die if the command behind it failed (for a shell pipeline
+# the reported status is the one the shell returns for the whole pipeline).
+sub close_pipe {
+    my ($handle, $what) = @_;
+    return if close($handle);
+    die("ERROR: $what failed: ".($! ? $! : "exit status ".($? >> 8))."\n");
+}
+
+my $binarize_flag = $BINARIZE ? "-binarize" : "";
+
+# optional last pipeline stage that splits hyphenated words again
+my $split_stage = "";
+if ($SPLIT_HYPHEN) {
+    $split_stage = "| ".shell_quote("$Bin/syntax-hyphen-splitting.perl")." $binarize_flag";
+    $split_stage .= " -mark-split" if $MARK_SPLIT;
+}
+
+# File::Temp creates the file itself under an unpredictable name and removes
+# it again when the script ends, including after a die
+my $tmp_file = File::Temp->new(TEMPLATE => "parse-de-berkeley.XXXXXX", TMPDIR => 1);
+my $tmp = $tmp_file->filename;
+
+my $deescape_cmd = shell_quote("$Bin/../../tokenizer/deescape-special-chars.perl")." > ".shell_quote($tmp);
+open(my $deescape, "|-", $deescape_cmd)
+    or die("ERROR: could not run $deescape_cmd: $!\n");
+while(my $line = <STDIN>) {
+    # unsplit hyphens
+    $line =~ s/ \@-\@ /-/g if $SPLIT_HYPHEN;
+
+    # handle parentheses
+    $line =~ s/\(/*LRB*/g;
+    $line =~ s/\)/*RRB*/g;
+
+    print $deescape $line;
+}
+close_pipe($deescape, "de-escaping");
+
+my $cmd = "cat ".shell_quote($tmp)." | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar ".shell_quote($JAR)." -gr ".shell_quote($GRAMMAR)." -maxLength 1000 $binarize_flag | ".shell_quote("$Bin/berkeleyparsed2mosesxml.perl")." $split_stage";
+print STDERR $cmd."\n";
+
+open(my $parse, "-|", $cmd)
+    or die("ERROR: could not run parser pipeline: $!\n");
+while(my $line = <$parse>) {
+    print $line;
+}
+close_pipe($parse, "parser pipeline");
diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl
new file mode 100755
index 000000000..69290e51d
--- /dev/null
+++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl -w
+
+# Split hyphenated words inside Moses XML trees (STDIN to STDOUT).
+#
+# A word "a-b" that directly follows a <tree label="POS"> tag is replaced by
+#   <tree label="POS"> a </tree> <tree label="HYP"> @-@ </tree> <tree label="POS"> b </tree>
+#
+#   -binarize    wrap the parts in additional <tree label="@POS"> nodes
+#   -mark-split  prefix the label of the preceding tag and of the added
+#                @ nodes with "HYP-"
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my $MARK_HYP = 0;
+my $BINARIZE = 0;
+
+die("ERROR: syntax is: syntax-hyphen-splitting.perl [-binarize] [-mark-split] < in > out\n")
+    unless GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP);
+
+# NOTE(review): no input layer is set on STDIN, so unless Perl is started with
+# -C / PERL_UNICODE the text is matched byte by byte and \p{IsAlnum} will
+# presumably miss multi-byte UTF-8 letters next to a hyphen - confirm encoding.
+
+while(my $line = <STDIN>) {
+    # chomp, not chop: an unterminated last line must keep its final character
+    chomp($line);
+    my @OUT = ();
+    foreach my $token (split(' ',$line)) {
+        # tags, and words without an inner hyphen, pass through unchanged
+        if ($token =~ /^</ || $token =~ />$/ || $token !~ /[\p{IsAlnum}]\-[\p{IsAlnum}]/) {
+            push @OUT, $token;
+            next;
+        }
+
+        # the label of the tag right before the word names its parts; without
+        # such a tag there is nothing to build on, so the word is left alone
+        my ($pos) = @OUT ? $OUT[-1] =~ /label=\"([^\"]+)\"/ : ();
+        if (!defined($pos)) {
+            push @OUT, $token;
+            next;
+        }
+        $OUT[-1] =~ s/label=\"/label=\"HYP-/ if $MARK_HYP;
+
+        # lookarounds do not consume the neighbouring characters, so every
+        # hyphen is split, also around one-letter parts as in "a-b-c"
+        $token =~ s/(?<=[\p{IsAlnum}])\-(?=[\p{IsAlnum}])/ \@-\@ /g;
+        my @WORD = split(' ',$token);
+
+        if ($BINARIZE) {
+            # one opening @ node per element beyond the first two
+            my $inner = "\@".($MARK_HYP ? "HYP-" : "").$pos;
+            push @OUT, ("<tree label=\"$inner\">") x (scalar(@WORD)-2);
+        }
+        for my $i (0 .. $#WORD) {
+            # from the third element on, an @ node is closed before each element
+            push @OUT, "</tree>" if $BINARIZE && $i >= 2;
+            my $label = ($WORD[$i] eq "\@-\@") ? "HYP" : $pos;
+            push @OUT, "<tree label=\"$label\"> $WORD[$i] </tree>";
+        }
+    }
+    print join(" ",@OUT)."\n";
+}