#!/usr/bin/env perl use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); my ($EGRET_DIR,$MOSES_DIR,$TREE_CONVERTER,$FOREST,$SPLIT_HYPHEN,$SPLIT_SLASH,$MARK_SPLIT,$BINARIZE,$UNPARSEABLE,$RAW_IN,$RAW_OUT,$EGRET_OPTIONS,$TREE_CONVERTER_OPTIONS); $UNPARSEABLE = 0; die("ERROR: syntax is: parse-en-egret.perl [-forest] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] [-egret-options OPTIONS] [-tree-converter-options OPTIONS] -egret-dir DIR -moses-dir DIR -tree-converter PATH < in > out\n") unless &GetOptions ('egret-dir=s' => \$EGRET_DIR, 'moses-dir=s' => \$MOSES_DIR, 'tree-converter=s' => \$TREE_CONVERTER, 'forest' => \$FOREST, 'split-hyphen' => \$SPLIT_HYPHEN, 'split-slash' => \$SPLIT_SLASH, 'mark-split' => \$MARK_SPLIT, 'binarize' => \$BINARIZE, 'unparseable' => \$UNPARSEABLE, 'raw-in=s' => \$RAW_IN, 'raw-out=s' => \$RAW_OUT, 'egret-options=s' => \$EGRET_OPTIONS, 'tree-converter-options=s' => \$TREE_CONVERTER_OPTIONS ) && defined($EGRET_DIR) && defined($MOSES_DIR) && defined($TREE_CONVERTER); die("ERROR: could not find egret directory: '$EGRET_DIR'\n") unless -d $EGRET_DIR; die("ERROR: could not find moses directory: '$MOSES_DIR'\n") unless -d $MOSES_DIR; die("ERROR: file not found or not executable: '$TREE_CONVERTER'\n") unless -x $TREE_CONVERTER; # Pre-processing. my $tmpEscaped = "/tmp/parse-en-egret.1.$$"; my $tmpDeescaped = "/tmp/parse-en-egret.2.$$"; my $tmpSplitPoints = "/tmp/parse-en-egret.3.$$"; open(ESCAPED, ">>$tmpEscaped"); open(DEESCAPED, "| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmpDeescaped"); open(SPLIT_POINTS, ">>$tmpSplitPoints"); # Unsplit hyphen and slashes and write a file indicating where split points # are required in later post-processing. while() { print ESCAPED $_; my @tokens = split; my $new_token = ""; my $i = 0; # current token index in input sentence my $j = -1; # current token index in output sentence my $s = ""; # output sentence my $t = ""; # split point line while ($i <= $#tokens) { if (defined($SPLIT_HYPHEN) && $i <= $#tokens-1 && $tokens[$i] eq "\@\-\@") { my $pos = length $new_token; $new_token .= "-$tokens[$i+1]"; $t .= "$j,$pos,- "; $i += 2; } elsif (defined($SPLIT_SLASH) && $i <= $#tokens-1 && $tokens[$i] eq "\@\/\@") { my $pos = length $new_token; $new_token .= "/$tokens[$i+1]"; $t .= "$j,$pos,/ "; $i += 2; } else { $s .= "$new_token "; $new_token = $tokens[$i]; $i++; $j++; } } $s .= "$new_token"; $s =~ s/^\s+//; $t =~ s/^\s+//; print DEESCAPED "$s\n"; print SPLIT_POINTS "$t\n"; } close(SPLIT_POINTS); close(DEESCAPED); close(ESCAPED); # Construct the parsing / post-processing pipeline: # Stage 1: Parse (unless the user has provided Egret input via -raw-in option). my $pipeline = ""; if (defined($RAW_IN)) { $pipeline .= "cat \"$RAW_IN\" |"; } else { $pipeline .= "$EGRET_DIR/egret"; $pipeline .= " -lapcfg"; $pipeline .= " -data=$EGRET_DIR/eng_grammar"; $pipeline .= " -printForest" if $FOREST; $pipeline .= " -i=$tmpDeescaped"; $pipeline .= " $EGRET_OPTIONS" if defined($EGRET_OPTIONS); $pipeline .= " |"; } if (defined($RAW_OUT)) { $pipeline .= "tee \"$RAW_OUT\" |"; } # Stage 2: Convert trees to forests (unless we already have forests) unless ($FOREST) { $pipeline .= 'sed \'s/^(//\' |'; # Remove opening ( $pipeline .= 'sed \'s/)$//\' |'; # Remove closing ) $pipeline .= "$TREE_CONVERTER"; $pipeline .= " -input_format penn"; $pipeline .= " -output_format egret"; $pipeline .= " |"; } # Stage 3: Postprocess using Moses' postprocess-egret-forests # This performs some minor transformations to the forest: Moses-style escaping # of special characters; removal of Egret's "^g" suffixes from constituent # labels; and marking of slash/hyphen split points (using @ characters). $pipeline .= "$MOSES_DIR/bin/postprocess-egret-forests"; $pipeline .= " --Escape" if $FOREST; $pipeline .= " --MarkSplitPoints $tmpSplitPoints"; $pipeline .= " |"; # Stage 4: Postprocess using Travatar's tree-converter. # This normalizes the forest weights and performs hyphen / slash splitting (if # requested). The option -tree-converter-options can be used to enable # additional tree-converter transformations (such as binarization). #my $output_format = $FOREST ? "egret" : "mosesxml"; my $output_format = $FOREST ? "egret" : "penn"; $pipeline .= "$TREE_CONVERTER"; $pipeline .= " -input_format egret"; $pipeline .= " -output_format $output_format"; # FIXME Single split option $pipeline .= " -split \@\-\@" if defined($SPLIT_HYPHEN); $pipeline .= " -split \@\/\@" if defined($SPLIT_SLASH); $pipeline .= " $TREE_CONVERTER_OPTIONS" if defined($TREE_CONVERTER_OPTIONS); $pipeline .= " |"; unless ($FOREST) { $pipeline .= 'sed \'s/^()$//\' |'; # Remove empty trees (failed parses) $pipeline .= 'sed \'s/^(/( (/\' |'; # Add Berkeley-style opening ( + blank $pipeline .= 'sed \'s/)$/))/\' |'; # Add Berkeley-style closing ) $pipeline .= 'sed \'s/^$/(())/\' |'; # Restore empty trees (Berkeley-style) $pipeline .= "$RealBin/berkeleyparsed2mosesxml.perl |"; $pipeline .= 'sed \'s/^) { print $_; } } else { open(TMPESCAPED, $tmpEscaped); while () { my $outLine = $_; my $unparsedLine = ; if ($UNPARSEABLE == 1 && length($outLine) == 1) { print $unparsedLine; } else { print $outLine; } } } close(PARSE); `rm $tmpSplitPoints`; `rm $tmpDeescaped`; `rm $tmpEscaped`;