#!/usr/bin/env perl

# analysis.perl -- analyze decoder output against a reference: n-gram
# precision/recall, per-sentence BLEU annotation, phrase-table and corpus
# coverage, segmentation reports, and search-graph extraction.
# NOTE(review): this file was recovered from a garbled (line-collapsed,
# angle-bracket-stripped) copy; reconstructed passages are marked below.

use warnings;
use strict;
use Getopt::Long "GetOptions";

my $MAX_LENGTH = 4;   # maximum n-gram length considered (raised to 7 for coverage)

my ($system,$system_alignment,$segmentation,$reference,$dir,$input,$corpus,
    $ttable,@FACTORED_TTABLE,$score_options,$hierarchical,$output_corpus,
    $alignment,$biconcor,$input_factors,$input_factor_names,
    $output_factor_names,$precision_by_coverage,$precision_by_coverage_factor,
    $coverage_dir,$search_graph);
if (!&GetOptions('system=s' => \$system, # raw output from decoder
                 'system-alignment=s' => \$system_alignment, # word alignment of system output
                 'reference=s' => \$reference, # tokenized reference
                 'dir=s' => \$dir, # directory for storing results
                 'input-factors=i' => \$input_factors, # list of input factors
                 'input-factor-names=s' => \$input_factor_names,
                 'output-factor-names=s' => \$output_factor_names,
                 'precision-by-coverage' => \$precision_by_coverage, # added report for input words
                 'precision-by-coverage-factor=i' => \$precision_by_coverage_factor, # sub-reports
                 'input=s' => \$input, # tokenized input (as for decoder)
                 'segmentation=s' => \$segmentation, # system output with segmentation markup
                 'input-corpus=s' => \$corpus, # input side of parallel training corpus
                 'ttable=s' => \$ttable, # phrase translation table used for decoding
                 'factored-ttable=s' => \@FACTORED_TTABLE, # factored phrase translation table
                 'score-options=s' => \$score_options, # score options to detect p(e|f) score
                 'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
                 'alignment-file=s' => \$alignment, # alignment of parallel corpus
                 'coverage=s' => \$coverage_dir, # already computed coverage, stored in this dir
                 'biconcor=s' => \$biconcor, # binary for bilingual concordancer
                 'search-graph=s' => \$search_graph, # visualization of search graph
                 'hierarchical' => \$hierarchical) || # hierarchical model?
    !defined($dir)) {
    die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-score-options SETTINGS] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
}

`mkdir -p $dir`;

# factor names
if (defined($input_factor_names) && defined($output_factor_names)) {
    open(FACTOR,">$dir/factor-names") or die "Cannot open: $!";
    print FACTOR $input_factor_names."\n";
    print FACTOR $output_factor_names."\n";
    close(FACTOR);
}

# compare system output against reference translation
my(@SYSTEM,@REFERENCE);
my (%PRECISION_CORRECT,%PRECISION_TOTAL,
    %RECALL_CORRECT,%RECALL_TOTAL);
if (defined($system) || defined($reference)) {
    die("you need to you specify both system and reference, not just either")
        unless defined($system) && defined($reference);
    die("can't open system file $system") if ! -e $system;
    @SYSTEM = `cat $system`;
    chop(@SYSTEM);

    # multiple references are stored as $reference.ref0, .ref1, ...
    if (! -e $reference && -e $reference.".ref0") {
        for(my $i=0;-e $reference.".ref".$i;$i++) {
            my @REF = `cat $reference.ref$i`;
            chop(@REF);
            # NOTE(review): inner loop reconstructed from garbled source --
            # collect the i-th reference for each sentence
            for(my $j=0;$j<scalar(@REF);$j++) {
                push @{$REFERENCE[$j]}, $REF[$j];
            }
        }
    }
    else {
        @REFERENCE = `cat $reference`;
        chop(@REFERENCE);
    }

    # NOTE(review): driver loop reconstructed from garbled source --
    # tally n-gram precision (system vs. ref) and recall (ref vs. system)
    for(my $i=0;$i<scalar(@SYSTEM);$i++) {
        &add_match($SYSTEM[$i],$REFERENCE[$i],
                   \%PRECISION_CORRECT,\%PRECISION_TOTAL);
        &add_match($REFERENCE[$i],$SYSTEM[$i],
                   \%RECALL_CORRECT,\%RECALL_TOTAL);
    }

    open(SUMMARY,">$dir/summary") or die "Cannot open: $!";
    &best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
    &best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
    &bleu_annotation();
    close(SUMMARY);
}

# segmentation
if (defined($segmentation)) {
    if (defined($hierarchical)) {
        &hierarchical_segmentation();
    }
    else {
        &segmentation();
    }
}

# coverage analysis
my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY);
if (!defined($coverage_dir) && (defined($ttable) || defined($corpus))) {
    if (!defined($input)) {
        die("ERROR: when specifying either ttable or input-corpus, please also specify input\n");
    }
    $MAX_LENGTH = 7;
    &input_phrases();
    &ttable_coverage("0",$ttable) if defined($ttable);
    &corpus_coverage() if defined($corpus);
    &input_annotation();

    # corpus coverage for non-surface factors
    if (defined($input_factors)) {
        for(my $factor=1;$factor<$input_factors;$factor++) {
&input_phrases($factor); &corpus_coverage($factor); } } # factored ttable coverage foreach my $factored_ttable (@FACTORED_TTABLE) { die("factored ttable must be specified as factor:file -- $ttable") unless $factored_ttable =~ /^([\d,]+)\:(.+)/; # factor:ttable my ($factor,$file) = ($1,$2); next if defined($ttable) && $file eq $ttable; # no need to do this twice &input_phrases($factor); &ttable_coverage($factor,$file); } } if (defined($precision_by_coverage)) { &precision_by_coverage("ttable"); &precision_by_coverage("corpus"); } # bilingual concordance -- not used by experiment.perl if (defined($corpus) && defined($output_corpus) && defined($alignment) && defined($biconcor)) { `$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`; } # process search graph for visualization if (defined($search_graph)) { &process_search_graph($search_graph); } sub best_matches { my ($CORRECT,$TOTAL,$out) = @_; my $type = ($out =~ /precision/) ? "precision" : "recall"; for(my $length=1;$length<=$MAX_LENGTH;$length++) { my ($total,$correct) = (0,0); open(OUT,">$out.$length") or die "Cannot open: $!"; foreach my $ngram (keys %{$$TOTAL{$length}}) { printf OUT "%d\t%d\t%s\n", $$TOTAL{$length}{$ngram}, $$CORRECT{$length}{$ngram}, $ngram; $total += $$TOTAL{$length}{$ngram}; $correct += $$CORRECT{$length}{$ngram}; } close(OUT); print SUMMARY "$type-$length-total: $total\n"; print SUMMARY "$type-$length-correct: $correct\n"; } } # get all the n-grams from the input corpus sub input_phrases { my ($factor) = (@_); %INPUT_PHRASE = (); open(INPUT,$input) or die "Can't read input $input"; while(my $line = ) { chop($line); $line = &get_factor_phrase($factor,$line); &extract_n_grams($line,\%INPUT_PHRASE); } close(INPUT); } # reduce a factorized phrase into the factors of interest sub get_factor_phrase { my ($factor,$line) = @_; # clean line $line =~ s/[\r\n]+//g; $line =~ s/\s+/ /; $line =~ s/^ //; $line =~ s/ $//; # only surface? 
delete remaining factors if (!defined($factor) || $factor eq "0") { $line =~ s/\|\S+//g; return $line; } my $factored_line = ""; # reduce each word foreach (split(/ /,$line)) { $factored_line .= &get_factor_word($factor,$_) . " "; } chop($factored_line); return $factored_line; } # reduce a factorized word into the factors of interest sub get_factor_word { my ($factor,$word) = @_; my @WORD = split(/\|/,$word); my $fword = ""; foreach (split(/,/,$factor)) { $fword .= $WORD[$_]."|"; } chop($fword); return $fword; } sub factor_ext { my ($factor) = @_; return "" if !defined($factor) || $factor eq "0"; return ".".$factor; } sub bleu_annotation { open(OUT,"| sort -r >$dir/bleu-annotation") or die "Cannot open: $!"; for(my $i=0;$i0; $ngram .= $WORD[$i+$n]; } $REF_NGRAM{$length}{$ngram}--; if ($REF_NGRAM{$length}{$ngram} >= 0) { $ngram_correct++; for(my $n=0;$n<$length;$n++) { $MATCH[$i+$n] = $length; } } } $bleu *= ($ngram_correct/(scalar(@WORD)-$length+2)); } $bleu = $bleu ** (1/4); my $ref_length = 9999; if (ref($REFERENCE[$i]) eq 'ARRAY') { foreach my $ref (@{$REFERENCE[$i]}) { my @RW = split(/ /,$ref); $ref_length = scalar(@RW) if scalar(@RW) < $ref_length; } } else { my @RW = split(/ /,$REFERENCE[$i]); $ref_length = scalar(@RW); } if (scalar(@WORD) < $ref_length && scalar(@WORD)>0) { $bleu *= exp(1-$ref_length/scalar(@WORD)); } printf OUT "%5.4f\t%d\t",$bleu,$i; for(my $i=0;$i $ref_count) ? $ref_count : $sys_count; $$CORRECT{$length}{$ngram} += $match_count; $$TOTAL{$length}{$ngram} += $sys_count; #print "$length:$ngram $sys_count $ref_count\n"; } } } sub ttable_coverage { my ($factor,$ttable) = @_; # open file if (! 
-e $ttable && -e $ttable.".gz") {
        open(TTABLE,"gzip -cd $ttable.gz|") or die "Cannot open: $!";
    }
    elsif ($ttable =~ /.gz$/) {
        open(TTABLE,"gzip -cd $ttable|") or die "Cannot open: $!";
    }
    else {
        open(TTABLE,$ttable) or die "Can't read ttable $ttable: $!";
    }

    # create report file
    open(REPORT,">$dir/ttable-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
    my ($last_in,$last_size,$size) = ("",0);

    # which score column holds the forward probability p(e|f)?
    my $p_e_given_f_score = 2;
    if ($score_options) {
        if ($score_options =~ /OnlyDirect/) {
            $p_e_given_f_score = 0;
        }
        elsif ($score_options =~ /NoLex/) {
            $p_e_given_f_score = 1;
        }
    }

    my @DISTRIBUTION = ();
    while(<TTABLE>) {   # restored: filehandle read was stripped
        chop;
        my @COLUMN = split(/ +\|\|\| +/);
        my ($in,$out,$scores) = @COLUMN;
        # handling hierarchical
        $in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
        next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
        $in = &get_factor_phrase($factor,$in) if defined($factor) && $factor eq "0";
        $scores = $COLUMN[4] if defined($hierarchical); #scalar @COLUMN == 5;
        my @IN = split(/ /,$in);
        $size = scalar @IN;
        next unless defined($INPUT_PHRASE{$size}{$in});
        $TTABLE_COVERED{$size}{$in}++;
        my @SCORE = split(/ /,$scores);
        if ($in ne $last_in) {
            # source phrase changed: flush entropy for the previous phrase
            if ($last_in ne "") {
                my $entropy = &compute_entropy(@DISTRIBUTION);
                printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
                $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
                @DISTRIBUTION = ();
            }
            $last_in = $in;
            $last_size = $size;
        }
        push @DISTRIBUTION, $SCORE[$p_e_given_f_score]; # forward probability
    }
    # flush the final phrase
    my $entropy = &compute_entropy(@DISTRIBUTION);
    printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
    $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
    close(REPORT);
    close(TTABLE);

    &additional_coverage_reports($factor,"ttable",\%TTABLE_COVERED);
}

# entropy (in bits) of an unnormalized probability distribution
sub compute_entropy {
    my $z = 0; # normalization
    foreach my $p (@_) {
        $z += $p;
    }
    my $entropy = 0;
    foreach my $p (@_) {
        next if $p == 0;
        $entropy -= ($p/$z)*log($p/$z)/log(2);
    }
    return $entropy;
}

# how often do the known input phrases occur in the training corpus?
sub corpus_coverage {
    my ($factor) = @_;
    %CORPUS_COVERED = ();

    # compute how often input phrases occur in the corpus
    open(CORPUS,$corpus) or die "Can't read corpus $corpus";
    while(<CORPUS>) {   # restored: filehandle read was stripped
        my $line = &get_factor_phrase($factor,$_);
        my @WORD = split(/ /,$line);
        my $sentence_length = scalar @WORD;
        for(my $start=0;$start < $sentence_length;$start++) {
            my $phrase = "";
            for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) {
                $phrase .= " " if $length > 1;
                $phrase .= $WORD[$start+$length-1];
                # longer extensions cannot be input phrases either
                last if !defined($INPUT_PHRASE{$length}{$phrase});
                $CORPUS_COVERED{$length}{$phrase}++;
            }
        }
    }
    close(CORPUS);

    # report occurrence counts for all known input phrases
    open(REPORT,">$dir/corpus-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
    foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
        foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
            next unless defined $CORPUS_COVERED{$size}{$phrase};
            printf REPORT "%s\t%d\n", $phrase, $CORPUS_COVERED{$size}{$phrase};
        }
    }
    close(REPORT);

    &additional_coverage_reports($factor,"corpus",\%CORPUS_COVERED);
}

sub additional_coverage_reports {
    my ($factor,$name,$COVERED) = @_;

    # unknown word report ---- TODO: extend to rare words?
open(REPORT,">$dir/$name-unknown".&factor_ext($factor)) or die "Cannot open: $!"; foreach my $phrase (keys %{$INPUT_PHRASE{1}}) { next if defined($$COVERED{1}{$phrase}); printf REPORT "%s\t%d\n",$phrase,$INPUT_PHRASE{1}{$phrase}; } close(REPORT); # summary report open(REPORT,">$dir/$name-coverage-summary".&factor_ext($factor)) or die "Cannot open: $!"; foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) { my (%COUNT_TYPE,%COUNT_TOKEN); foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) { my $covered = $$COVERED{$size}{$phrase}; $covered = 0 unless defined($covered); $COUNT_TYPE{$covered}++; $COUNT_TOKEN{$covered} += $INPUT_PHRASE{$size}{$phrase}; } foreach my $count (sort {$a <=> $b} keys %COUNT_TYPE) { printf REPORT "%d\t%d\t%d\t%d\n",$size,$count,$COUNT_TYPE{$count},$COUNT_TOKEN{$count}; } } close(REPORT); } sub input_annotation { open(OUT,">$dir/input-annotation") or die "Cannot open: $!";; open(INPUT,$input) or die "Can't read input $input"; while() { chop; s/\|\S+//g; # remove additional factors s/<\S[^>]*>//g; # remove xml markup s/\s+/ /g; s/^ //; s/ $//; # remove redundant spaces print OUT $_."\t"; my @WORD = split; my $sentence_length = scalar @WORD; for(my $start=0;$start < $sentence_length;$start++) { my $phrase = ""; for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) { $phrase .= " " if $length > 1; $phrase .= $WORD[$start+$length-1]; my $ttable_covered = $TTABLE_COVERED{$length}{$phrase}; my $corpus_covered = $CORPUS_COVERED{$length}{$phrase}; next unless defined($ttable_covered) || defined($corpus_covered); my $ttable_entropy = $TTABLE_ENTROPY{$length}{$phrase} || 0; #$ttable_entropy = 0 unless defined($ttable_entropy); $ttable_covered = 0 unless defined($ttable_covered); $corpus_covered = 0 unless defined($corpus_covered); if (defined($TTABLE_COVERED{$length}{$phrase})) { printf OUT "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy; } } } print OUT "\n"; } close(INPUT); 
close(OUT); } sub extract_n_grams_arrayopt { my ($sentence,$NGRAM,$minmax) = @_; if (ref($sentence) eq 'ARRAY') { my %MINMAX_NGRAM; &extract_n_grams($$sentence[0],\%MINMAX_NGRAM); for(my $i=1;$i $SET_NGRAM{$length}{$ngram}) { $MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram}; } } } else { foreach my $ngram (keys %{$SET_NGRAM{$length}}) { if (!defined($MINMAX_NGRAM{$length}{$ngram}) || $SET_NGRAM{$length}{$ngram} > $MINMAX_NGRAM{$length}{$ngram}) { $MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram}; } } } } } for(my $length=1;$length<=$MAX_LENGTH;$length++) { foreach my $ngram (keys %{$MINMAX_NGRAM{$length}}) { $$NGRAM{$length}{$ngram} += $MINMAX_NGRAM{$length}{$ngram}; } } } else { &extract_n_grams($sentence,$NGRAM); } } sub extract_n_grams { my ($sentence,$NGRAM) = @_; $sentence =~ s/[\r\n]+//g; $sentence =~ s/\s+/ /g; $sentence =~ s/^ //; $sentence =~ s/ $//; my @WORD = split(/ /,$sentence); for(my $length=1;$length<=$MAX_LENGTH;$length++) { for(my $i=0;$i<=scalar(@WORD)-$length;$i++) { my $ngram = ""; for(my $n=0;$n<$length;$n++) { $ngram .= " " if $n>0; $ngram .= $WORD[$i+$n]; } $$NGRAM{$length}{$ngram}++; } } } sub precision_by_coverage { my ($coverage_type) = @_; my (%PREC_BY_WORD,%TOTAL_BY_WORD,%LENGTH_BY_WORD,%DELETED_BY_WORD); my (%PREC_BY_COVERAGE,%TOTAL_BY_COVERAGE,%LENGTH_BY_COVERAGE,%DELETED_BY_COVERAGE); my (%PREC_BY_FACTOR,%TOTAL_BY_FACTOR,%LENGTH_BY_FACTOR,%DELETED_BY_FACTOR); my (%PREC_BY_FACTOR_COVERAGE,%TOTAL_BY_FACTOR_COVERAGE,%LENGTH_BY_FACTOR_COVERAGE,%DELETED_BY_FACTOR_COVERAGE); # get coverage statistics my %COVERAGE; print STDERR "".(defined($coverage_dir)?$coverage_dir:$dir) ."/$coverage_type-coverage-by-phrase"; open(COVERAGE,(defined($coverage_dir)?$coverage_dir:$dir) ."/$coverage_type-coverage-by-phrase") or die "Cannot open: $!"; while() { chop; my ($phrase,$count) = split(/\t/); $COVERAGE{$phrase} = $count; } close(COVERAGE); # go through each line... 
open(FILE,$segmentation) || die("ERROR: could not open segmentation file $segmentation"); open(INPUT,$input) or die "Can't read input $input"; open(ALIGNMENT,$system_alignment) or die "Can't read output alignment file $system_alignment"; # get marked up output my $line_count = 0; while(my $line = ) { chop($line); # get corresponding input line my $input = ; my @INPUT = split(/ /,&get_factor_phrase(0,$input)); # surface my @FACTOR = split(/ /,&get_factor_phrase($precision_by_coverage_factor,$input)); # word alignment my $alignment = ; my %ALIGNED; foreach (split(/ /,$alignment)) { my ($input_pos,$output_pos) = split(/\-/,$_); push @{$ALIGNED{$input_pos}}, $output_pos; } # output words # @SYSTEM is already collected my @OUTPUT = split(/ /,$SYSTEM[$line_count]); # compute precision of each ngram # @REFERENCE (possibly multiple) is already collected my (%SYS_NGRAM,%REF_NGRAM,%PREC_NGRAM); &extract_n_grams( $SYSTEM[$line_count], \%SYS_NGRAM ); &extract_n_grams_arrayopt( $REFERENCE[$line_count++], \%REF_NGRAM, "max" ); foreach my $ngram (keys %{$SYS_NGRAM{1}}) { # note: only interested in unigram precision $PREC_NGRAM{1}{$ngram} = 0; if (defined($REF_NGRAM{1}) && defined($REF_NGRAM{1}{$ngram})) { my $ref_count = $REF_NGRAM{1}{$ngram}; my $sys_count = $SYS_NGRAM{1}{$ngram}; $PREC_NGRAM{1}{$ngram} = ($ref_count >= $sys_count) ? 
1 : $ref_count/$sys_count; } } close(REPORT); # process one phrase at a time my $output_pos = 0; while($line =~ /([^|]+) \|(\d+)\-(\d+)\|\s*(.*)$/) { my ($output,$from,$to) = ($1,$2,$3); $line = $4; # bug fix: 1-1 unknown word mappings get alignment point if ($from == $to && # one scalar(split(/ /,$output)) == 1 && # to one !defined($ALIGNED{$from})) { # but not aligned push @{$ALIGNED{$from}},$output_pos; } $output_pos += scalar(split(/ /,$output)); # compute precision for each word for(my $i=$from; $i<=$to; $i++) { my $coverage = 0; $coverage = $COVERAGE{$INPUT[$i]} if defined($COVERAGE{$INPUT[$i]}); my ($precision,$deleted,$length) = (0,0,0); # unaligned? note as deleted if (!defined($ALIGNED{$i})) { $deleted = 1; } # aligned else { foreach my $o (@{$ALIGNED{$i}}) { $precision += $PREC_NGRAM{1}{$OUTPUT[$o]}; } $precision /= scalar(@{$ALIGNED{$i}}); # average, if multi-aligned $length = scalar(@{$ALIGNED{$i}}); } my $word = $INPUT[$i]; $word .= "\t".$FACTOR[$i] if $precision_by_coverage_factor; $DELETED_BY_WORD{$word} += $deleted; $PREC_BY_WORD{$word} += $precision; $LENGTH_BY_WORD{$word} += $length; $TOTAL_BY_WORD{$word}++; $DELETED_BY_COVERAGE{$coverage} += $deleted; $PREC_BY_COVERAGE{$coverage} += $precision; $LENGTH_BY_COVERAGE{$coverage} += $length; $TOTAL_BY_COVERAGE{$coverage}++; if ($precision_by_coverage_factor) { $DELETED_BY_FACTOR{$FACTOR[$i]} += $deleted; $DELETED_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $deleted; $PREC_BY_FACTOR{$FACTOR[$i]} += $precision; $PREC_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $precision; $LENGTH_BY_FACTOR{$FACTOR[$i]} += $length; $LENGTH_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $length; $TOTAL_BY_FACTOR{$FACTOR[$i]}++; $TOTAL_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage}++; } } } } close(FILE); open(REPORT,">$dir/precision-by-$coverage_type-coverage") or die "Cannot open: $!"; foreach my $coverage (sort {$a <=> $b} keys %TOTAL_BY_COVERAGE) { printf REPORT "%d\t%.3f\t%d\t%d\t%d\n", $coverage, 
$PREC_BY_COVERAGE{$coverage}, $DELETED_BY_COVERAGE{$coverage}, $LENGTH_BY_COVERAGE{$coverage}, $TOTAL_BY_COVERAGE{$coverage}; } close(REPORT); open(REPORT,">$dir/precision-by-input-word") or die "Cannot open: $!"; foreach my $word (keys %TOTAL_BY_WORD) { my ($w,$f) = split(/\t/,$word); my $coverage = 0; $coverage = $COVERAGE{$w} if defined($COVERAGE{$w}); printf REPORT "%.3f\t%d\t%d\t%d\t%d\t%s\n", $PREC_BY_WORD{$word}, $DELETED_BY_WORD{$word}, $LENGTH_BY_WORD{$word}, $TOTAL_BY_WORD{$word},$coverage,$word; } close(REPORT); if ($precision_by_coverage_factor) { open(REPORT,">$dir/precision-by-$coverage_type-coverage.$precision_by_coverage_factor") or die "Cannot open: $!"; foreach my $factor (sort keys %TOTAL_BY_FACTOR_COVERAGE) { foreach my $coverage (sort {$a <=> $b} keys %{$TOTAL_BY_FACTOR_COVERAGE{$factor}}) { printf REPORT "%s\t%d\t%.3f\t%d\t%d\t%d\n", $factor, $coverage, $PREC_BY_FACTOR_COVERAGE{$factor}{$coverage}, $DELETED_BY_FACTOR_COVERAGE{$factor}{$coverage}, $LENGTH_BY_FACTOR_COVERAGE{$factor}{$coverage}, $TOTAL_BY_FACTOR_COVERAGE{$factor}{$coverage}; } } close(REPORT); } } sub segmentation { my %SEGMENTATION; open(FILE,$segmentation) || die("ERROR: could not open segmentation file $segmentation"); open(OUT,">$dir/segmentation-annotation") or die "Cannot open: $!"; while() { chop; my $count=0; my $out = -1; foreach (split) { if (/^\|(\d+)\-(\d+)\|$/) { print OUT " " unless $out-($count-1) == 0; printf OUT "%d:%d:%d:%d",$1,$2,$out-($count-1),$out; my $in_count = $2-$1+1; $SEGMENTATION{$in_count}{$count}++; $count = 0; } else { $out++; $count++; } } print OUT "\n"; } close(OUT); close(FILE); open(SUMMARY,">$dir/segmentation") or die "Cannot open: $!"; foreach my $in (sort { $a <=> $b } keys %SEGMENTATION) { foreach my $out (sort { $a <=> $b } keys %{$SEGMENTATION{$in}}) { printf SUMMARY "%d\t%d\t%d\n", $in, $out, $SEGMENTATION{$in}{$out}; } } close(SUMMARY); # TODO: error by segmentation } # analyze the trace file to collect statistics over the # 
# hierarchical derivations and also create segmentation annotation
sub hierarchical_segmentation {
    my $last_sentence = -1;
    my @DERIVATION;
    my %STATS;
    open(TRACE,$segmentation.".trace") or die "Cannot open: $!";
    open(INPUT_TREE,">$dir/input-tree") or die "Cannot open: $!";
    open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
    open(NODE,">$dir/node") or die "Cannot open: $!";
    while(<TRACE>) {   # restored: filehandle read was stripped
        my $sentence;
        my %ITEM;
        &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_");
        # new sentence reached: process the completed derivation
        if ($last_sentence >= 0 && $sentence != $last_sentence) {
            &hs_process($last_sentence,\@DERIVATION,\%STATS);
            @DERIVATION = ();
        }
        push @DERIVATION,\%ITEM;
        $last_sentence = $sentence;
    }
    &hs_process($last_sentence,\@DERIVATION,\%STATS);
    close(TRACE);
    close(NODE);
    close(INPUT_TREE);
    close(OUTPUT_TREE);

    open(SUMMARY,">$dir/rule") or die "Cannot open: $!";
    print SUMMARY "sentence-count\t".(++$last_sentence)."\n";
    print SUMMARY "glue-rule\t".$STATS{'glue-rule'}."\n";
    print SUMMARY "depth\t".$STATS{'depth'}."\n";
    foreach (keys %{$STATS{'rule-type'}}) {
        print SUMMARY "rule\t$_\t".$STATS{'rule-type'}{$_}."\n";
    }
    close(SUMMARY);
}

# scan a single line of the trace file
sub hs_scan_line {
    my ($line,$ref_sentence,$ref_item) = @_;

    if ($line =~ /^Trans Opt/) {
        # Old format
        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
            $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ ||
            return 0;
        my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
        ${$ref_sentence} = $sentence;
        $ref_item->{'start'} = $start;
        $ref_item->{'end'} = $end;
        $ref_item->{'rule_lhs'} = $rule_lhs;
        # NOTE(review): substitution pattern was lost in the garbled source;
        # '&gt;' un-escaping is the most plausible original -- confirm upstream
        $rule_rhs =~ s/&gt;/>/g;
        @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);
        foreach (split(/ /,$alignment)) {
            /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
            $ref_item->{'alignedSpan'}{$1} = 1;
        }
        @{$ref_item->{'spans'}} = ();
        foreach
my $span (reverse split(/\s+/,$spans)) { $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n"); my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 ); push @{$ref_item->{'spans'}}, \%SPAN; } } else { # New format $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0; my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6); ${$ref_sentence} = $sentence; @{$ref_item->{'spans'}} = (); foreach (split(/ /,$source_rhs)) { /^\[?([^\]]+)\]?$/; my %SPAN = ( 'word' => $1 ); push @{$ref_item->{'spans'}}, \%SPAN; } my $i = 0; foreach my $span (split(/ /,$source_spans)) { $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n"); $ref_item->{'spans'}[$i]{'from'} = $1; $ref_item->{'spans'}[$i]{'to'} = $2; if ($i == 0) { $ref_item->{'start'} = $1; } $ref_item->{'end'} = $2; $i++; } $ref_item->{'rule_lhs'} = $target_lhs; $target_rhs =~ s//>/g; @{$ref_item->{'rule_rhs'}} = (); foreach (split(/ /,$target_rhs)) { /^\[?([^\]]+)\]?$/; push @{$ref_item->{'rule_rhs'}}, $1; } foreach (split(/ /,$alignment)) { /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n"); $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span $ref_item->{'alignedSpan'}{$1} = 1; } } return 1; } # process a single sentence for hierarchical segmentation sub hs_process { my ($sentence,$DERIVATION,$STATS) = @_; my $DROP_RULE = shift @{$DERIVATION}; # get rid of S -> S my $max = $$DERIVATION[0]{'end'}; # consolidate glue rules into one rule my %GLUE_RULE; $GLUE_RULE{'start'} = 1; $GLUE_RULE{'end'} = $max; $GLUE_RULE{'rule_lhs'} = "S"; $GLUE_RULE{'depth'} = 0; my $x=0; while(1) { my $RULE = shift @{$DERIVATION}; if (scalar(@{$$RULE{'rule_rhs'}}) == 2 && ($$RULE{'rule_lhs'} eq "S" && $$RULE{'rule_rhs'}[0] eq "S" && $$RULE{'rule_rhs'}[1] eq "X") || ($$RULE{'rule_lhs'} eq "Q" && $$RULE{'rule_rhs'}[0] eq "Q")) { unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1]; push @{$GLUE_RULE{'rule_rhs'}}, 
$$RULE{'rule_rhs'}[1]; $GLUE_RULE{'alignment'}{$x} = $x; $GLUE_RULE{'alignedSpan'}{$x} = 1; $x++; } else { unshift @{$DERIVATION}, $RULE; last; } } unshift @{$DERIVATION}, \%GLUE_RULE; $$STATS{'glue-rule'} += $x; # create chart my %CHART; foreach my $RULE (@{$DERIVATION}) { $CHART{$$RULE{'start'}}{$$RULE{'end'}} = $RULE; } # compute depth &hs_compute_depth(1,$max,0,\%CHART); my $max_depth = 0; foreach my $RULE (@{$DERIVATION}) { next unless defined($$RULE{'depth'}); # better: delete offending rule S -> S $max_depth = $$RULE{'depth'} if $$RULE{'depth'} > $max_depth; } &hs_recompute_depth(1,$max,\%CHART,$max_depth); $$STATS{'depth'} += $max_depth; # build matrix of divs my @MATRIX; &hs_create_out_span(1,$max,\%CHART,\@MATRIX); print OUTPUT_TREE &hs_output_matrix($sentence,\@MATRIX,$max_depth); my @MATRIX_IN; &hs_create_in_span(1,$max,\%CHART,\@MATRIX_IN); print INPUT_TREE &hs_output_matrix($sentence,\@MATRIX_IN,$max_depth); # number rules and get their children my $id = 0; foreach my $RULE (@{$DERIVATION}) { next unless defined($$RULE{'start_div'}); # better: delete offending rule S -> S $$STATS{'rule-type'}{&hs_rule_type($RULE)}++ if $id>0; $$RULE{'id'} = $id++; } &hs_get_children(1,$max,\%CHART); foreach my $RULE (@{$DERIVATION}) { next unless defined($$RULE{'start_div'}); # better: delete offending rule S -> S print NODE $sentence." "; print NODE $$RULE{'depth'}." "; print NODE $$RULE{'start_div'}." ".$$RULE{'end_div'}." "; print NODE $$RULE{'start_div_in'}." ".$$RULE{'end_div_in'}." 
"; print NODE join(",",@{$$RULE{'children'}})."\n"; } } sub hs_output_matrix { my ($sentence,$MATRIX,$max_depth) = @_; my @OPEN; my $out = ""; for(my $d=0;$d<=$max_depth;$d++) { push @OPEN, 0; } foreach my $SPAN (@$MATRIX) { $out .= $sentence."\t"; for(my $d=0;$d<=$max_depth;$d++) { my $class = " "; my $closing_flag = 0; if (defined($$SPAN{'closing'}) && defined($$SPAN{'closing'}{$d})) { $closing_flag = 1; } if ($d == $$SPAN{'depth'}) { if (defined($$SPAN{'opening'}) && $closing_flag) { $class = "O"; } elsif(defined($$SPAN{'opening'})) { $class = "["; } elsif($closing_flag) { $class = "]"; } else { $class = "-"; } } elsif ($closing_flag) { $class = "]"; } elsif ($OPEN[$d]) { $class = "-"; } $out .= $class; } $out .= "\t"; $out .= $$SPAN{'lhs'} if defined($$SPAN{'lhs'}); $out .= "\t"; $out .= $$SPAN{'rhs'} if defined($$SPAN{'rhs'}); $out .= "\n"; $OPEN[$$SPAN{'depth'}] = 1 if defined($$SPAN{'opening'}); if(defined($$SPAN{'closing'})) { for(my $d=$max_depth;$d>=0;$d--) { $OPEN[$d] = 0 if defined($$SPAN{'closing'}{$d}); } } } return $out; } sub hs_rule_type { my ($RULE) = @_; my $type = ""; # output side my %NT; my $total_word_count = 0; my $word_count = 0; my $nt_count = 0; for(my $i=0;$i 0; $word_count = 0; my $nt = chr(97+$nt_count++); $NT{$$RULE{'alignment'}{$i}} = $nt; $type .= $nt; } else { $word_count++; $total_word_count++; } } $type .= $word_count if $word_count > 0; $type .= ":".$total_word_count.":".$nt_count.":"; # input side $word_count = 0; $total_word_count = 0; for(my $i=0;$i 0; $word_count = 0; $type .= $NT{$i}; } else { $word_count++; $total_word_count++; } } $type .= $word_count if $word_count > 0; $type .= ":".$total_word_count; return $type; } # compute depth of each node sub hs_compute_depth { my ($start,$end,$depth,$CHART) = @_; if (!defined($$CHART{$start}{$end})) { print STDERR "warning: illegal span ($start,$end)\n"; return; } my $RULE = $$CHART{$start}{$end}; $$RULE{'depth'} = $depth; for(my $i=0;$i) { my 
($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score); if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) { ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); $heuristic_rule_score = $rule_score; # hmmmm.... } else { die("ERROR: buggy search graph line: $_"); } chop($alignment) if $alignment; chop($children) if $children; $recomb = 0 unless $recomb; $children = "" unless defined $children; $alignment = "" unless defined $alignment; if ($last_sentence != $sentence) { close(SENTENCE) if $sentence; open(SENTENCE,">$dir/search-graph/graph.$sentence"); $last_sentence = $sentence; } print SENTENCE "$id\t$recomb\t$from\t$to\t$output\t$alignment\t$children\t$rule_score\t$heuristic_rule_score\t$hyp_score\t$lhs\n"; } close(OSG); close(SENTENCE); }