Welcome to mirror list, hosted at ThFree Co, Russian Federation.

fsa2fsal.pl « generic « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 28ec28a261f76a71f149196c9db36e4f3fe36df5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env perl
# A very simple script that converts fsa format (openfst lattices) to the same
# thing represented one sentence per line. It uses '|||' to delimit columns and
# ' ' to delimit nodes (i.e. original lines).
# Some rudimentary sanity checks are done on the fly.
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Count of problems reported so far; a non-zero value makes the script exit
# with a failure status once all input has been processed.
my $errs = 0;

# Report one problem on STDERR in the form "<line number>:<message>" and
# record that it happened.
#   $_[0]  line number to prefix the message with
#   $_[1]  text describing the problem
sub err {
  my ($lineno, $text) = @_;
  printf STDERR "%s:%s\n", $lineno, $text;
  $errs++;
}

# Number of output lines written so far.
my $onr = 0;
# Converted input lines collected since the last flush.
my @lines = ();

# Write everything collected in @lines to STDOUT as a single line, with the
# entries separated by one space, then empty the buffer. When nothing has
# been collected no output is produced, so consecutive blank input lines do
# not create empty output lines.
sub flush {
  return unless @lines;
  my $joined = join(" ", @lines);
  print $joined, "\n";
  $onr++;
  @lines = ();
}

# Number of the input line currently being processed (1-based, cumulative
# over all input read through <>); prefixes every diagnostic from err().
my $nr = 0;
# Number of commas in the score column of the first arc read. Every later arc
# must have the same count. Stays undef until the first arc has been seen.
my $numscores = undef;
while (<>) {
  # Count every line, blank separators included, so that the numbers in the
  # diagnostics match the line numbers of the input.
  $nr++;
  chomp;
  if ($_ eq "") {
    # A blank line closes the current group of lines: emit it as one line.
    flush();
    next;
  }
  # Columns: source node, target node, label, comma-separated scores.
  # Anything beyond the fourth column ends up in $rest and is reported below.
  # NOTE(review): lines with fewer than four columns are not diagnosed
  # specifically; they only trigger Perl's uninitialized-value warnings.
  my ($from, $to, $label, $scores, $rest) = split /\s+/, $_, 5;
  # '|||' is the output column delimiter, so it must not occur in the data.
  err($nr, "The delimiter '|||' can't appear in the input!") if /\|\|\|/;
  err($nr, "Node id not numeric: $from") if $from !~ /^\d+$/;
  err($nr, "Node id not numeric: $to") if $to !~ /^\d+$/;
  err($nr, "Unexpected tail: '$rest'") if defined $rest && $rest !~ /^\s*$/;
  # tr/,/,/ replaces commas by themselves, i.e. it merely counts them; the
  # number of scores is that count plus one.
  my $thisnumscores = ($scores =~ tr/,/,/);
  $numscores = $thisnumscores if !defined $numscores;
  err($nr, "Incompatible number of arc scores, previous lines had ".($numscores+1).", now ".($thisnumscores+1))
    if $numscores != $thisnumscores;
  # Problems are only reported; the line is still converted and buffered.
  push @lines, join("|||", ($from,$to,$label,$scores));
}
# Emit the final group if the input did not end with a blank line.
flush();

# Tell the caller that at least one sanity check failed.
exit 1 if $errs;