Welcome to mirror list, hosted at ThFree Co, Russian Federation.

reference-from-sgm.perl « support « ems « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: b8e1d108dd8dcdf3a4bc4a4a34aa469a1b0e120f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Exactly three command line arguments are required; they are taken, in
# order, as the reference file, the source file and the output file name.
if (scalar(@ARGV) != 3) {
    die("ERROR syntax: reference-from-sgm.perl ref src out");
}
my ($ref,$src,$txt) = @ARGV;

# Get the order of the documents: collect the docid attribute of every line
# of the source file that carries one, in file order.  @ORDER later decides
# the order in which the reference documents are written out.
my @ORDER;
# Three-argument open with a lexical handle, so that a file name containing
# characters such as '>' or '|' is never interpreted as an open mode.
open(my $order_fh, '<', $src) || die("ERROR not found: $src ($!)");
while (my $src_line = <$order_fh>) {
    # Case-insensitive, like the docid match applied to the reference file.
    next unless $src_line =~ /docid="([^\"]+)"/i;
    push @ORDER,$1;
}
close($order_fh);

# Get from the sgm file which lines belong to which system.
# %DOC maps system id -> document id -> list of segment texts in file order.
# The system id is the refid of a <refset ...> line if one was seen,
# otherwise the sysid attribute of the current <doc ...> line.
my %DOC;
my $system_from_refset = 0;   # becomes 1 once a <refset refid="..."> was read
my ($doc,$system);
# Three-argument open with a lexical handle: the file name is never parsed
# for mode characters.
open(my $ref_fh, '<', $ref) or die "Cannot open: $!";
while (my $line = <$ref_fh>) {
    if ($line =~ /<refset/ && $line =~ /refid="([^\"]+)"/i) {
        $system = $1;
        $system_from_refset = 1;
    }
    if ($line =~ /<doc/i) {
        # Without a refset-level id every <doc> has to name its system itself.
        die("ERROR no sysid in: $line")
            unless $line =~ /sysid="([^\"]+)"/i || $system_from_refset;
        $system = $1 unless $system_from_refset;
        die("ERROR no docid in: $line")
            unless $line =~ /docid="([^\"]+)"/i;
        $doc = $1;
    }
    # A segment may be spread over several physical lines.  If the opening
    # tag is here but the closing tag is not, read on until it shows up.
    if ($line =~ /<seg[^>]+>/i && $line !~ /<seg[^>]+>.*<\/seg>/is) {
        while ($line !~ /<seg[^>]+>.*<\/seg>/is) {
            my $next_line = <$ref_fh>;
            unless (defined $next_line) {
                # End of file inside a segment: stop reading instead of
                # appending undef; the incomplete segment is not stored.
                warn("WARNING unterminated <seg> at end of $ref\n");
                last;
            }
            $line .= $next_line;
        }
        # Fold every line break (with the whitespace around it) into one
        # space and drop whitespace in front of the closing tag, so that the
        # single-line pattern below captures the complete text.
        $line =~ s/\s*\n\s*/ /g;
        $line =~ s/\s+(<\/seg>)/$1/gi;
    }
    # NOTE(review): (.+) needs at least one character, so a completely empty
    # <seg ...></seg> is not stored at all -- confirm that this is intended.
    if ($line =~ /<seg[^>]+>\s*(.+)\s*<\/seg>/i) {
        push @{$DOC{$system}{$doc}},$1;
    }
}
close($ref_fh);

# Write one plain-text file per reference system, one segment per line, with
# the documents in the order recorded in @ORDER.  With a single system the
# output file is $txt itself; with several systems a running index is
# appended to the name: ".ref<i>" if $txt already ends in ".<digits>",
# otherwise just "<i>".
my $i=0;
# Sorted, so that the same system gets the same index on every run
# (the order of "keys" on a hash is not stable between runs).
foreach my $system (sort keys %DOC) {
    my $outfile = $txt;
    if (scalar(keys %DOC) > 1) {
        if ($outfile =~ /\.\d+$/) {
            $outfile .= ".ref$i";
        }
        else {
            $outfile .= $i;
        }
    }
    open(my $txt_fh, '>', $outfile) || die("$outfile: $!");
    foreach my $doc (@ORDER) {
        # "defined @array" is a fatal error in current Perl versions; test
        # for the hash entry instead.  Entries are only ever created by
        # pushing a segment, so an existing entry is a non-empty list.
        die("can't find '$doc' for ref '$system'")
            unless exists $DOC{$system}{$doc};
        foreach my $line (@{$DOC{$system}{$doc}}) {
            print $txt_fh $line."\n";
        }
    }
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($txt_fh) || die("$outfile: $!");
    $i++;
}