contrib/web/translate.cgi


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806

#!/usr/bin/perl -Tw
use warnings;
use strict;
$|++;

# file: translate.cgi

# Herve Saint-Amand
# saintamh [o] yahoo, com
# Universitaet des Saarlandes
# Mon May 12 14:10:54 2008

# This CGI script takes a web page URL as a parameter, fetches that page,
# translates it using the Moses decoder, and displays the translated version
# to the user, similarily to how Google or BabelFish translate web pages.

# I don't think I've ever written anything with such a high comment/code ratio,
# so hopefully it should be understandable. Just read top to bottom.


# TODO:
# 
#  - if the document contains <a name='anchor'></a> it will be lost
#  - don't insert spaces everywhere around soft tags
#  - charset autodetection would be nice, but it's not trivial

#------------------------------------------------------------------------------
# includes

use CGI;
use CGI::Carp qw/fatalsToBrowser/;

# we use the 2nd perl thread API. I think this means you need perl 5.6 or
# higher, compiled with thread support
use threads;
use threads::shared;

use Encode;
use HTML::Entities;
use HTML::Parser;
use LWP::UserAgent;
use URI;
use URI::Escape;

use lib 'lib';
use RemoteProcess;
use Subprocess;

#------------------------------------------------------------------------------
# constants, config

# In order to run this script, you must first start Moses as a sort of daemon
# process that accepts connections on some INET port, reads the sentences sent
# to it one line at a time and returns translations. The daemon.pl script that
# comes with this script does just that -- starts an instance of Moses and
# 'plugs' it to the net so it can be used from other machines or just other
# processes on the same machine.
# 
# This list here indicates where to find these instances of Moses. May be 
# localhost, or may be separate machines.
# 
# On the current UniSaar setup we use SSH tunneling to connect to other hosts,
# so from this script's POV they're all localhost. These ports are actually
# forwarded to other machines. There wouldn't be much point in running 16
# instances of Moses on the same machine.

my @MOSES_ADDRESSES = map "localhost:90$_",
    qw/01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16/;


# The tokenizer tries to adapt its rules depending on the language it's dealing
# with, so we indicate that here.

my $INPUT_LANG  = 'fr';
my $OUTPUT_LANG = 'en';


# In order to tokenize and detokenize strings in a way that stays consistent
# with how it is done in the rest of the Moses system, we use the scripts that
# come with Moses as external processes. These are the commands we must run to
# start them.

my @TOKENIZER_CMD   = ('./bin/tokenizer.perl',   '-l', $INPUT_LANG);
my @DETOKENIZER_CMD = ('./bin/detokenizer.perl', '-l', $OUTPUT_LANG);


# We call 'soft tags' HTML tags whose presence is tolerated inside
# sentences. All other tags are assumed to be sentence-breakers and will be
# used to chop up documents into independent sentences. These few, however, are
# allowed within sentences.

my %SOFT_TAGS = map {$_ => 1} qw/a b i u em font blink tt acronym/;


# We call 'verbatim tags' HTML tags whose entire data is to be left untouched
# and reprinted as-is. These also happen to be tags whose content is typically
# not printed by the browser.

my %VERBATIM_TAGS = map {$_ => 1} qw/script style/;


# Some HTML tags have attributes that contain URLs. Since we'll be displaying
# the page on another server than its usual source server, relative paths will
# be broken, so we need to make all URLs absolute. These are the attributes
# that will be so modified.

my %URL_ATTRS = %{{
    a      => 'href',
    img    => 'src',
    form   => 'action',
    link   => 'href',
    script => 'src',
}};


# Some HTML tags have attributes that can contain free text that is displayed
# to the user. Data in attributes is not usually translated, but these should
# be.
# 
# Note that for implementation reasons these will always be treated as hard,
# sentence-splitting tags. This could be changed but would require a
# substantial re-write of this script.

my %TEXT_ATTR = %{{ input => [qw/value/], img => [qw/alt title/], }};


# Sentence splitting within a paragraph or block of text is done after
# tokenizing. Tokens matched by this regex will be considered to end a
# sentence, and hence be used in splitting the text into sentences.

my $RE_EOS_TOKEN = qr/^(?:\.+|[\?!:;])$/;

# This regex also matches sentence-ending tokens, but tokens matched by this
# one will not be included in the sentence itself. Tokens matched by the
# previous regex will be sent to Moses as part of the end of the sentence.
# Tokens matches by this one will never be sent to Moses. Which is why the pipe
# symbol, which Moses doesn't seem to like, must be in here.

my $RE_SPLIT_TOKEN = qr!^[\|\-]+$!;

#------------------------------------------------------------------------------
# global vars

# In cleaner code there wouldn't be global variables, but it simplified things
# to put these here. Eventually I wouldn't mind removing this section.


# This array is very central to the way this script works. The document will be
# chopped up into a list of 'segments'. Each segment is either some HTML code
# and whitespace which we don't translate or manipulate in any way, or a bit of
# text to be translated. It's as if we highlighted in the HTML source the bits
# of text that needed translation, and make each stripe of highlighter, and
# each length of text between them, a segment.
# 
# Segments that are untouched HTML are simply strings. If the whole document
# contained no translatable text, this array would only contain strings.
# 
# Segments that contain text to be translated are represented as arrayrefs. The
# first element of that arrayref is the text to be translated, with any soft
# tags within it replaced by placeholders of the type MOSESOPENTAG4. The
# remaining elements contain the necessary info to reinsert these tags. The
# placeholders are numbered, and the i-th placeholder corresponds to the
# (i+1)-th element in the arrayref (element 0 being the text). That element is
# itself an array ref, whose first element is the tag name and second element
# is a hashref of attributes.
# 
# So this document:
# 
#   <p>This is <a href="somewhere">a link</a> but it's not <b>bold</b></p>
# 
# would be represented by this @segments array:
# 
#   0: "<p>"
#   1: [ 0: "This is MOSESOPENTAG0 a link MOSESCLOSETAG0 but it's not" .
#           " MOSESOPENTAG1 bold MOSESCLOSETAG1"
#        1: [ "a", { href => "somewhere" } ]
#        2: [ "b", {} ] ]
#   2: "</p>"
# 
# Finally, there's one hack to be mentioned: text in %TEXT_ATTR attributes
# (defined above) also goes into a segment of its own. Since this text does
# not contain tags, and to signal that the code for the popup containing
# source text should not be inserted around this text, we replace the tag
# information by the "__NOPOPUP__" string. So this document:
# 
#   <img src="blah" alt="This describes the image">
# 
# would correspond to this @segments array:
# 
#   0: "<img src=\"blah\" alt=\""
#   1: [ "This describes the image", "__NOPOPUP__" ]
#   2: "\">"
# 
# This is a horrible hack. Yes.

my @segments;


# Finally, since this script is run in 'tainted' mode (-T switch) for basic
# security reasons, and we'll be launching subprocesses, so we need to make
# sure the PATH is clean otherwise Perl will refuse to do the system() calls.

$ENV{PATH} = '';

#------------------------------------------------------------------------------
# Fetch the source page

# get value of URL param, make sure it's absolute
my $url = CGI->new->param ('url');
die "No URL?" unless $url;
$url = "http://$url" unless ($url =~ m!^[a-z]+://!);

# configure Web client
my $lwp = new LWP::UserAgent (%{{
    agent   => $ENV{HTTP_USER_AGENT} || 'Mozilla/5.0',
    timeout => 5,
}});

# fetch the web page we want to translate
my $res = $lwp->get ($url);
die "Couldn't fetch page: " . $res->status_line unless $res->is_success;
my $html = $res->decoded_content;

# Find the page's base url. It may be different than the URL given to us as
# parameter if for instance that URL redirects to a different one, or the
# document contains a <base> tag.
my $base_url = $res->base;

# Decode entities, except some basics because it confuses our parsing. We need
# this because Moses won't understand the entities. It sometimes introduces
# minor display bugs, though. TODO: decode only alphanumerical entities?
$html =~ s/&((?:lt|gt);?)/&amp;$1/g;
$html = decode_entities ($html);

# Start printing HTML page
print "Content-Type: text/html; charset=UTF-8\n\n";

#------------------------------------------------------------------------------
# Parser stack and state management

# We're going to use a callback parser to parse the HTML file. As we walk the
# HTML tree we maintain a buffer containing the current block if text to be
# translated. These state variables contain that. The buffer is repeatedly
# emptied and its contents pushed onto @segments.
# 
# We also remove 'soft' tags from the text as we append it to the buffer,
# replace them with placeholders, and save info about the tags we set aside in
# @buf_tag_index. @buf_tag_stack keeps track of 'currently open' tags, so that
# we can match closing tags to their opening tags.

my $buf_text_has_content = 0;
my $buf_text = '';
my @buf_tag_index;
my @buf_tag_stack;

my $in_verbatim = 0;


# This is called when we find soft tags within text to be translated. Arguments
# are the tag name, a hash of tag attributes, and a boolean telling us whether
# it's an opening or closing tag.
# 
# We perform lookups in the above state variables, save the tag info in them if
# necessary, and return a string which is the placeholder to replace that tag.

sub make_placeholder {
    my ($tag, $attr, $closing) = @_;
    my $placeholder = '';

    if ($closing) {

        # try to match closing tags with their opening sibling
        foreach my $i (reverse 0 .. $#buf_tag_stack) {
            if ($buf_tag_stack[$i][0] eq $tag) {
                $placeholder = 'MOSESCLOSETAG' . $buf_tag_stack[$i][1];
                splice (@buf_tag_stack, $i, 1);
                last;
            }
        }

        # lone closing tags are added to the index but not the stack
        if (!$placeholder) {
            push (@buf_tag_index, [ $tag, $attr ]);
            $placeholder = 'MOSESCLOSETAG' . $#buf_tag_index;
        }

    } else {
        # opening tags are added to the index and the stack
        push (@buf_tag_index, [ $tag, $attr ]);
        push (@buf_tag_stack, [ $tag, $#buf_tag_index ]);
        $placeholder = 'MOSESOPENTAG' . $#buf_tag_index;
    }

    return $placeholder;
}


# When we hit a hard tag, we call this to save any current text segment we have
# to the @segments array.

sub flush_buf_text {
    if ($buf_text_has_content || @buf_tag_index) {
        push (@segments, [ $buf_text, @buf_tag_index ] );
    } else {
        push (@segments, $buf_text);
    }

    $buf_text = '';
    @buf_tag_index = ();
    @buf_tag_stack = ();
    $buf_text_has_content = 0;
}

#------------------------------------------------------------------------------
# HTML parser

# Parser callback for when we hit an opening or closing tag
sub start_and_end_h {
    my ($tag, $attr, $closing) = @_;

    # keep track of whether we're in a verbatim segment
    $in_verbatim = $closing ? 0 : $tag
        if $VERBATIM_TAGS{$tag};

    # make links absolute
    my $url_attr = $URL_ATTRS{$tag};
    &make_link_absolute ($tag, $attr, $url_attr)
        if ($url_attr && $attr->{$url_attr});

    # textual attributes require some trickery - FIXME this duplicates some of
    # &print_tag
    if ($TEXT_ATTR{$tag}) {
        &flush_buf_text ();
        my $found = 0;

        # there's an example of how this works in the comments that precede the
        # declaration of @segments, above
        foreach my $text_attr (@{$TEXT_ATTR{$tag}}) {
            if ($attr->{$text_attr}) {
                push (@segments, ($found ? '"' : "<$tag") . " $text_attr=\"");
                push (@segments, [ $attr->{$text_attr}, '__NOPOPUP__' ]);
                delete $attr->{$text_attr};
                $found = 1;
            }
        }

        if ($found) {
            my $self_close = delete $attr->{'/'} ? 1 : 0;
            push (@segments, "\"" . join ('', map {
                (my $v = $attr->{$_}) =~ s/\"/&\#34;/g;
                " $_=\"$v\"";
            } keys %{$attr}) . ($self_close ? ' /' : '') . '>');
        } else {
            push (@segments, &print_tag ($tag, $attr, $closing));
        }

    # if the tag is soft we buffer it, if it's hard we flush the buffer out
    } elsif ($SOFT_TAGS{$tag}) {
        my $placeholder = &make_placeholder ($tag, $attr, $closing);
        $buf_text .= ' ' . $placeholder . ' ';
    } else {
        &flush_buf_text ();
        push (@segments, &print_tag ($tag, $attr, $closing));
    }

    # add a <base> tag at the beginning of the <head> (do we need this?)
    push (@segments, "<base src='$base_url'>\n")
        if ($tag eq 'head' && !$closing);
}


# parser callback for text segments
sub text_h {
    my ($text) = @_;

    if ($in_verbatim) {
        # when in verbatim mode (in <script> or <style> tags), everything just
        # gets reprinted as-is

        # .. except this
        $text =~ s/\@import\s+\"([^\n\"]+)\"/
            '@import "' . URI->new_abs($1, $base_url)->as_string . '"';
        /ge;

        push (@segments, $text);

    } else {
        # otherwise add the text to the sentence buffer
        $buf_text .= $text;
        $buf_text_has_content ||= ($text =~ /\p{IsAlnum}/);
    }
}

sub rest_h {
    my ($text) = @_;
    &flush_buf_text ();
    push (@segments, $text);
}


my $parser = HTML::Parser->new (%{{
    start_h       => [\&start_and_end_h, 'tagname, attr' ],
    text_h        => [\&text_h,          'text'          ],
    declaration_h => [\&rest_h,          'text'          ],
    comment_h     => [\&rest_h,          'text'          ],

    end_h => [sub {
        &start_and_end_h (shift, {}, 1);
    }, 'tagname' ],
}});

# parse it into @segments
$parser->parse ($html);
undef $parser;

#------------------------------------------------------------------------------
# Run translation threads

# We have now parsed the who document to the @segments array. Now we start
# the actual translation process.
# 
# We start one thread for each Moses host defined in the configuration above.
# All threads will then race to translate text segments, working down the
# @segments array. They also print segments as soon as a sequence of segments
# is done.


# These are the variables that are shared between threads and used for
# synchronisation.

my @input  :shared = map { ref $_ ? $_->[0] : undef } @segments;
my @output :shared = map { ref $_ ? undef : $_ } @segments;
my $next_job_i :shared = 0;
my $num_printed :shared = 0;


# This sub will be run in parallel by the threads
my $thread_body = sub {
    my ($moses_i) = @_;

    # each thread uses it's own tokenizer and detokenizer subprocess
    # (FIXME -- isn't this hugely inefficient?)
    my $tokenizer   = new Subprocess (@TOKENIZER_CMD);
    my $detokenizer = new Subprocess (@DETOKENIZER_CMD);
    $tokenizer->start;
    $detokenizer->start;

    # each thread also connects to its own Moses server
    my ($host, $port) = split /:/, $MOSES_ADDRESSES[$moses_i];
    my $moses = new RemoteProcess ($host, $port) ||
        die "Can't connect to '$host:$port'";
    $moses->start;

    for (;;) {

        # Snatch the next unassigned job from the queue
        my $job_i;
        { lock $next_job_i; $job_i = $next_job_i++; }
        last if ($job_i > $#input);

        # If it's a text job, translate it, otherwise just don't do anything
        $output[$job_i] = &translate_text_with_placeholders
            ($input[$job_i], $moses, $tokenizer, $detokenizer)
            if (!defined $output[$job_i]);

        # Print out any sequential block of done jobs
        lock $num_printed;
        while ($num_printed < @input && defined $output[$num_printed]) {
            my $print;

            if (ref $segments[$num_printed]) {

                # replace placeholders by the original tags
                my @buf_tag_index = @{$segments[$num_printed]};
                shift @buf_tag_index;
                $print = &replace_placeholders_by_tags
                    ($output[$num_printed], @buf_tag_index);

                # wrap in code to popup the original text onmouseover
                if ($buf_tag_index[0] ne '__NOPOPUP__') {
                    $print = &add_original_text_popup
                        ($input[$num_printed], $print);
                } else {
                    $print =~ s/\"/&\#34;/g;
                }

            } else {
                # HTML segments are just printed as-is
                $print = $segments[$num_printed];
            }

            print encode ('UTF-8', $print);
            $num_printed++;
        }
    }
};

if (@MOSES_ADDRESSES == 1) {

    # If there's only one instance of Moses, there's no point in forking a
    # single thread and waiting for it to complete, so we just run the thread
    # code directly in the main thread
    $thread_body->(0);

} else {

    # Start all threads and wait for them all to finish
    my @threads = map {
        threads->create ($thread_body, $_);
    } (0 .. $#MOSES_ADDRESSES);
    $_->join foreach @threads;

}

#------------------------------------------------------------------------------
# Translation subs


# This sub is called bt the translation thread for each text segment. The
# arguments are the input text and pointers to the various external processes
# needed for processing.
# 
# At this stage the input text contains placeholders that look like
# "MOSESOPENTAG2". We don't need to know which tag they stand for, but we do
# need to set them aside, translate the remaining plain text, and reinsert them
# at the correct place in the translation.

sub translate_text_with_placeholders {
    my ($input_text, $moses, $tokenizer, $detokenizer) = @_;
    my $traced_text = '';

    # Start by tokenizing the text, with placeholders still in it. The
    # placeholders are designed to be interpreted as individual tokens by the
    # tokenizer.
    my @tokens = split /\s+/, $tokenizer->do_line ($input_text);

    # remove placeholders, and for each remaining token, make a list of the
    # tags that cover it
    @tokens = ('START', @tokens, 'END');
    my @tags_over_token = &_extract_placeholders (\@tokens);
    @tokens = @tokens[1 .. $#tokens-1];

    # translate sentence by sentence
    my $token_base_i = 0;
    while (@tokens > 0) {

        # take a string of tokens up to the next sentence-ending token
        my (@s_tokens, $split_token);
        while (@tokens > 0) {
            if ($tokens[0] =~ $RE_EOS_TOKEN) {
                push (@s_tokens, shift @tokens);
                last;
            } elsif ($tokens[0] =~ $RE_SPLIT_TOKEN) {
                $split_token = shift @tokens;
                last;
            } else {
                push (@s_tokens, shift @tokens);
            }
        }

        # Join together tokens into a plain text string. This is now ready to
        # be shipped to Moses: all tags and placeholders have been removed,
        # and it's a single sentence. We also lowercase as needed, and make
        # a note of whether we did.
        my $s_input_text = join (' ', @s_tokens);
        my $was_ucfirst =
            ($s_input_text =~ s/^(\p{IsUpper})(?=\p{IsLower})/lc $1;/e);
        my $was_allcaps =
            ($s_input_text =~ s/^([\p{IsUpper}\P{IsAlpha}]+)$/lc $1;/e);

        # Translate the plain text sentence
        # my $s_traced_text = &_translate_text_pig_latin ($s_input_text);
        my $s_traced_text = &_translate_text_moses ($s_input_text, $moses);

        # Early post-translation formatting fixes
        $s_traced_text .= " $split_token" if $split_token;
        $s_traced_text = ucfirst $s_traced_text if $was_ucfirst;
        $s_traced_text = uc      $s_traced_text if $was_allcaps;

        # Update trace numbers to fit in the Grand Scheme of Things
        $s_traced_text =~ s{\s*\|(\d+)-(\d+)\|}{
            ' |' . ($1+$token_base_i) . '-' . ($2+$token_base_i) . '| ';
        }ge;
        $token_base_i += @s_tokens + ($split_token ? 1 : 0);

        $traced_text .= $s_traced_text . ' ';
    }

    # Apply to every segment in the traced output the union of all tags
    # that covered tokens in the corresponding source segment
    my $output_text = &_reinsert_placeholders
        ($traced_text, @tags_over_token);

    # Try to remove spaces inserted by the tokenizer
    $output_text = $detokenizer->do_line ($output_text);

    return $output_text;
}


# This sub takes an array of tokens, some of which are placeholders for
# formatting tags. Some of these tag placeholders are for opening tags, some
# are for closing tags. What we do here is we remove all these placeholders
# from the list and create an index of which of the remaining tokens are
# covered by which tags (by which we mean, inside their scope).
# 
# So for instance if the given array looks like this:
# 
#     [ "MOSESOPENTAG0", "MOSESOPENTAG1", "Hello", "MOSESCLOSETAG1",
#       "MOSESOPENTAG2", "world", "MOSESCLOSETAG2", "MOSESCLOSETAG0" ]
# 
# after executing this sub the array will look like this:
# 
#     [ "Hello", "world" ]
# 
# and the @tags_over_token index will have been created, containing this:
# 
#     [ [0,1], [0,2] ]
# 
# indicating that the first token ("Hello") is covered by tags 0 and 1, and
# that the 2nd token ("world") is covered by tags 0 and 2.

sub _extract_placeholders {
    my ($tokens) = @_;
    my @tags_over_token = ([]);

    while (@tags_over_token <= @$tokens) {
        my $i = $#tags_over_token;
        my @t = @{$tags_over_token[$i]};

        if ($tokens->[$i] =~ /^MOSESOPENTAG(\d+)$/) {
            $tags_over_token[$i] = [@t, $1];
            splice (@{$tokens}, $i, 1);
        } elsif ($tokens->[$i] =~ /^MOSESCLOSETAG(\d+)$/) {
            if (grep $_ == $1, @t) {
                $tags_over_token[$i] = [grep $_ != $1, @t];
            } else {
                push (@{$tags_over_token[$_]}, $1) foreach (0 .. $i-1);
            }
            splice (@{$tokens}, $i, 1);
        } else {
            push (@tags_over_token, [@t]);
        }
    }

    return @tags_over_token;
}


# This sub does pretty much the opposite of the preceding sub. It gets as
# argument the traced text output by Moses and the @tags_over_token array
# computed by the preceding sub. The traced text looks something like this:
# 
#   Hallo |0-0| Welt |1-1|
# 
# For each such segment which is between two traces, we will want to apply
# to it the union of all tags that were over the corresponding source text.
# 
# This sub does that, and returns the string, minus traces, plus reinserted
# placeholders.

sub _reinsert_placeholders {
    my ($traced_text, @tags_over_token) = @_;

    my %cur_open_tags = map {$_ => 1} @{$tags_over_token[0]};
    my $output_text = '';

    while ($traced_text =~ s/^(.+?)\s*\|(\d+)-+(\d+)\|\s*//) {
        my ($segment, $from, $to) = ($1, $2+1, $3+1);

        # list all tags that cover the source segment
        my %segment_tags = map {$_ => 1} map {
            @{$tags_over_token[$_]};
        } ($from .. $to);

        $output_text .= " MOSESCLOSETAG$_ "
            foreach (grep !$segment_tags{$_}, keys %cur_open_tags);
        $output_text .= " MOSESOPENTAG$_ "
            foreach (grep !$cur_open_tags{$_}, keys %segment_tags);
        %cur_open_tags = %segment_tags;

        $output_text .= " $segment ";
    }

    my %final_tags = map {$_ => 1} @{$tags_over_token[-1]};
    $output_text .= " MOSESCLOSETAG$_ "
        foreach (grep !$final_tags{$_}, keys %cur_open_tags);
    $output_text .= " MOSESOPENTAG$_ "
        foreach (grep !$cur_open_tags{$_}, keys %final_tags);

    $output_text .= $traced_text;
    return $output_text;
}


# Finally this one replaces the placeholders by the actual tags.

sub replace_placeholders_by_tags {
    my ($buf_text, @buf_tag_index) = @_;

    # replace the placeholders by the original tags
    $buf_text =~ s{MOSES(OPEN|CLOSE)TAG(\d+)}{
        &print_tag (@{$buf_tag_index[$2]}, $1 eq 'CLOSE');
    }ge;

    return $buf_text;
}

#------------------------------------------------------------------------------
# Interfaces to actual plain-text translators. These take a plain string and
# return a traced (Moses-style) translation


# This sub is used when you want to debug everything in this script except the
# actual translation. Translates to Pig Latin.

sub _translate_text_pig_latin {
    my ($text) = @_;

    $text =~ s/\b([bcdfhj-np-tv-z]+)([a-z]+)/
        ($1 eq ucfirst $1 ? ucfirst $2 : $2) .
        ($2 eq lc $2 ? lc $1 : $1) .
        'ay';
    /gei;

    # insert fake traces
    my $i = -1;
    $text .= ' ';
    $text =~ s/\s+/$i++; " |$i-$i| "/ge;

    return $text;
}


# This one, given a handle to a Moses subprocess, will use that to translate
# the text. Not much to see here actually.

sub _translate_text_moses {
    my ($text, $moses) = @_;

    my $traced_text = $moses->do_line ($text);
    unless ($traced_text) {
        my @tokens = split /\s+/, $text;

        # insert a fake trace if for some reason moses didn't return one
        # (which most likely indicates something is quite wrong)
        $traced_text = $text . " |0-$#tokens|";
    }

    return $traced_text;
}

#------------------------------------------------------------------------------
# basic HTML manipulation subs

sub make_link_absolute {
    my ($tag_name, $attr_hash, $attr_name) = @_;

    # make it absolute
    $attr_hash->{$attr_name} = URI->new_abs
        ($attr_hash->{$attr_name}, $base_url)->as_string;

    # make it point back to us if it's a link
    if ($tag_name eq 'a') {
        $attr_hash->{$attr_name} = 'index.cgi?url=' .
            uri_escape ($attr_hash->{$attr_name});
        $attr_hash->{target} = '_top';
    }
}

sub print_tag {
    my ($tag_name, $attr_hash, $closing) = @_;
    my $self_close = $attr_hash->{'/'} ? 1 : 0;

    return '<' . ($closing ? '/' : '') . $tag_name .
        ($closing ? '' : join ('', map {
            my $v = $attr_hash->{$_};
            $v =~ s/\"/&\#34;/g;
            " $_=\"$v\"";
        } keys %{$attr_hash})) .
        ($self_close ? ' /' : '') . '>';
}

sub add_original_text_popup {
    my ($input_text, $output_html) = @_;

    $input_text =~ s/\"/&\#34;/g;
    $input_text =~ s/MOSES(?:OPEN|CLOSE)TAG\d+//g;
    $input_text =~ s/^\s+//;
    $input_text =~ s/\s+$//;
    $input_text =~ s/\s+/ /g;

    # Using this technique for displaying the source text pop-up means we don't
    # have to fiddle with JavaScript, but it also means you need the LongTitles
    # extension installed if using Firefox.. *I* happen to have it, so..
    return "<span title=\"$input_text\">$output_html</span>";
}

#------------------------------------------------------------------------------
# conclusion

# stop the top frame counter
my $num_sentences = grep ref $_, @segments;
print "<script> top.numSentences = $num_sentences </script>\n";

#------------------------------------------------------------------------------