Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorskyload <skyload@1f5c12ca-751b-0410-a591-d2e778427230>2010-04-21 16:07:21 +0400
committerskyload <skyload@1f5c12ca-751b-0410-a591-d2e778427230>2010-04-21 16:07:21 +0400
commitc009c843d5e0ac3a86ef0f0170d0401ef01fe5cc (patch)
tree881e28a3c0fcf6e280bfeea0f97b9f058f6187e3
parent6895b4fc3856950022d7995b0e6f5b0d58eeccfe (diff)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/DPR_MOSES@3167 1f5c12ca-751b-0410-a591-d2e778427230
-rw-r--r--web/bin/daemon.pl59
-rw-r--r--web/bin/detokenizer.perl112
-rw-r--r--web/bin/nonbreaking_prefixes/nonbreaking_prefix.de325
-rw-r--r--web/bin/nonbreaking_prefixes/nonbreaking_prefix.el2
-rw-r--r--web/bin/nonbreaking_prefixes/nonbreaking_prefix.en107
-rw-r--r--web/bin/start-daemon-cluster.pl35
-rw-r--r--web/bin/tokenizer.perl170
-rw-r--r--web/index.cgi103
-rw-r--r--web/index.js26
-rw-r--r--web/lib/RemoteProcess.pm51
-rw-r--r--web/lib/Subprocess.pm61
-rw-r--r--web/translate.cgi806
12 files changed, 1857 insertions, 0 deletions
diff --git a/web/bin/daemon.pl b/web/bin/daemon.pl
new file mode 100644
index 000000000..7172aa655
--- /dev/null
+++ b/web/bin/daemon.pl
@@ -0,0 +1,59 @@
+#!/usr/bin/perl -w
+use warnings;
+use strict;
+$|++;
+
+# file: daemon.pl
+
+# Herve Saint-Amand
+# Universitaet des Saarlandes
+# Tue May 13 19:45:31 2008
+
+# This script starts Moses to run in the background, so that it can be used by
+# the CGI script. It spawns the Moses process, then binds itself to listen on
+# some port, and when it gets a connection, reads it line by line, feeds those
+# to Moses, and sends back the translation.
+
+# You can either run one instance of this on your Web server, or, if you have
+# the hardware setup for it, run several instances of this, then configure
+# translate.cgi to connect to these.
+
+#------------------------------------------------------------------------------
+# includes
+
+use IO::Socket::INET;
+use IPC::Open2;
+
+#------------------------------------------------------------------------------
+# constants, global vars, config
+
+my $MOSES = '/local/herves/moses/moses-irst';
+my $MOSES_INI = '/local/herves/moses/fr-en/moses.ini.2';
+
+die "usage: daemon.pl <hostname> <port>" unless (@ARGV == 2);
+my $LISTEN_HOST = shift;
+my $LISTEN_PORT = shift;
+
+#------------------------------------------------------------------------------
+# main
+
+# spawn moses
+my ($MOSES_IN, $MOSES_OUT);
+my $pid = open2 ($MOSES_OUT, $MOSES_IN, $MOSES, '-f', $MOSES_INI, '-t');
+
+# open server socket
+my $server_sock = new IO::Socket::INET
+ (LocalAddr => $LISTEN_HOST, LocalPort => $LISTEN_PORT, Listen => 1)
+ || die "Can't bind server socket";
+
+while (my $client_sock = $server_sock->accept) {
+ while (my $line = <$client_sock>) {
+ print $MOSES_IN $line;
+ $MOSES_IN->flush ();
+ print $client_sock scalar <$MOSES_OUT>;
+ }
+
+ $client_sock->close ();
+}
+
+#------------------------------------------------------------------------------
diff --git a/web/bin/detokenizer.perl b/web/bin/detokenizer.perl
new file mode 100644
index 000000000..e784e761c
--- /dev/null
+++ b/web/bin/detokenizer.perl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl -w
+
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+# This added by Herve Saint-Amand for compatibility with translate.cgi
+$|++;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use strict;
+
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+
+while (@ARGV) {
+ $_ = shift;
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+}
+
+if ($HELP) {
+ print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+ exit;
+}
+if (!$QUIET) {
+ print STDERR "Detokenizer Version 1.0\n";
+ print STDERR "Language: $language\n";
+}
+
+while(<STDIN>) {
+ if (/^<.+>$/ || /^\s*$/) {
+ #don't try to detokenize XML/HTML tag lines
+ print $_;
+ }
+ else {
+ print &detokenize($_);
+ }
+}
+
+sub detokenize {
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+
+ my $word;
+ my $i;
+ my @words = split(/ /,$text);
+ $text = "";
+ my %quoteCount = ("\'"=>0,"\""=>0);
+ my $prependSpace = " ";
+ for ($i=0;$i<(scalar(@words));$i++) {
+ if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+ #perform right shift on currency and other random punctuation items
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+ #perform left shift on punctuation items
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+ #left-shift the contraction for English
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif (($language eq "fr") && ($i<(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+ #right-shift the contraction for French
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ } elsif ($words[$i] =~ /^[\'\"]+$/) {
+ #combine punctuation smartly
+ if (($quoteCount{$words[$i]} % 2) eq 0) {
+ if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
+ #single quote for posesssives ending in s... "The Jones' house"
+ #left shift
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } else {
+ #right shift
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+
+ }
+ } else {
+ #left shift
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+
+ }
+
+ } else {
+ $text=$text.$prependSpace.$words[$i];
+ $prependSpace = " ";
+ }
+ }
+
+ # clean up spaces at head and tail of each line as well as any double-spacing
+ $text =~ s/ +/ /g;
+ $text =~ s/\n /\n/g;
+ $text =~ s/ \n/\n/g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #add trailing break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
diff --git a/web/bin/nonbreaking_prefixes/nonbreaking_prefix.de b/web/bin/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 000000000..c24f2080f
--- /dev/null
+++ b/web/bin/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/web/bin/nonbreaking_prefixes/nonbreaking_prefix.el b/web/bin/nonbreaking_prefixes/nonbreaking_prefix.el
new file mode 100644
index 000000000..7bb3d490a
--- /dev/null
+++ b/web/bin/nonbreaking_prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
diff --git a/web/bin/nonbreaking_prefixes/nonbreaking_prefix.en b/web/bin/nonbreaking_prefixes/nonbreaking_prefix.en
new file mode 100644
index 000000000..7e7a8ce2e
--- /dev/null
+++ b/web/bin/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/web/bin/start-daemon-cluster.pl b/web/bin/start-daemon-cluster.pl
new file mode 100644
index 000000000..3bfe01927
--- /dev/null
+++ b/web/bin/start-daemon-cluster.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl -w
+use warnings;
+use strict;
+$|++;
+
+# file: start-daemon-cluster.pl
+
+# Herve Saint-Amand
+# Universitaet des Saarlandes
+# Thu May 15 08:22:13 2008
+
+# Utility to start/stop the daemon processes on the 16 cluster machines.
+# Config in here should match that given in translate.cgi (hostnames, ports)
+
+#------------------------------------------------------------------------------
+
+my $stop = @ARGV && $ARGV[0] eq '-s';
+
+foreach my $i (qw/01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16/) {
+ my $host = "cluster-$i";
+ my $port = "90$i";
+
+ my @cmd = $stop ?
+ ('ssh', $host, 'killall', '-q', 'daemon.pl')
+ :
+ ('ssh',
+ '-L', "$port:localhost:$port",
+ $host,
+ '/local/herves/moses/daemon.pl', 'localhost', $port);
+
+ print "@cmd\n";
+ exec @cmd unless fork ();
+}
+
+#------------------------------------------------------------------------------
diff --git a/web/bin/tokenizer.perl b/web/bin/tokenizer.perl
new file mode 100644
index 000000000..4e6565dff
--- /dev/null
+++ b/web/bin/tokenizer.perl
@@ -0,0 +1,170 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+# This added by Herve Saint-Amand for compatibility with translate.cgi
+$|++;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($Bin);
+use strict;
+#use Time::HiRes;
+
+my $mydir = "$Bin/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+
+#my $start = [ Time::HiRes::gettimeofday( ) ];
+
+while (@ARGV) {
+ $_ = shift;
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+}
+
+if ($HELP) {
+ print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
+ exit;
+}
+if (!$QUIET) {
+ print STDERR "Tokenizer v3\n";
+ print STDERR "Language: $language\n";
+}
+
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0){
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+while(<STDIN>) {
+ if (/^<.+>$/ || /^\s*$/) {
+ #don't try to tokenize XML/HTML tag lines
+ print $_;
+ }
+ else {
+ print &tokenize($_);
+ }
+}
+
+#my $duration = Time::HiRes::tv_interval( $start );
+#print STDERR ("EXECUTION TIME: ".$duration."\n");
+
+
+sub tokenize {
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+
+ # seperate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ #multi-dots stay together
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./) {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # seperate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn `into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en") {
+ #split contractions right
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+ } elsif (($language eq "fr") or ($language eq "it")) {
+ #split contractions left
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ } else {
+ $text =~ s/\'/ \' /g;
+ }
+
+ #word token method
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++) {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/) {
+ my $pre = $1;
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
+ #no change
+ } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
+ #no change
+ } else {
+ $word = $pre." .";
+ }
+ }
+ $text .= $word." ";
+ }
+
+ # clean up extraneous spaces
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #restore multi-dots
+ while($text =~ /DOTDOTMULTI/) {
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+ }
+ $text =~ s/DOTMULTI/./g;
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+sub load_prefixes {
+ my ($language, $PREFIX_REF) = @_;
+
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+ #default back to English if we don't have a language-specific prefix file
+ if (!(-e $prefixfile)) {
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+ }
+
+ if (-e "$prefixfile") {
+ open(PREFIX, "<:utf8", "$prefixfile");
+ while (<PREFIX>) {
+ my $item = $_;
+ chomp($item);
+ if (($item) && (substr($item,0,1) ne "#")) {
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
+ $PREFIX_REF->{$1} = 2;
+ } else {
+ $PREFIX_REF->{$item} = 1;
+ }
+ }
+ }
+ close(PREFIX);
+ }
+
+}
+
diff --git a/web/index.cgi b/web/index.cgi
new file mode 100644
index 000000000..dcc20b0aa
--- /dev/null
+++ b/web/index.cgi
@@ -0,0 +1,103 @@
+#!/usr/bin/perl -Tw
+use warnings;
+use strict;
+$|++;
+
+# file: index.cgi
+
+# Herve Saint-Amand
+# Universitaet des Saarlandes
+# Tue May 13 20:09:25 2008
+
+# When we do Moses translation of Web pages there's a little tool frame at the
+# top of the screen. This script takes care of printing the frameset and that
+# top frame. It also invokes the main script with the proper URL param.
+
+# You're most probably not interested in the code that's in here. Have a look
+# a translate.cgi instead. That's where the meat is.
+
+#------------------------------------------------------------------------------
+# includes
+
+use CGI;
+use CGI::Carp qw/fatalsToBrowser/;
+
+use URI::Escape;
+
+#------------------------------------------------------------------------------
+# constants, global vars
+
+(my $SELF_URL = $ENV{QUERY_STRING}) =~ s![^/]*$!!;
+
+my $TRANSLATE_CGI = 'translate.cgi';
+
+#------------------------------------------------------------------------------
+# read CGI params
+
+my %params = %{{
+ url => undef,
+ frame => undef,
+}};
+
+my $cgi = new CGI;
+
+foreach my $p (keys %params) {
+ $params{$p} = $cgi->param ($p)
+ if (defined $cgi->param ($p));
+}
+
+#------------------------------------------------------------------------------
+# print out
+
+print "Content-Type: text/html\n\n";
+
+print
+ "<html>\n" .
+ " <head>\n" .
+ " <title>$params{url} -- Moses translation</title>\n" .
+ " <style>\n" .
+ " p, a, b, body {\n" .
+ " font-family: verdana;\n" .
+ " font-size: 9pt;\n" .
+ " }\n" .
+ " </style>\n" .
+ " </head>\n";
+
+if (!$params{url}) {
+ print
+ " <body bgcolor='#ffFFfF'>\n" .
+ " <h1>Moses Web Interface</h1>\n" .
+ " <form method='GET' action='$SELF_URL'>\n" .
+ " <input name='url' size='60'>\n" .
+ " <input type='submit' value='Translate'>\n" .
+ " </form>\n" .
+ " </body>\n";
+
+} else {
+
+ # check that we have a URL and it's absolute
+ $params{url} = "http://$params{url}"
+ unless ($params{url} =~ m!^[a-z]+://!);
+ my $URL = uri_escape ($params{url});
+
+ if (!$params{frame}) {
+ print
+ " <frameset rows='30,*' border='1' frameborder='1'>\n" .
+ " <frame src='$SELF_URL?frame=top&url=$URL'>\n" .
+ " <frame src='$TRANSLATE_CGI?url=$URL'>\n" .
+ " </frameset>\n";
+
+ } else {
+ print
+ " <script src='index.js'></script>\n" .
+ " <body bgcolor='#ccCCcC' onload='startCount()'>\n" .
+ " <b>Moses translation of\n" .
+ " <a href='$params{url}' target='_top'>$params{url}</a></b>\n" .
+ " <span id='status'></span>\n" .
+ " </body>\n";
+ }
+}
+
+print "</html>\n";
+
+#------------------------------------------------------------------------------
diff --git a/web/index.js b/web/index.js
new file mode 100644
index 000000000..1d917ce73
--- /dev/null
+++ b/web/index.js
@@ -0,0 +1,26 @@
+// file: frameset.js
+
+// Herve Saint-Amand
+// Universitaet des Saarlandes
+// Tue May 13 10:00:42 2008
+
+//-----------------------------------------------------------------------------
+
+function startCount () {
+ var startTime = (new Date()).getTime ();
+ var element = document.getElementById ('status');
+
+ function step () {
+ var secs = parseInt (((new Date()).getTime() - startTime) / 1000);
+ var status = "(elapsed: " + secs + " seconds)";
+ if (top.numSentences != null)
+ status += " (" + top.numSentences + " segments)";
+ else
+ setTimeout (step, 1000);
+ element.innerHTML = status;
+ }
+
+ step ();
+}
+
+//-----------------------------------------------------------------------------
diff --git a/web/lib/RemoteProcess.pm b/web/lib/RemoteProcess.pm
new file mode 100644
index 000000000..1a414cd77
--- /dev/null
+++ b/web/lib/RemoteProcess.pm
@@ -0,0 +1,51 @@
+# file: RemoteProcess.pm
+
+# Herve Saint-Amand
+# Universitaet des Saarlandes
+# Thu May 15 08:30:19 2008
+
+#------------------------------------------------------------------------------
+# includes
+
+package RemoteProcess;
+our @ISA = qw/Subprocess/;
+
+use warnings;
+use strict;
+
+use IO::Socket::INET;
+
+use Subprocess;
+
+#------------------------------------------------------------------------------
+# constructor
+
+sub new {
+ my ($class, $host, $port) = @_;
+
+ my $self = new Subprocess;
+ $self->{host} = $host;
+ $self->{port} = $port;
+ $self->{sock} = undef;
+
+ bless $self, $class;
+}
+
+#------------------------------------------------------------------------------
+# should have the same interface as Subprocess.pm
+
+sub start {
+ my ($self) = @_;
+
+ $self->{sock} = new IO::Socket::INET (%{{
+ PeerAddr => $self->{host},
+ PeerPort => $self->{port},
+ }}) || die "Can't connect to $self->{host}:$self->{port}";
+
+ $self->{child_in} = $self->{child_out} = $self->{sock};
+}
+
+#------------------------------------------------------------------------------
+
+1;
+
diff --git a/web/lib/Subprocess.pm b/web/lib/Subprocess.pm
new file mode 100644
index 000000000..adc7a853a
--- /dev/null
+++ b/web/lib/Subprocess.pm
@@ -0,0 +1,61 @@
+# file: Subprocess.pm
+
+# Herve Saint-Amand
+# Universitaet des Saarlandes
+# Wed May 14 09:55:46 2008
+
+# NOTE that to use this with Philipp Koehn's tokenizer.perl I had to modify
+# that script to autoflush its streams, by adding a '$|++' to it
+
+#------------------------------------------------------------------------------
+# includes
+
+package Subprocess;
+
+use warnings;
+use strict;
+
+use Encode;
+use IPC::Open2;
+
+#------------------------------------------------------------------------------
+# constructor
+
+sub new {
+ my ($class, @cmd) = @_;
+ bless {
+ cmd => \@cmd,
+ num_done => 0,
+ child_in => undef,
+ child_out => undef,
+ }, $class;
+}
+
+#------------------------------------------------------------------------------
+
+sub start {
+ my ($self) = @_;
+ open2 ($self->{child_out}, $self->{child_in}, @{$self->{cmd}});
+}
+
+sub do_line {
+ my ($self, $line) = @_;
+ my ($in, $out) = ($self->{child_in}, $self->{child_out});
+
+ $line =~ s/\s+/ /g;
+ print $in encode ('UTF-8', $line), "\n";
+ $in->flush ();
+
+ my $ret = decode ('UTF-8', scalar <$out>);
+ chomp $ret;
+
+ $self->{num_done}++;
+ return $ret;
+}
+
+sub num_done { shift->{num_done} }
+
+#------------------------------------------------------------------------------
+
+1;
+
diff --git a/web/translate.cgi b/web/translate.cgi
new file mode 100644
index 000000000..ece7109fc
--- /dev/null
+++ b/web/translate.cgi
@@ -0,0 +1,806 @@
+#!/usr/bin/perl -Tw
+use warnings;
+use strict;
+$|++;
+
+# file: translate.cgi
+
+# Herve Saint-Amand
+# saintamh [o] yahoo, com
+# Universitaet des Saarlandes
+# Mon May 12 14:10:54 2008
+
+# This CGI script takes a web page URL as a parameter, fetches that page,
+# translates it using the Moses decoder, and displays the translated version
+# to the user, similarly to how Google or BabelFish translate web pages.
+
+# I don't think I've ever written anything with such a high comment/code ratio,
+# so hopefully it should be understandable. Just read top to bottom.
+
+
+# TODO:
+#
+# - if the document contains <a name='anchor'></a> it will be lost
+# - don't insert spaces everywhere around soft tags
+# - charset autodetection would be nice, but it's not trivial
+
+#------------------------------------------------------------------------------
+# includes
+
+use CGI;
+use CGI::Carp qw/fatalsToBrowser/;
+
+# we use the 2nd perl thread API. I think this means you need perl 5.6 or
+# higher, compiled with thread support
+use threads;
+use threads::shared;
+
+use Encode;
+use HTML::Entities;
+use HTML::Parser;
+use LWP::UserAgent;
+use URI;
+use URI::Escape;
+
+use lib 'lib';
+use RemoteProcess;
+use Subprocess;
+
+#------------------------------------------------------------------------------
+# constants, config
+
+# In order to run this script, you must first start Moses as a sort of daemon
+# process that accepts connections on some INET port, reads the sentences sent
+# to it one line at a time and returns translations. The daemon.pl script that
+# comes with this script does just that -- starts an instance of Moses and
+# 'plugs' it to the net so it can be used from other machines or just other
+# processes on the same machine.
+#
+# This list here indicates where to find these instances of Moses. May be
+# localhost, or may be separate machines.
+#
+# On the current UniSaar setup we use SSH tunneling to connect to other hosts,
+# so from this script's POV they're all localhost. These ports are actually
+# forwarded to other machines. There wouldn't be much point in running 16
+# instances of Moses on the same machine.
+
+my @MOSES_ADDRESSES = map "localhost:90$_",
+    qw/01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16/;
+
+
+# The tokenizer tries to adapt its rules depending on the language it's dealing
+# with, so we indicate that here.
+
+my $INPUT_LANG = 'fr';
+my $OUTPUT_LANG = 'en';
+
+
+# In order to tokenize and detokenize strings in a way that stays consistent
+# with how it is done in the rest of the Moses system, we use the scripts that
+# come with Moses as external processes. These are the commands we must run to
+# start them.
+
+my @TOKENIZER_CMD = ('./bin/tokenizer.perl', '-l', $INPUT_LANG);
+my @DETOKENIZER_CMD = ('./bin/detokenizer.perl', '-l', $OUTPUT_LANG);
+
+
+# We call 'soft tags' HTML tags whose presence is tolerated inside
+# sentences. All other tags are assumed to be sentence-breakers and will be
+# used to chop up documents into independent sentences. These few, however, are
+# allowed within sentences.
+
+my %SOFT_TAGS = map {$_ => 1} qw/a b i u em font blink tt acronym/;
+
+
+# We call 'verbatim tags' HTML tags whose entire data is to be left untouched
+# and reprinted as-is. These also happen to be tags whose content is typically
+# not printed by the browser.
+
+my %VERBATIM_TAGS = map {$_ => 1} qw/script style/;
+
+
+# Some HTML tags have attributes that contain URLs. Since we'll be displaying
+# the page on another server than its usual source server, relative paths will
+# be broken, so we need to make all URLs absolute. These are the attributes
+# that will be so modified.
+
+my %URL_ATTRS = %{{
+    a => 'href',
+    img => 'src',
+    form => 'action',
+    link => 'href',
+    script => 'src',
+}};
+
+
+# Some HTML tags have attributes that can contain free text that is displayed
+# to the user. Data in attributes is not usually translated, but these should
+# be.
+#
+# Note that for implementation reasons these will always be treated as hard,
+# sentence-splitting tags. This could be changed but would require a
+# substantial re-write of this script.
+
+my %TEXT_ATTR = %{{ input => [qw/value/], img => [qw/alt title/], }};
+
+
+# Sentence splitting within a paragraph or block of text is done after
+# tokenizing. Tokens matched by this regex will be considered to end a
+# sentence, and hence be used in splitting the text into sentences.
+
+my $RE_EOS_TOKEN = qr/^(?:\.+|[\?!:;])$/;
+
+# This regex also matches sentence-ending tokens, but tokens matched by this
+# one will not be included in the sentence itself. Tokens matched by the
+# previous regex will be sent to Moses as part of the end of the sentence.
+# Tokens matched by this one will never be sent to Moses. Which is why the pipe
+# symbol, which Moses doesn't seem to like, must be in here.
+
+my $RE_SPLIT_TOKEN = qr!^[\|\-]+$!;
+
+#------------------------------------------------------------------------------
+# global vars
+
+# In cleaner code there wouldn't be global variables, but it simplified things
+# to put these here. Eventually I wouldn't mind removing this section.
+
+
+# This array is very central to the way this script works. The document will be
+# chopped up into a list of 'segments'. Each segment is either some HTML code
+# and whitespace which we don't translate or manipulate in any way, or a bit of
+# text to be translated. It's as if we highlighted in the HTML source the bits
+# of text that needed translation, and make each stripe of highlighter, and
+# each length of text between them, a segment.
+#
+# Segments that are untouched HTML are simply strings. If the whole document
+# contained no translatable text, this array would only contain strings.
+#
+# Segments that contain text to be translated are represented as arrayrefs. The
+# first element of that arrayref is the text to be translated, with any soft
+# tags within it replaced by placeholders of the type MOSESOPENTAG4. The
+# remaining elements contain the necessary info to reinsert these tags. The
+# placeholders are numbered, and the i-th placeholder corresponds to the
+# (i+1)-th element in the arrayref (element 0 being the text). That element is
+# itself an array ref, whose first element is the tag name and second element
+# is a hashref of attributes.
+#
+# So this document:
+#
+# <p>This is <a href="somewhere">a link</a> but it's not <b>bold</b></p>
+#
+# would be represented by this @segments array:
+#
+# 0: "<p>"
+# 1: [ 0: "This is MOSESOPENTAG0 a link MOSESCLOSETAG0 but it's not" .
+# " MOSESOPENTAG1 bold MOSESCLOSETAG1"
+# 1: [ "a", { href => "somewhere" } ]
+# 2: [ "b", {} ] ]
+# 2: "</p>"
+#
+# Finally, there's one hack to be mentioned: text in %TEXT_ATTR attributes
+# (defined above) also goes into a segment of its own. Since this text does
+# not contain tags, and to signal that the code for the popup containing
+# source text should not be inserted around this text, we replace the tag
+# information by the "__NOPOPUP__" string. So this document:
+#
+# <img src="blah" alt="This describes the image">
+#
+# would correspond to this @segments array:
+#
+# 0: "<img src=\"blah\" alt=\""
+# 1: [ "This describes the image", "__NOPOPUP__" ]
+# 2: "\">"
+#
+# This is a horrible hack. Yes.
+
+# The parsed document, as described in the long comment above: plain strings
+# are untranslated HTML, arrayrefs are text-to-translate segments.
+my @segments;
+
+
+# Finally, since this script is run in 'tainted' mode (-T switch) for basic
+# security reasons, and since we'll be launching subprocesses, we need to make
+# sure the PATH is clean otherwise Perl will refuse to do the system() calls.
+
+$ENV{PATH} = '';
+
+#------------------------------------------------------------------------------
+# Fetch the source page
+
+# get value of URL param, make sure it's absolute
+# NOTE(review): $url is fetched with no host/scheme restriction -- worth
+# confirming whether this open-proxy behaviour is intended.
+my $url = CGI->new->param ('url');
+die "No URL?" unless $url;
+$url = "http://$url" unless ($url =~ m!^[a-z]+://!);
+
+# configure Web client
+my $lwp = new LWP::UserAgent (%{{
+    agent => $ENV{HTTP_USER_AGENT} || 'Mozilla/5.0',
+    timeout => 5,
+}});
+
+# fetch the web page we want to translate
+my $res = $lwp->get ($url);
+die "Couldn't fetch page: " . $res->status_line unless $res->is_success;
+my $html = $res->decoded_content;
+
+# Find the page's base url. It may be different than the URL given to us as
+# parameter if for instance that URL redirects to a different one, or the
+# document contains a <base> tag.
+my $base_url = $res->base;
+
+# Decode entities, except some basics because it confuses our parsing. We need
+# this because Moses won't understand the entities. It sometimes introduces
+# minor display bugs, though. TODO: decode only alphanumerical entities?
+# (The first substitution protects &lt;/&gt; by double-escaping their
+# ampersand so decode_entities leaves them encoded.)
+$html =~ s/&((?:lt|gt);?)/&amp;$1/g;
+$html = decode_entities ($html);
+
+# Start printing HTML page
+print "Content-Type: text/html; charset=UTF-8\n\n";
+
+#------------------------------------------------------------------------------
+# Parser stack and state management
+
+# We're going to use a callback parser to parse the HTML file. As we walk the
+# HTML tree we maintain a buffer containing the current block of text to be
+# translated. These state variables contain that. The buffer is repeatedly
+# emptied and its contents pushed onto @segments.
+#
+# We also remove 'soft' tags from the text as we append it to the buffer,
+# replace them with placeholders, and save info about the tags we set aside in
+# @buf_tag_index. @buf_tag_stack keeps track of 'currently open' tags, so that
+# we can match closing tags to their opening tags.
+
+my $buf_text_has_content = 0;  # true once the buffer holds alphanumeric text
+my $buf_text = '';             # text of the current segment being accumulated
+my @buf_tag_index;             # [tag_name, attr_hashref] per placeholder number
+my @buf_tag_stack;             # currently-open soft tags: [tag_name, index]
+
+my $in_verbatim = 0;           # tag name while inside <script>/<style>, else 0
+
+
+# This is called when we find soft tags within text to be translated. Arguments
+# are the tag name, a hash of tag attributes, and a boolean telling us whether
+# it's an opening or closing tag.
+#
+# We perform lookups in the above state variables, save the tag info in them if
+# necessary, and return a string which is the placeholder to replace that tag.
+
+# make_placeholder($tag, \%attr, $closing)
+# Registers a soft tag in @buf_tag_index/@buf_tag_stack and returns the
+# placeholder token (MOSESOPENTAGn / MOSESCLOSETAGn) that stands in for it.
+sub make_placeholder {
+    my ($tag, $attr, $closing) = @_;
+    my $placeholder = '';
+
+    if ($closing) {
+
+        # try to match closing tags with their opening sibling
+        # (scan the stack top-down so the innermost open tag wins)
+        foreach my $i (reverse 0 .. $#buf_tag_stack) {
+            if ($buf_tag_stack[$i][0] eq $tag) {
+                $placeholder = 'MOSESCLOSETAG' . $buf_tag_stack[$i][1];
+                splice (@buf_tag_stack, $i, 1);
+                last;
+            }
+        }
+
+        # lone closing tags are added to the index but not the stack
+        if (!$placeholder) {
+            push (@buf_tag_index, [ $tag, $attr ]);
+            $placeholder = 'MOSESCLOSETAG' . $#buf_tag_index;
+        }
+
+    } else {
+        # opening tags are added to the index and the stack
+        push (@buf_tag_index, [ $tag, $attr ]);
+        push (@buf_tag_stack, [ $tag, $#buf_tag_index ]);
+        $placeholder = 'MOSESOPENTAG' . $#buf_tag_index;
+    }
+
+    return $placeholder;
+}
+
+
+# When we hit a hard tag, we call this to save any current text segment we have
+# to the @segments array.
+
+# Push the accumulated buffer onto @segments and reset all buffer state.
+# The segment becomes an arrayref (translatable) when it has real content or
+# placeholders, otherwise a plain string (passed through untouched).
+sub flush_buf_text {
+    if ($buf_text_has_content || @buf_tag_index) {
+        push (@segments, [ $buf_text, @buf_tag_index ] );
+    } else {
+        push (@segments, $buf_text);
+    }
+
+    $buf_text = '';
+    @buf_tag_index = ();
+    @buf_tag_stack = ();
+    $buf_text_has_content = 0;
+}
+
+#------------------------------------------------------------------------------
+# HTML parser
+
+# Parser callback for when we hit an opening or closing tag
+# HTML::Parser callback for both opening and closing tags ($closing selects
+# which).  Routes each tag: verbatim-mode toggles, URL rewriting, translatable
+# text attributes, soft-tag placeholders, or a plain buffer flush.
+sub start_and_end_h {
+    my ($tag, $attr, $closing) = @_;
+
+    # keep track of whether we're in a verbatim segment
+    $in_verbatim = $closing ? 0 : $tag
+        if $VERBATIM_TAGS{$tag};
+
+    # make links absolute
+    my $url_attr = $URL_ATTRS{$tag};
+    &make_link_absolute ($tag, $attr, $url_attr)
+        if ($url_attr && $attr->{$url_attr});
+
+    # textual attributes require some trickery - FIXME this duplicates some of
+    # &print_tag
+    if ($TEXT_ATTR{$tag}) {
+        &flush_buf_text ();
+        my $found = 0;
+
+        # there's an example of how this works in the comments that precede the
+        # declaration of @segments, above
+        foreach my $text_attr (@{$TEXT_ATTR{$tag}}) {
+            if ($attr->{$text_attr}) {
+                push (@segments, ($found ? '"' : "<$tag") . " $text_attr=\"");
+                push (@segments, [ $attr->{$text_attr}, '__NOPOPUP__' ]);
+                delete $attr->{$text_attr};
+                $found = 1;
+            }
+        }
+
+        if ($found) {
+            # re-emit the remaining (non-text) attributes by hand
+            my $self_close = delete $attr->{'/'} ? 1 : 0;
+            push (@segments, "\"" . join ('', map {
+                (my $v = $attr->{$_}) =~ s/\"/&\#34;/g;
+                " $_=\"$v\"";
+            } keys %{$attr}) . ($self_close ? ' /' : '') . '>');
+        } else {
+            push (@segments, &print_tag ($tag, $attr, $closing));
+        }
+
+    # if the tag is soft we buffer it, if it's hard we flush the buffer out
+    } elsif ($SOFT_TAGS{$tag}) {
+        my $placeholder = &make_placeholder ($tag, $attr, $closing);
+        $buf_text .= ' ' . $placeholder . ' ';
+    } else {
+        &flush_buf_text ();
+        push (@segments, &print_tag ($tag, $attr, $closing));
+    }
+
+    # add a <base> tag at the beginning of the <head> (do we need this?)
+    # NOTE(review): <base> takes an 'href' attribute, not 'src' -- as written
+    # this tag is probably ignored by browsers; confirm intent.
+    push (@segments, "<base src='$base_url'>\n")
+        if ($tag eq 'head' && !$closing);
+}
+
+
+# parser callback for text segments
+# HTML::Parser callback for text nodes: pass through verbatim content
+# (rewriting CSS @import URLs to absolute), otherwise accumulate into the
+# current translation buffer.
+sub text_h {
+    my ($text) = @_;
+
+    if ($in_verbatim) {
+        # when in verbatim mode (in <script> or <style> tags), everything just
+        # gets reprinted as-is
+
+        # .. except this
+        $text =~ s/\@import\s+\"([^\n\"]+)\"/
+            '@import "' . URI->new_abs($1, $base_url)->as_string . '"';
+        /ge;
+
+        push (@segments, $text);
+
+    } else {
+        # otherwise add the text to the sentence buffer
+        # (only alphanumeric content marks the buffer as worth translating)
+        $buf_text .= $text;
+        $buf_text_has_content ||= ($text =~ /\p{IsAlnum}/);
+    }
+}
+
+# HTML::Parser callback for declarations and comments: flush any pending
+# buffered text, then emit the markup untouched.
+sub rest_h {
+    my ($text) = @_;
+    &flush_buf_text ();
+    push (@segments, $text);
+}
+
+
+# Wire the callbacks into an event-driven HTML::Parser.  The string after
+# each handler is its HTML::Parser "argspec" (which values get passed in).
+my $parser = HTML::Parser->new (%{{
+    start_h => [\&start_and_end_h, 'tagname, attr' ],
+    text_h => [\&text_h, 'text' ],
+    declaration_h => [\&rest_h, 'text' ],
+    comment_h => [\&rest_h, 'text' ],
+
+    # end tags reuse the start handler with an empty attr hash + closing flag
+    end_h => [sub {
+        &start_and_end_h (shift, {}, 1);
+    }, 'tagname' ],
+}});
+
+# parse it into @segments
+# NOTE(review): there is no $parser->eof call -- HTML::Parser may hold back a
+# buffered tail of the document; confirm against the HTML::Parser docs.
+$parser->parse ($html);
+undef $parser;
+
+#------------------------------------------------------------------------------
+# Run translation threads
+
+# We have now parsed the whole document into the @segments array. Now we start
+# the actual translation process.
+#
+# We start one thread for each Moses host defined in the configuration above.
+# All threads will then race to translate text segments, working down the
+# @segments array. They also print segments as soon as a sequence of segments
+# is done.
+
+
+# These are the variables that are shared between threads and used for
+# synchronisation.
+
+# Shared work queues: @input holds the text of translatable segments (undef
+# for plain HTML), @output the finished result (pre-filled for HTML segments
+# so the workers skip them).
+my @input :shared = map { ref $_ ? $_->[0] : undef } @segments;
+my @output :shared = map { ref $_ ? undef : $_ } @segments;
+my $next_job_i :shared = 0;    # index of next unclaimed job
+my $num_printed :shared = 0;   # how many leading segments are already printed
+
+
+# This sub will be run in parallel by the threads
+my $thread_body = sub {
+    my ($moses_i) = @_;
+
+    # each thread uses its own tokenizer and detokenizer subprocess
+    # (FIXME -- isn't this hugely inefficient?)
+    my $tokenizer = new Subprocess (@TOKENIZER_CMD);
+    my $detokenizer = new Subprocess (@DETOKENIZER_CMD);
+    $tokenizer->start;
+    $detokenizer->start;
+
+    # each thread also connects to its own Moses server
+    # NOTE(review): connect failures are raised inside ->start (see
+    # RemoteProcess.pm), so this '|| die' on the constructor likely never fires.
+    my ($host, $port) = split /:/, $MOSES_ADDRESSES[$moses_i];
+    my $moses = new RemoteProcess ($host, $port) ||
+        die "Can't connect to '$host:$port'";
+    $moses->start;
+
+    for (;;) {
+
+        # Snatch the next unassigned job from the queue
+        { lock $next_job_i; $job_i = $next_job_i++; }
+        last if ($job_i > $#input);
+
+        # If it's a text job, translate it, otherwise just don't do anything
+        $output[$job_i] = &translate_text_with_placeholders
+            ($input[$job_i], $moses, $tokenizer, $detokenizer)
+            if (!defined $output[$job_i]);
+
+        # Print out any sequential block of done jobs
+        # (serialized by the lock so only one thread prints at a time)
+        lock $num_printed;
+        while ($num_printed < @input && defined $output[$num_printed]) {
+            my $print;
+
+            if (ref $segments[$num_printed]) {
+
+                # replace placeholders by the original tags
+                my @buf_tag_index = @{$segments[$num_printed]};
+                shift @buf_tag_index;
+                $print = &replace_placeholders_by_tags
+                    ($output[$num_printed], @buf_tag_index);
+
+                # wrap in code to popup the original text onmouseover
+                # NOTE(review): when a segment carried no tags @buf_tag_index
+                # is empty and this 'ne' compares undef, warning under -w.
+                if ($buf_tag_index[0] ne '__NOPOPUP__') {
+                    $print = &add_original_text_popup
+                        ($input[$num_printed], $print);
+                } else {
+                    $print =~ s/\"/&\#34;/g;
+                }
+
+            } else {
+                # HTML segments are just printed as-is
+                $print = $segments[$num_printed];
+            }
+
+            print encode ('UTF-8', $print);
+            $num_printed++;
+        }
+    }
+};
+
+# Run the worker code: inline for a single backend, one thread per Moses
+# instance otherwise.
+if (@MOSES_ADDRESSES == 1) {
+
+    # If there's only one instance of Moses, there's no point in forking a
+    # single thread and waiting for it to complete, so we just run the thread
+    # code directly in the main thread
+    $thread_body->(0);
+
+} else {
+
+    # Start all threads and wait for them all to finish
+    my @threads = map {
+        threads->create ($thread_body, $_);
+    } (0 .. $#MOSES_ADDRESSES);
+    $_->join foreach @threads;
+
+}
+
+#------------------------------------------------------------------------------
+# Translation subs
+
+
+# This sub is called by the translation threads for each text segment. The
+# arguments are the input text and pointers to the various external processes
+# needed for processing.
+#
+# At this stage the input text contains placeholders that look like
+# "MOSESOPENTAG2". We don't need to know which tag they stand for, but we do
+# need to set them aside, translate the remaining plain text, and reinsert them
+# at the correct place in the translation.
+
+# translate_text_with_placeholders($input_text, $moses, $tokenizer, $detokenizer)
+# Translate one text segment that may contain MOSESOPENTAGn/MOSESCLOSETAGn
+# placeholders: tokenize, strip placeholders (remembering tag coverage),
+# translate sentence by sentence, reinsert placeholders, detokenize.
+# Returns the translated text with placeholders restored.
+sub translate_text_with_placeholders {
+    my ($input_text, $moses, $tokenizer, $detokenizer) = @_;
+    my $traced_text = '';
+
+    # Start by tokenizing the text, with placeholders still in it. The
+    # placeholders are designed to be interpreted as individual tokens by the
+    # tokenizer.
+    my @tokens = split /\s+/, $tokenizer->do_line ($input_text);
+
+    # remove placeholders, and for each remaining token, make a list of the
+    # tags that cover it
+    # (START/END sentinels keep boundary placeholders attached correctly)
+    @tokens = ('START', @tokens, 'END');
+    my @tags_over_token = &_extract_placeholders (\@tokens);
+    @tokens = @tokens[1 .. $#tokens-1];
+
+    # translate sentence by sentence
+    my $token_base_i = 0;
+    while (@tokens > 0) {
+
+        # take a string of tokens up to the next sentence-ending token
+        my (@s_tokens, $split_token);
+        while (@tokens > 0) {
+            if ($tokens[0] =~ $RE_EOS_TOKEN) {
+                push (@s_tokens, shift @tokens);
+                last;
+            } elsif ($tokens[0] =~ $RE_SPLIT_TOKEN) {
+                $split_token = shift @tokens;
+                last;
+            } else {
+                push (@s_tokens, shift @tokens);
+            }
+        }
+
+        # Join together tokens into a plain text string. This is now ready to
+        # be shipped to Moses: all tags and placeholders have been removed,
+        # and it's a single sentence. We also lowercase as needed, and make
+        # a note of whether we did.
+        my $s_input_text = join (' ', @s_tokens);
+        my $was_ucfirst =
+            ($s_input_text =~ s/^(\p{IsUpper})(?=\p{IsLower})/lc $1;/e);
+        my $was_allcaps =
+            ($s_input_text =~ s/^([\p{IsUpper}\P{IsAlpha}]+)$/lc $1;/e);
+
+        # Translate the plain text sentence
+        # my $s_traced_text = &_translate_text_pig_latin ($s_input_text);
+        my $s_traced_text = &_translate_text_moses ($s_input_text, $moses);
+
+        # Early post-translation formatting fixes
+        $s_traced_text .= " $split_token" if $split_token;
+        $s_traced_text = ucfirst $s_traced_text if $was_ucfirst;
+        $s_traced_text = uc $s_traced_text if $was_allcaps;
+
+        # Update trace numbers to fit in the Grand Scheme of Things
+        # (traces are per-sentence; shift them to whole-segment token indices)
+        $s_traced_text =~ s{\s*\|(\d+)-(\d+)\|}{
+            ' |' . ($1+$token_base_i) . '-' . ($2+$token_base_i) . '| ';
+        }ge;
+        $token_base_i += @s_tokens + ($split_token ? 1 : 0);
+
+        $traced_text .= $s_traced_text . ' ';
+    }
+
+    # Apply to every segment in the traced output the union of all tags
+    # that covered tokens in the corresponding source segment
+    my $output_text = &_reinsert_placeholders
+        ($traced_text, @tags_over_token);
+
+    # Try to remove spaces inserted by the tokenizer
+    $output_text = $detokenizer->do_line ($output_text);
+
+    return $output_text;
+}
+
+
+# This sub takes an array of tokens, some of which are placeholders for
+# formatting tags. Some of these tag placeholders are for opening tags, some
+# are for closing tags. What we do here is we remove all these placeholders
+# from the list and create an index of which of the remaining tokens are
+# covered by which tags (by which we mean, inside their scope).
+#
+# So for instance if the given array looks like this:
+#
+# [ "MOSESOPENTAG0", "MOSESOPENTAG1", "Hello", "MOSESCLOSETAG1",
+# "MOSESOPENTAG2", "world", "MOSESCLOSETAG2", "MOSESCLOSETAG0" ]
+#
+# after executing this sub the array will look like this:
+#
+# [ "Hello", "world" ]
+#
+# and the @tags_over_token index will have been created, containing this:
+#
+# [ [0,1], [0,2] ]
+#
+# indicating that the first token ("Hello") is covered by tags 0 and 1, and
+# that the 2nd token ("world") is covered by tags 0 and 2.
+
+# _extract_placeholders(\@tokens)
+# Removes MOSESOPENTAGn/MOSESCLOSETAGn tokens from @$tokens in place and
+# returns @tags_over_token: for each surviving token, the list of tag numbers
+# whose scope covers it (see the worked example in the comment above).
+sub _extract_placeholders {
+    my ($tokens) = @_;
+    my @tags_over_token = ([]);
+
+    # single pass: $i is both the next token to inspect and the index of the
+    # coverage list currently being built
+    while (@tags_over_token <= @$tokens) {
+        my $i = $#tags_over_token;
+        my @t = @{$tags_over_token[$i]};
+
+        if ($tokens->[$i] =~ /^MOSESOPENTAG(\d+)$/) {
+            $tags_over_token[$i] = [@t, $1];
+            splice (@{$tokens}, $i, 1);
+        } elsif ($tokens->[$i] =~ /^MOSESCLOSETAG(\d+)$/) {
+            if (grep $_ == $1, @t) {
+                $tags_over_token[$i] = [grep $_ != $1, @t];
+            } else {
+                # lone closing tag: retroactively cover everything before it
+                push (@{$tags_over_token[$_]}, $1) foreach (0 .. $i-1);
+            }
+            splice (@{$tokens}, $i, 1);
+        } else {
+            push (@tags_over_token, [@t]);
+        }
+    }
+
+    return @tags_over_token;
+}
+
+
+# This sub does pretty much the opposite of the preceding sub. It gets as
+# argument the traced text output by Moses and the @tags_over_token array
+# computed by the preceding sub. The traced text looks something like this:
+#
+# Hallo |0-0| Welt |1-1|
+#
+# For each such segment which is between two traces, we will want to apply
+# to it the union of all tags that were over the corresponding source text.
+#
+# This sub does that, and returns the string, minus traces, plus reinserted
+# placeholders.
+
+# _reinsert_placeholders($traced_text, @tags_over_token)
+# Inverse of _extract_placeholders: walks the Moses trace markers |from-to|
+# and, for each traced output segment, emits MOSESOPENTAGn/MOSESCLOSETAGn
+# tokens so the segment is covered by the union of tags that covered its
+# source tokens.  Returns the text with traces removed, placeholders restored.
+sub _reinsert_placeholders {
+    my ($traced_text, @tags_over_token) = @_;
+
+    # element 0 is the START sentinel's coverage: tags open at segment start
+    my %cur_open_tags = map {$_ => 1} @{$tags_over_token[0]};
+    my $output_text = '';
+
+    while ($traced_text =~ s/^(.+?)\s*\|(\d+)-+(\d+)\|\s*//) {
+        # +1 skips the START sentinel in the coverage index
+        my ($segment, $from, $to) = ($1, $2+1, $3+1);
+
+        # list all tags that cover the source segment
+        my %segment_tags = map {$_ => 1} map {
+            @{$tags_over_token[$_]};
+        } ($from .. $to);
+
+        # close tags no longer needed, open newly required ones
+        $output_text .= " MOSESCLOSETAG$_ "
+            foreach (grep !$segment_tags{$_}, keys %cur_open_tags);
+        $output_text .= " MOSESOPENTAG$_ "
+            foreach (grep !$cur_open_tags{$_}, keys %segment_tags);
+        %cur_open_tags = %segment_tags;
+
+        $output_text .= " $segment ";
+    }
+
+    # reconcile with the END sentinel's coverage so every tag is balanced
+    my %final_tags = map {$_ => 1} @{$tags_over_token[-1]};
+    $output_text .= " MOSESCLOSETAG$_ "
+        foreach (grep !$final_tags{$_}, keys %cur_open_tags);
+    $output_text .= " MOSESOPENTAG$_ "
+        foreach (grep !$cur_open_tags{$_}, keys %final_tags);
+
+    $output_text .= $traced_text;
+    return $output_text;
+}
+
+
+# Finally this one replaces the placeholders by the actual tags.
+
+# replace_placeholders_by_tags($buf_text, @buf_tag_index)
+# Final step: substitute each MOSESOPENTAGn/MOSESCLOSETAGn placeholder with
+# the real HTML tag recorded at index n ([tag_name, attr_hashref]).
+sub replace_placeholders_by_tags {
+    my ($buf_text, @buf_tag_index) = @_;
+
+    # replace the placeholders by the original tags
+    $buf_text =~ s{MOSES(OPEN|CLOSE)TAG(\d+)}{
+        &print_tag (@{$buf_tag_index[$2]}, $1 eq 'CLOSE');
+    }ge;
+
+    return $buf_text;
+}
+
+#------------------------------------------------------------------------------
+# Interfaces to actual plain-text translators. These take a plain string and
+# return a traced (Moses-style) translation
+
+
+# This sub is used when you want to debug everything in this script except the
+# actual translation. Translates to Pig Latin.
+
+# Debug stand-in for Moses: "translates" to Pig Latin and appends fake
+# one-to-one |i-i| trace markers so the rest of the pipeline works unchanged.
+sub _translate_text_pig_latin {
+    my ($text) = @_;
+
+    # move the leading consonant cluster to the end + 'ay', preserving case
+    $text =~ s/\b([bcdfhj-np-tv-z]+)([a-z]+)/
+        ($1 eq ucfirst $1 ? ucfirst $2 : $2) .
+        ($2 eq lc $2 ? lc $1 : $1) .
+        'ay';
+    /gei;
+
+    # insert fake traces
+    my $i = -1;
+    $text .= ' ';
+    $text =~ s/\s+/$i++; " |$i-$i| "/ge;
+
+    return $text;
+}
+
+
+# This one, given a handle to a Moses subprocess, will use that to translate
+# the text. Not much to see here actually.
+
+# _translate_text_moses($text, $moses)
+# Sends one plain-text sentence to the Moses daemon handle and returns the
+# traced translation; synthesizes a whole-sentence trace if none came back.
+sub _translate_text_moses {
+    my ($text, $moses) = @_;
+
+    my $traced_text = $moses->do_line ($text);
+    unless ($traced_text) {
+        my @tokens = split /\s+/, $text;
+
+        # insert a fake trace if for some reason moses didn't return one
+        # (which most likely indicates something is quite wrong)
+        $traced_text = $text . " |0-$#tokens|";
+    }
+
+    return $traced_text;
+}
+
+#------------------------------------------------------------------------------
+# basic HTML manipulation subs
+
+# make_link_absolute($tag_name, \%attr, $attr_name)
+# Rewrites $attr->{$attr_name} in place: resolves it against $base_url, and
+# for <a> links additionally routes it back through index.cgi so clicked
+# links are translated too.
+sub make_link_absolute {
+    my ($tag_name, $attr_hash, $attr_name) = @_;
+
+    # make it absolute
+    $attr_hash->{$attr_name} = URI->new_abs
+        ($attr_hash->{$attr_name}, $base_url)->as_string;
+
+    # make it point back to us if it's a link
+    if ($tag_name eq 'a') {
+        $attr_hash->{$attr_name} = 'index.cgi?url=' .
+            uri_escape ($attr_hash->{$attr_name});
+        # escape the frameset so the whole UI reloads
+        $attr_hash->{target} = '_top';
+    }
+}
+
+# print_tag($tag_name, \%attr, $closing)
+# Serializes a tag back to HTML text.  Closing tags drop all attributes; a
+# '/' pseudo-attribute (set by HTML::Parser on self-closing tags) is turned
+# back into the ' />' form.
+sub print_tag {
+    my ($tag_name, $attr_hash, $closing) = @_;
+    my $self_close = $attr_hash->{'/'} ? 1 : 0;
+
+    return '<' . ($closing ? '/' : '') . $tag_name .
+        ($closing ? '' : join ('', map {
+            my $v = $attr_hash->{$_};
+            # re-escape double quotes inside attribute values
+            $v =~ s/\"/&\#34;/g;
+            " $_=\"$v\"";
+        } keys %{$attr_hash})) .
+        ($self_close ? ' /' : '') . '>';
+}
+
+# add_original_text_popup($input_text, $output_html)
+# Wraps the translated HTML in a <span> whose title attribute is the cleaned
+# source text (placeholders stripped, whitespace normalized, quotes escaped),
+# so hovering shows the original sentence.
+sub add_original_text_popup {
+    my ($input_text, $output_html) = @_;
+
+    $input_text =~ s/\"/&\#34;/g;
+    $input_text =~ s/MOSES(?:OPEN|CLOSE)TAG\d+//g;
+    $input_text =~ s/^\s+//;
+    $input_text =~ s/\s+$//;
+    $input_text =~ s/\s+/ /g;
+
+    # Using this technique for displaying the source text pop-up means we don't
+    # have to fiddle with JavaScript, but it also means you need the LongTitles
+    # extension installed if using Firefox.. *I* happen to have it, so..
+    return "<span title=\"$input_text\">$output_html</span>";
+}
+
+#------------------------------------------------------------------------------
+# conclusion
+
+# stop the top frame counter
+# (arrayref segments are exactly the translated ones, so counting refs gives
+# the number of translated segments reported to the parent frame)
+my $num_sentences = grep ref $_, @segments;
+print "<script> top.numSentences = $num_sentences </script>\n";
+
+#------------------------------------------------------------------------------