From 1a26cb84140bde842b0b60c6888e7f169536e849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fulajt=C3=A1r?= Date: Thu, 27 Aug 2015 15:15:32 +0200 Subject: Added a simple support for the factored systems. --- scripts/analysis/oov.pl | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'scripts/analysis') diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl index 9756887c9..5228f0f45 100755 --- a/scripts/analysis/oov.pl +++ b/scripts/analysis/oov.pl @@ -176,6 +176,13 @@ sub ngrams { return { md5(encode_utf8($sent)) => 1 }; } else { my @words = split /\s+/, $sent; + + #factors + if ( $sent =~ m/[|]/) { + my $use_index = 0; # default factor is the first one + @words = map { ( split /[|]/, $_ ) [$use_index] } @words; + } + my $out; if ($n == 1) { foreach my $w (@words) { -- cgit v1.2.3