Minor fixes for simulated post-editing with mert-moses.pl

author: Michael Denkowski <michael.j.denkowski@gmail.com> 2014-08-13 23:58:51 +0400
committer: Michael Denkowski <michael.j.denkowski@gmail.com> 2014-08-13 23:58:51 +0400
commit: 057066ea0e9c9a9400cd4bd40d204166e80dc125 (patch)
tree: 7b568e9ff619df766fa74345a4765286e0396eb7
parent: 94c44c03d5470694827243aa2f12d3a3b031fda4 (diff)
3 files changed, 18 insertions, 13 deletions
diff --git a/moses-cmd/simulate-pe.cc b/moses-cmd/simulate-pe.cc
index e88c1e463..5384d9886 100644
--- a/moses-cmd/simulate-pe.cc
+++ b/moses-cmd/simulate-pe.cc
@@ -280,7 +280,7 @@ public:
 
         if (file->is_complete() && file->good()) {
           fix(*file,PRECISION);
-          manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
+          manager.OutputSearchGraphAsHypergraph(*file);
           file -> flush();
         } else {
           TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber 
diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py
index 290711b56..e29f0333d 100755
--- a/scripts/generic/moses_sim_pe.py
+++ b/scripts/generic/moses_sim_pe.py
@@ -32,7 +32,7 @@ Usage: {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt -symal
 
 Options:
     -threads N: number of decoders to run in parallel (default read from moses.ini, 1 if not present)
-    -n-best-list nbest.out N: location and size of N-best list
+    -n-best-list nbest.out N [distinct]: location and size of N-best list
     -show-weights: for mert-moses.pl, just call moses and exit
     -tmp: location of temp directory (default /tmp)
 
@@ -110,6 +110,7 @@ def main(argv):
     threads = 1
     n_best_out = None
     n_best_size = None
+    n_best_distinct = False
     tmp_dir = '/tmp'
     xml_found = False
     xml_input = 'exclusive'
@@ -143,7 +144,12 @@ def main(argv):
         elif cmd[i] == '-n-best-list':
             n_best_out = cmd[i + 1]
             n_best_size = cmd[i + 2]
-            cmd = cmd[:i] + cmd[i + 3:]
+            # Optional "distinct"
+            if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
+                n_best_distinct = True
+                cmd = cmd[:i] + cmd[i + 4:]
+            else:
+                cmd = cmd[:i] + cmd[i + 3:]
         elif cmd[i] == '-tmp':
             tmp_dir = cmd[i + 1]
             cmd = cmd[:i] + cmd[i + 2:]
@@ -231,7 +237,7 @@ def main(argv):
     sys.stderr.write('Jobs: {}\n'.format(threads))
     sys.stderr.write('Batch size: {}\n'.format(batch_size))
     if n_best_out:
-        sys.stderr.write('N-best list: {} ({})\n'.format(n_best_out, n_best_size))
+        sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_out, n_best_size, ', distinct' if n_best_distinct else ''))
     sys.stderr.write('Temp dir: {}\n'.format(work_dir))
 
     # Accumulate seen lines
@@ -289,6 +295,8 @@ def main(argv):
             work_cmd.append('-n-best-list')
             work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
             work_cmd.append(str(n_best_size))
+            if n_best_distinct:
+                work_cmd.append('distinct')
         in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
         out_file = os.path.join(work_dir, 'out.{}'.format(i))
         err_file = os.path.join(work_dir, 'err.{}'.format(i))
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index a4b20f3c7..027d94a77 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -477,9 +477,9 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
 }
 
 # Paths needed for simulated post-editing
-if ($___DEV_SYMAL) {
+$working_dir_abs = ensure_full_path($___WORKING_DIR);
+if (defined $___DEV_SYMAL) {
    $dev_symal_abs = ensure_full_path($___DEV_SYMAL);
-   $working_dir_abs = ensure_full_path($___WORKING_DIR);
 }
 
 # as weights are normalized in the next steps (by cmert)
@@ -1254,16 +1254,13 @@ sub run_decoder {
       }
       $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG";
       $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
-      $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
-
-      # If simulating post-editing, route command through moses_sim_pe.py
+      $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F";
       if (defined $___DEV_SYMAL) {
+        # If simulating post-editing, route command through moses_sim_pe.py
         # Always use single (first) reference.  Simulated post-editing undefined for multiple references.
-        $decoder_cmd = "$___MOSES_SIM_PE $___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs > run$run.out";
-      } else {
-        # Default: call decoder directly
-        $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
+        $decoder_cmd = "$___MOSES_SIM_PE $decoder_cmd -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs > run$run.out";
       }
+      $decoder_cmd .= " > run$run.out";
     }
 
     print STDERR "Executing: $decoder_cmd \n";
author	Michael Denkowski <michael.j.denkowski@gmail.com>	2014-08-13 23:58:51 +0400
committer	Michael Denkowski <michael.j.denkowski@gmail.com>	2014-08-13 23:58:51 +0400
commit	057066ea0e9c9a9400cd4bd40d204166e80dc125 (patch)
tree	7b568e9ff619df766fa74345a4765286e0396eb7
parent	94c44c03d5470694827243aa2f12d3a3b031fda4 (diff)