blob: 9edeec4609e50c8d64d71d434f7d340cd3909147 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
|
# experiment.meta: now with comments.
# [CORPUS] -- preprocessing chain for the parallel training corpus.
# Steps are wired together by name: each `in:` below refers to the `out:`
# of an earlier step (raw-stem -> pre-tok-cleaned -> tokenized-stem ->
# clean-stem -> parsed-stem -> clean-parsed-stem -> factorized-stem ->
# truecased-stem -> source-labelled -> lowercased-stem -> split-stem ->
# clean-split-stem).  The meaning of the keys (pass-unless, ignore-if,
# template-if, ...) is defined by the tool that reads this file, not here.
[CORPUS] multiple
get-corpus
in: get-corpus-script
out: raw-stem
default-name: corpus/txt
rerun-on-change: input-extension output-extension
template: IN OUT $input-extension $output-extension
pre-tok-clean
in: raw-stem
out: pre-tok-cleaned
default-name: corpus/pre-tok-cleaned
pass-unless: pre-tok-clean
template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained
parallelizable: yes
# tokenize: one template-if line per language side; each is keyed on its own
# tokenizer setting (input-tokenizer / output-tokenizer).
tokenize
in: pre-tok-cleaned
out: tokenized-stem
default-name: corpus/tok
pass-unless: input-tokenizer output-tokenizer
template-if: input-tokenizer IN.$input-extension OUT.$input-extension
template-if: output-tokenizer IN.$output-extension OUT.$output-extension
parallelizable: yes
# clean / custom-clean: two alternative producers of clean-stem.  They share
# in/out/default-name and are gated on the same `cleaner` setting with
# opposite keys (ignore-if vs. ignore-unless).  custom-clean runs $cleaner
# with the same argument list that clean passes to clean-corpus-n.perl.
clean
in: tokenized-stem
out: clean-stem
default-name: corpus/clean
ignore-if: cleaner
rerun-on-change: max-sentence-length $moses-script-dir/training/clean-corpus-n.perl
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
error: there is a blank factor
error: is too long! at
custom-clean
in: tokenized-stem
out: clean-stem
default-name: corpus/clean
ignore-unless: cleaner
rerun-on-change: max-sentence-length cleaner
template: $cleaner IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
error: there is a blank factor
error: is too long! at
parse
in: clean-stem
out: parsed-stem
default-name: corpus/parsed
pass-unless: input-parser output-parser
template-if: input-parser IN.$input-extension OUT.$input-extension
template-if: output-parser IN.$output-extension OUT.$output-extension
parallelizable: yes
# post-parse-clean: second clean-corpus-n.perl pass after parsing, with a
# fixed upper length of 10000 (not $max-sentence-length) and --ignore-xml.
post-parse-clean
in: parsed-stem
out: clean-parsed-stem
default-name: corpus/parsed-clean
pass-unless: input-parser output-parser
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 OUT.lines-retained --ignore-xml
error: there is a blank factor
# factorize: no template is given here; the command is presumably built by
# the reading tool -- TODO confirm.
factorize
in: clean-parsed-stem
out: factorized-stem
rerun-on-change: TRAINING:input-factors TRAINING:output-factors
default-name: corpus/factored
pass-unless: TRAINING:input-factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
# truecase: takes a second input from the TRUECASER module.  IN1 in the
# templates appears to refer to that second `in:` entry (it is passed as
# -model) -- TODO confirm the IN/IN1 indexing convention.
truecase
in: factorized-stem TRUECASER:truecase-model
out: truecased-stem
rerun-on-change: input-truecaser output-truecaser
default-name: corpus/truecased
pass-unless: input-truecaser output-truecaser
template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension
template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension
parallelizable: yes
# source-label: NOTE(review): the second template-if is keyed on `cat`
# rather than on a *-labeller setting like the first -- confirm how the
# reader resolves that key for the output-language side.
source-label
in: truecased-stem
out: source-labelled
default-name: corpus/labelled
pass-unless: source-labeller
template-if: source-labeller IN.$input-extension OUT.$input-extension
template-if: cat IN.$output-extension OUT.$output-extension
parallelizable: yes
lowercase
in: source-labelled
out: lowercased-stem
default-name: corpus/lowercased
pass-unless: input-lowercaser output-lowercaser
template-if: input-lowercaser IN.$input-extension OUT.$input-extension
template-if: output-lowercaser IN.$output-extension OUT.$output-extension
parallelizable: yes
# split: second input is the model produced by the SPLITTER module.
split
in: lowercased-stem SPLITTER:splitter-model
out: split-stem
default-name: corpus/split
pass-unless: input-splitter output-splitter
template-if: input-splitter IN.$input-extension OUT.$input-extension -model IN1.$input-extension
template-if: output-splitter IN.$output-extension OUT.$output-extension -model IN1.$output-extension
# post-split-clean / post-split-clean-syntax: two alternative producers of
# clean-split-stem, gated on input-parser/output-parser with opposite keys.
# The -syntax variant uses the fixed 10000 limit and --ignore-xml; the plain
# variant uses $max-sentence-length.
post-split-clean
in: split-stem
out: clean-split-stem
default-name: corpus/split-clean
ignore-if: input-parser output-parser
pass-unless: input-splitter output-splitter
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
error: there is a blank factor
post-split-clean-syntax
in: split-stem
out: clean-split-stem
default-name: corpus/split-clean
ignore-unless: input-parser output-parser
pass-unless: input-splitter output-splitter
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 OUT.lines-retained --ignore-xml
error: there is a blank factor
# [RECASING] -- tokenizes raw text with $output-tokenizer and trains a
# recaser model/config from it via train-recaser.perl.
[RECASING] single
tokenize
in: raw
out: tokenized
default-name: recasing/cased
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
# train: writes the recaser config to OUT and the model under OUT.model;
# only relevant when EVALUATION:recaser is set (ignore-unless).
train
in: tokenized
out: recase-config
template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT $recasing-settings
default-name: recasing/moses.ini
tmp-name: recasing/model
ignore-unless: EVALUATION:recaser
error: cannot execute binary file
# [TRUECASER] -- builds the truecase model consumed by the truecase steps in
# other modules (referenced there as TRUECASER:truecase-model).
[TRUECASER] single
# consolidate: gathers CORPUS:clean-parsed-stem into one stem via
# consolidate-training-data.perl.
consolidate
in: CORPUS:clean-parsed-stem
out: tokenized-stem
default-name: truecaser/corpus
pass-unless: trainer
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
error: number of lines don't match
# train: runs $trainer twice, once per language extension, producing
# OUT.$input-extension and OUT.$output-extension.
train
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
pass-unless: trainer
default-name: truecaser/truecase-model
template: $trainer -model OUT.$input-extension -corpus IN.$input-extension ; $trainer -model OUT.$output-extension -corpus IN.$output-extension
# [SPLITTER] -- builds the splitter model consumed by the split steps in
# other modules (referenced there as SPLITTER:splitter-model).  Both steps
# are gated on input-splitter/output-splitter.
[SPLITTER] single
consolidate
in: CORPUS:lowercased-stem
out: truecased-stem
default-name: splitter/corpus
ignore-unless: input-splitter output-splitter
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
# train: no template is given here; the command is presumably built by the
# reading tool -- TODO confirm.  Additionally gated on no-splitter-training.
train
in: truecased-stem
out: splitter-model
default-name: splitter/split-model
ignore-unless: input-splitter output-splitter
ignore-if: no-splitter-training
# [LM] -- language-model data preparation and training.
# Main chain: raw-corpus -> tokenized-corpus -> mock-parsed-corpus ->
# factorized-corpus -> lowercased-corpus -> split-corpus -> stripped-corpus
# -> lm -> rlm -> qlm -> binlm.  Several steps are alternatives that declare
# the same `out:` and are gated by ignore-if / ignore-unless.
[LM] multiple
# Bilingual NPLM path (both steps gated on bilingual-lm).  In the first
# template IN0/IN1 line up with the two `in:` entries (corpus, alignment).
prepare-bilingual-nplm
in: TRAINING:corpus TRAINING:word-alignment
out: numberized_ngrams
ignore-unless: bilingual-lm
rerun-on-change: TRAINING:corpus TRAINING:word-alignment
template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings
default-name: lm/bilingualLM_prep
train-bilingual-lm
in: numberized_ngrams TRAINING:corpus
out: binlm
ignore-unless: bilingual-lm
rerun-on-change: numberized_ngrams
template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings
default-name: lm/bilingualLM
get-corpus
in: get-corpus-script
out: raw-corpus
pass-unless: get-corpus-script
default-name: lm/txt
template: $get-corpus-script > OUT
# use-parallel-corpus / tokenize: alternative producers of tokenized-corpus.
# use-parallel-corpus symlinks the output-language side of
# parallel-corpus-stem; tokenize is ignored when parallel-corpus-stem (or a
# concatenate-files* setting) is present.
use-parallel-corpus
in: parallel-corpus-stem
out: tokenized-corpus
default-name: lm/tok
ignore-unless: parallel-corpus-stem
template: ln -s IN.$output-extension OUT
error: failed to create symbolic link
tokenize
in: raw-corpus
out: tokenized-corpus
default-name: lm/tok
pass-unless: output-tokenizer
ignore-if: parallel-corpus-stem concatenate-files concatenate-files-split
template: $output-tokenizer < IN > OUT
parallelizable: yes
mock-parse
in: tokenized-corpus
out: mock-parsed-corpus
default-name: lm/mock-parsed
pass-unless: mock-output-parser-lm
ignore-if: concatenate-files concatenate-files-split
template: $mock-output-parser-lm < IN > OUT
factorize
in: mock-parsed-corpus
out: factorized-corpus
default-name: lm/factored
pass-unless: factors
ignore-if: concatenate-files concatenate-files-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
# lowercase / truecase: alternative producers of lowercased-corpus, gated on
# output-truecaser.  Note that only-factor-0 is commented out for lowercase
# but active for truecase.
lowercase
in: factorized-corpus
out: lowercased-corpus
default-name: lm/lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser concatenate-files concatenate-files-split
#only-factor-0: yes
template: $output-lowercaser < IN > OUT
parallelizable: yes
truecase
in: factorized-corpus TRUECASER:truecase-model
out: lowercased-corpus
rerun-on-change: output-truecaser
default-name: lm/truecased
ignore-unless: output-truecaser
ignore-if: concatenate-files concatenate-files-split
only-factor-0: yes
template: $output-truecaser -model IN1.$output-extension < IN > OUT
parallelizable: yes
split
in: lowercased-corpus SPLITTER:splitter-model
out: split-corpus
rerun-on-change: output-splitter
default-name: lm/split
pass-unless: output-splitter
ignore-if: concatenate-files concatenate-files-split
template: $output-splitter -model IN1.$output-extension < IN > OUT
# strip: removes XML markup with strip-xml.perl; ignored only for
# concatenate-files (not concatenate-files-split, whose output feeds it).
strip
in: split-corpus
out: stripped-corpus
default-name: lm/stripped
pass-unless: mock-output-parser-lm
ignore-if: concatenate-files
template: $moses-script-dir/training/strip-xml.perl < IN > OUT
# concatenate-split / concatenate: shortcut producers that `cat` the files
# named by concatenate-files-split / concatenate-files directly into
# split-corpus / stripped-corpus.
concatenate-split
in: concatenate-files-split
out: split-corpus
ignore-unless: concatenate-files-split
default-name: lm/split
template: cat IN > OUT
concatenate
in: concatenate-files
out: stripped-corpus
ignore-unless: concatenate-files
default-name: lm/stripped
template: cat IN > OUT
# train: default LM training command; ignored when one of the alternative
# training paths (rlm-training, custom-training, bilingual-lm) is configured.
# not-error lines list log strings that should not be treated as failures.
train
in: stripped-corpus
out: lm
default-name: lm/lm
ignore-if: rlm-training custom-training bilingual-lm
rerun-on-change: lm-training order settings
template: $lm-training -order $order $settings -text IN -lm OUT
error: cannot execute binary file
error: unrecognised option
not-error: BadDiscountException
not-error: To override this error
# train-custom / train-custom-syntax: run $custom-training and write binlm
# directly.  The ignore-unless lists start with `AND`, which presumably
# requires all listed settings -- TODO confirm.  The -syntax variant reads
# split-corpus (before XML stripping) instead of stripped-corpus.
train-custom
in: stripped-corpus
out: binlm
default-name: lm/custom-lm
rerun-on-change: custom-training
ignore-unless: AND custom-training config-feature-line config-weight-line
ignore-if: syntactic
template: $custom-training -text IN -lm OUT
final-model: yes
train-custom-syntax
in: split-corpus
out: binlm
default-name: lm/custom-lm
rerun-on-change: custom-training
ignore-unless: AND custom-training config-feature-line config-weight-line syntactic mock-output-parser-lm
template: $custom-training -text IN -lm OUT
final-model: yes
# randomize / train-randomized: alternative producers of rlm, gated on
# rlm-training.  Neither declares a template here.
randomize
in: lm
out: rlm
default-name: lm/rlm
pass-unless: lm-randomizer
ignore-if: rlm-training
train-randomized
in: stripped-corpus
out: rlm
default-name: lm/rlm
ignore-unless: rlm-training
rerun-on-change: rlm-training order
quantize
in: rlm
out: qlm
pass-unless: lm-quantizer
default-name: lm/qlm
template: $lm-quantizer IN OUT
binarize
in: qlm
out: binlm
pass-unless: lm-binarizer
ignore-if: bilingual-lm
rerun-on-change: lm
default-name: lm/binlm
template: $lm-binarizer IN OUT
error: set KENLM_MAX_ORDER to at least this value
final-model: yes
# [INTERPOLATED-LM] -- prepares a tuning text through the same kind of
# preprocessing chain as the LM corpus (tokenize, mock-parse, factorize,
# lowercase|truecase, split, strip) and then interpolates the LM:lm models.
[INTERPOLATED-LM] single
tuning-from-sgm
in: tuning-sgm
out: raw-tuning
default-name: lm/interpolate-tuning.txt
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
tokenize-tuning
in: raw-tuning
out: tokenized-tuning
default-name: lm/interpolate-tuning.tok
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
mock-parse-tuning
in: tokenized-tuning
out: mock-parsed-tuning
default-name: lm/interpolate-tuning.mock-parsed
pass-unless: mock-output-parser-lm
template: $mock-output-parser-lm < IN > OUT
factorize-tuning
in: mock-parsed-tuning
out: factorized-tuning
default-name: lm/interpolate-tuning.factored
pass-unless: TRAINING:output-factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
# lowercase-tuning / truecase-tuning: alternative producers of
# lowercased-tuning, gated on output-truecaser.
lowercase-tuning
in: factorized-tuning
out: lowercased-tuning
default-name: lm/interpolate-tuning.lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
template: $output-lowercaser < IN > OUT
truecase-tuning
in: factorized-tuning TRUECASER:truecase-model
out: lowercased-tuning
rerun-on-change: output-truecaser
default-name: lm/interpolate-tuning.truecased
ignore-unless: output-truecaser
template: $output-truecaser -model IN1.$output-extension < IN > OUT
split-tuning
in: lowercased-tuning SPLITTER:splitter-model
out: split-tuning
rerun-on-change: output-splitter
default-name: lm/interpolate-tuning.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
# strip-tuning: besides stripping XML into OUT, also converts the input to
# bracketed trees written to OUT.trees (second command after &&).
strip-tuning
in: split-tuning
out: stripped-tuning
default-name: lm/interpolate-tuning.stripped
pass-unless: mock-output-parser-lm
template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
# interpolate / randomize / quantize / binarize declare no template here;
# their commands are presumably built by the reading tool -- TODO confirm.
interpolate
in: script stripped-tuning LM:lm
rerun-on-change: srilm-dir group weights
out: lm
default-name: lm/interpolated-lm
randomize
in: lm
out: rlm
pass-unless: lm-randomizer
default-name: lm/interpolated-rlm
quantize
in: rlm
out: qlm
pass-unless: lm-quantizer
default-name: lm/interpolated-qlm
# NOTE(review): the error pattern below is "set kMaxOrder ...", whereas the
# binarize step of [LM] matches "set KENLM_MAX_ORDER ..." -- confirm which
# message the binarizer in use actually prints.
binarize
in: qlm
out: binlm
pass-unless: lm-binarizer
ignore-unless: script
rerun-on-change: lm
default-name: lm/interpolated-binlm
error: set kMaxOrder to at least this value
final-model: yes
# [MML] -- prepares in-domain source and target text through two parallel
# chains (tokenize -> factorize -> lowercase|truecase -> split) and trains a
# model with mml-train.perl.  The source chain uses the input-* settings and
# $input-extension; the target chain uses output-* and $output-extension.
[MML] single
# --- in-domain source side ---
tokenize-indomain-source
in: raw-indomain-source
out: tokenized-indomain-source
default-name: mml/indomain-source.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
parallelizable: yes
factorize-indomain-source
in: tokenized-indomain-source
out: factorized-indomain-source
rerun-on-change: TRAINING:input-factors
default-name: mml/indomain-source.factored
pass-unless: factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
# lowercase-/truecase-indomain-source: alternative producers of
# lowercased-indomain-source, gated on input-truecaser.
lowercase-indomain-source
in: factorized-indomain-source
out: lowercased-indomain-source
default-name: mml/indomain-source.lowercased
pass-unless: input-lowercaser
ignore-if: input-truecaser
only-factor-0: yes
template: $input-lowercaser < IN > OUT
parallelizable: yes
truecase-indomain-source
in: factorized-indomain-source TRUECASER:truecase-model
out: lowercased-indomain-source
rerun-on-change: input-truecaser
default-name: mml/indomain-source.truecased
ignore-unless: input-truecaser
only-factor-0: yes
template: $input-truecaser -model IN1.$input-extension < IN > OUT
parallelizable: yes
split-indomain-source
in: lowercased-indomain-source SPLITTER:splitter-model
out: indomain-source
rerun-on-change: input-splitter
default-name: mml/indomain-source.split
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension < IN > OUT
# --- in-domain target side ---
tokenize-indomain-target
in: raw-indomain-target
out: tokenized-indomain-target
default-name: mml/indomain-target.tok
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
factorize-indomain-target
in: tokenized-indomain-target
out: factorized-indomain-target
rerun-on-change: TRAINING:output-factors
default-name: mml/indomain-target.factored
pass-unless: factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
# lowercase-/truecase-indomain-target: alternative producers of
# lowercased-indomain-target, gated on output-truecaser.
lowercase-indomain-target
in: factorized-indomain-target
out: lowercased-indomain-target
default-name: mml/indomain-target.lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
only-factor-0: yes
template: $output-lowercaser < IN > OUT
parallelizable: yes
truecase-indomain-target
in: factorized-indomain-target TRUECASER:truecase-model
out: lowercased-indomain-target
rerun-on-change: output-truecaser
default-name: mml/indomain-target.truecased
ignore-unless: output-truecaser
only-factor-0: yes
template: $output-truecaser -model IN1.$output-extension < IN > OUT
parallelizable: yes
split-indomain-target
in: lowercased-indomain-target SPLITTER:splitter-model
out: indomain-target
rerun-on-change: output-splitter
default-name: mml/indomain-target.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
# train / train-in-mono: alternative producers of `model`, gated on
# indomain-stem.  train reads both in-domain sides from one stem
# (IN.<ext>); train-in-mono takes the separately prepared indomain-source
# and indomain-target files (IN, IN1) and the out-of-domain stem as IN2.
train
in: indomain-stem outdomain-stem
out: model
rerun-on-change: settings
ignore-unless: indomain-stem
default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
train-in-mono
in: indomain-source indomain-target outdomain-stem
out: model
rerun-on-change: settings
ignore-if: indomain-stem
default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
# [TRAINING] -- translation-model training: corpus consolidation, optional
# MML scoring/filtering, word alignment (fast_align, GIZA or Berkeley) and
# the model-building steps that follow.
# `a=OR=b` inside an `in:` list appears to name alternative inputs -- TODO
# confirm the precedence rule in the reading tool.
[TRAINING] single
consolidate
in: CORPUS:clean-split-stem
out: corpus
default-name: corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
build-domains
in: CORPUS:clean-split-stem
out: domains
default-name: model/domains
ignore-unless: domain-features mml-filter-corpora
template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT
final-model: yes
# mml-score: scores the corpus with the MML model; IN/IN1/IN2 line up with
# the three `in:` entries (model, corpus, domains).
mml-score
in: MML:model corpus domains
out: mml-scores
ignore-unless: mml-before-wa mml-after-wa
rerun-on-change: mml-filter-corpora
default-name: training/corpus-mml-score
template: $moses-script-dir/ems/support/mml-score.perl -model IN -corpus IN1 -domains IN2 -input-extension $input-extension -output-extension $output-extension -query $MML:lm-query -filter-domains "$mml-filter-corpora" > OUT
# mml-filter-before-wa: filters the corpus before word alignment; the value
# of $mml-before-wa is appended to the command line.
mml-filter-before-wa
in: corpus mml-scores domains
out: corpus-mml-prefilter
ignore-unless: mml-before-wa
rerun-on-change: mml-filter-corpora mml-before-wa
default-name: training/corpus-mml
template: $moses-script-dir/ems/support/mml-filter.perl -in IN -out OUT -score IN1 -domain IN2 -input-extension $input-extension -output-extension $output-extension $mml-before-wa
# prepare-data-fast-align: no template is given here; the command is
# presumably built by the reading tool -- TODO confirm.
prepare-data-fast-align
in: corpus-mml-prefilter=OR=corpus
out: prepared-data-fast-align
default-name: prepared
# fast-align / fast-align-inverse: run fast_align in one piece; the inverse
# step differs only by the added -r flag and its own out/default-name.
# Both are ignored when fast-align-max-lines is set.
fast-align
in: prepared-data-fast-align
out: fast-alignment
rerun-on-change: fast-align-settings
ignore-if: fast-align-max-lines
template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT
default-name: fast-align
fast-align-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
rerun-on-change: fast-align-settings
ignore-if: fast-align-max-lines
template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT
default-name: fast-align-inverse
# fast-align-in-parts: alternative producer of fast-alignment used when
# fast-align-max-lines is set; delegates to fast-align-in-parts.perl with a
# TMP working location.
fast-align-in-parts
in: prepared-data-fast-align
out: fast-alignment
rerun-on-change: fast-align-settings fast-align-max-lines
ignore-unless: fast-align-max-lines
tmp-name: training/tmp.fast-align
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
default-name: fast-align
# fast-align-in-parts-inverse: inverse-direction (-r) counterpart of
# fast-align-in-parts; produces fast-alignment-inverse when
# fast-align-max-lines is set.
fast-align-in-parts-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
rerun-on-change: fast-align-settings fast-align-max-lines
ignore-unless: fast-align-max-lines
tmp-name: training/tmp.fast-align-inverse
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
# Fixed: was `fast-align`, the same default name as the forward
# fast-align-in-parts step, so both directions targeted one output path.
# Now matches the name used by the non-split fast-align-inverse step.
default-name: fast-align-inverse
# symmetrize-fast-align: combines the forward and inverse fast_align outputs
# (IN, IN1) with both sides of the corpus (IN2.<ext>) via
# symmetrize-fast-align.perl, passing $alignment-symmetrization-method and
# the path to the symal binary.  Only relevant when fast-align-settings is
# set (ignore-unless).
symmetrize-fast-align
in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus
out: word-alignment
ignore-unless: fast-align-settings
template: $moses-script-dir/ems/support/symmetrize-fast-align.perl IN IN1 IN2.$input-extension IN2.$output-extension OUT $alignment-symmetrization-method $moses-src-dir/bin/symal
default-name: model/aligned
# prepare-data: prepares the (optionally MML-prefiltered) corpus for GIZA
# alignment; ignored when use-berkeley is set.  No template is given here;
# the command is presumably built by the reading tool -- TODO confirm.
prepare-data
in: corpus-mml-prefilter=OR=corpus
out: prepared-data
# Fixed typo: `external-bin-dr` -> `external-bin-dir`, the spelling used by
# run-giza / run-giza-inverse and by $external-bin-dir in the templates.
rerun-on-change: alignment-factors training-options script baseline-alignment-model external-bin-dir
ignore-if: use-berkeley
default-name: prepared
# run-giza / run-giza-inverse: the two GIZA alignment directions over
# prepared-data; ignored when use-berkeley is set.  "not found" in the log
# is treated as an error except for the string "0 not found".
run-giza
in: prepared-data
out: giza-alignment
ignore-if: use-berkeley
rerun-on-change: giza-settings training-options script baseline-alignment-model external-bin-dir
default-name: giza
error: not found
not-error: 0 not found
run-giza-inverse
in: prepared-data
out: giza-alignment-inverse
rerun-on-change: giza-settings training-options script baseline-alignment-model external-bin-dir
ignore-if: use-berkeley
default-name: giza-inverse
error: not found
not-error: 0 not found
# run-berkeley / process-berkeley: Berkeley aligner path, used only when
# use-berkeley is set.
# NOTE(review): unlike prepare-data, these two steps list only
# corpus-mml-prefilter as input (no =OR=corpus alternative) -- confirm this
# is intended when no MML prefiltering is configured.
run-berkeley
in: corpus-mml-prefilter
out: berkeley-alignment
ignore-unless: use-berkeley
rerun-on-change: berkeley-train berkeley-jar berkeley-training-options
default-name: berkeley
template: $berkeley-train " $berkeley-java-options " $berkeley-jar IN OUT $input-extension $output-extension $berkeley-training-options
not-error: 0 errors,
process-berkeley
in: corpus-mml-prefilter berkeley-alignment
out: word-alignment
default-name: model/aligned
rerun-on-change: berkeley-process berkeley-jar berkeley-posterior berkeley-process-options
ignore-unless: use-berkeley
template: $berkeley-process " $berkeley-java-options " $berkeley-jar IN IN1 OUT $input-extension $output-extension $alignment-symmetrization-method $berkeley-posterior $berkeley-process-options
not-error: 0 errors,
# symmetrize-giza: producer of word-alignment for the GIZA path; ignored
# when either the Berkeley or the fast_align path is configured.
symmetrize-giza
in: giza-alignment giza-alignment-inverse
out: word-alignment
ignore-if: use-berkeley fast-align-settings
rerun-on-change: alignment-symmetrization-method training-options script
default-name: model/aligned
error: skip=<[1-9]
# mml-filter-after-wa: MML filtering after word alignment; IN..IN3 line up
# with the four `in:` entries (corpus, alignment, scores, domains).
mml-filter-after-wa
in: corpus-mml-prefilter=OR=corpus word-alignment mml-scores corpus-mml-prefilter=OR=domains
out: corpus-mml-postfilter
ignore-unless: mml-after-wa
rerun-on-change: mml-filter-corpora mml-after-wa
default-name: model/corpus-mml
template: $moses-script-dir/ems/support/mml-filter.perl -in IN -out OUT -alignment IN1 -score IN2 -domain IN3 -input-extension $input-extension -output-extension $output-extension $mml-after-wa
build-biconcor
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: biconcor-model
default-name: model/biconcor
ignore-unless: biconcor
error: usage
final-model: yes
# build-suffix-array, build-ttable and build-mmsapt all declare
# out: phrase-translation-table; they are gated on suffix-array / mmsapt.
build-suffix-array
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
default-name: model/suffix-array
ignore-unless: suffix-array
error: usage
build-lex-trans
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: lexical-translation-table
rerun-on-change: translation-factors training-options script
default-name: model/lex
parse-relax
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: parse-relaxed-corpus
default-name: model/parsed-relaxed
pass-unless: input-parse-relaxer output-parse-relaxer
template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension
template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension
# pcfg-extract / pcfg-score: operate on the output-language side only;
# pcfg-score symlinks the input-language side through unchanged.
pcfg-extract
in: parse-relaxed-corpus
out: pcfg
default-name: model/pcfg
ignore-unless: use-pcfg-feature
rerun-on-change: use-pcfg-feature
template: $moses-bin-dir/pcfg-extract < IN.$output-extension > OUT.$output-extension
pcfg-score
in: parse-relaxed-corpus pcfg
out: scored-corpus
default-name: model/scored-corpus
pass-unless: use-pcfg-feature
template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
build-osm
in: corpus word-alignment
out: osm-model
ignore-unless: operation-sequence-model
rerun-on-change: operation-sequence-model training-options script giza-settings operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir $operation-sequence-model-settings
default-name: model/OSM
build-transliteration-model
in: corpus word-alignment
out: transliteration-model
ignore-unless: transliteration-module
rerun-on-change: transliteration-module training-options script giza-settings
default-name: model/Transliteration
final-model: yes
build-translit-table
in: transliteration-model
out: transliteration-table
ignore-unless: in-decoding-transliteration
rerun-on-change: in-decoding-transliteration transliteration-module
default-name: model/transliteration-phrase-table
template: $moses-script-dir/Transliteration/in-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN --input-extension $input-extension --output-extension $output-extension --transliteration-file $transliteration-file --out-file OUT
extract-phrases
in: corpus-mml-postfilter=OR=word-alignment scored-corpus
out: extracted-phrases
rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm domain-features baseline-extract lexicalized-reordering
only-existence-matters: domain-features
default-name: model/extract
build-reordering
in: extracted-phrases
out: reordering-table
ignore-unless: lexicalized-reordering
rerun-on-change: lexicalized-reordering reordering-factors
default-name: model/reordering-table
final-model: yes
build-ttable
in: extracted-phrases lexical-translation-table corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains
out: phrase-translation-table
rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules domain-features
default-name: model/phrase-table
ignore-if: suffix-array mmsapt
final-model: yes
build-mmsapt
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
ignore-unless: mmsapt
default-name: model/phrase-table-mmsapt
template: $moses-script-dir/training/build-mmsapt.perl --alignment IN.$alignment-symmetrization-method --corpus IN1 --f $input-extension --e $output-extension --dir OUT --settings '$mmsapt'
final-model: yes
# sigtest-filter-suffix-array: runs $salm-index on each language side and
# moves the four generated index files (.id_voc, .sa_corpus, .sa_offset,
# .sa_suffix) from the IN prefix to the OUT prefix.  The template is one
# logical line continued with trailing backslashes -- do not insert
# comments between the continued lines.
sigtest-filter-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sigtest-filter-suffix-array
default-name: training/corpus
template: $salm-index IN.$input-extension ; \
mv IN.${input-extension}.id_voc OUT.${input-extension}.id_voc ; \
mv IN.${input-extension}.sa_corpus OUT.${input-extension}.sa_corpus ; \
mv IN.${input-extension}.sa_offset OUT.${input-extension}.sa_offset ; \
mv IN.${input-extension}.sa_suffix OUT.${input-extension}.sa_suffix ; \
$salm-index IN.$output-extension ; \
mv IN.${output-extension}.id_voc OUT.${output-extension}.id_voc ; \
mv IN.${output-extension}.sa_corpus OUT.${output-extension}.sa_corpus ; \
mv IN.${output-extension}.sa_offset OUT.${output-extension}.sa_offset ; \
mv IN.${output-extension}.sa_suffix OUT.${output-extension}.sa_suffix
ignore-unless: sigtest-filter
final-model: yes
# sigtest-filter-ttable / sigtest-filter-reordering: filter the phrase and
# reordering tables using the suffix array above; no template is declared
# here.  Both are ignored when TRAINING:config is set.
sigtest-filter-ttable
in: phrase-translation-table sigtest-filter-suffix-array
out: sigtest-filter-phrase-translation-table
default-name: model/phrase-table-sigtest-filter
pass-unless: sigtest-filter
ignore-if: TRAINING:config
final-model: yes
sigtest-filter-reordering
in: reordering-table sigtest-filter-suffix-array
out: sigtest-filter-reordering-table
default-name: model/reordering-table-sigtest-filter
pass-unless: sigtest-filter
ignore-if: TRAINING:config
ignore-unless: lexicalized-reordering
final-model: yes
# build-generation / build-generation-custom: alternative producers of
# generation-table; the custom variant reads generation-corpus and requires
# both generation-factors and generation-corpus (ignore-unless: AND ...),
# while the default variant is ignored when generation-corpus is set.
build-generation
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: generation-table
rerun-on-change: generation-factors generation-type training-options script
ignore-unless: generation-factors
ignore-if: generation-corpus
default-name: model/generation-table
final-model: yes
build-generation-custom
in: generation-corpus
out: generation-table
rerun-on-change: generation-factors generation-type training-options script generation-corpus
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
final-model: yes
build-sparse
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sparse
ignore-unless: sparse-features
rerun-on-change: sparse-features
default-name: model/sparse-features
template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
create-config
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero thot
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature
default-name: model/moses.ini
error: Unknown option
final-model: yes
binarize-config
in: config
out: bin-config
pass-unless: binarize-all
rerun-on-change: config
default-name: model/moses.bin.ini
template: $binarize-all IN OUT -Binarizer $ttable-binarizer
final-model: yes
hiero-compile-source-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: hiero-source-suffix-array
ignore-unless: use-hiero
default-name: hiero-model/f.sa.bin
template: $hiero-decode-dir/compile_bin.py -s IN.$input-extension OUT
hiero-compile-target
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: hiero-target-array
ignore-unless: use-hiero
default-name: hiero-model/e.bin
template: $hiero-decode-dir/compile_bin.py IN.$output-extension OUT
hiero-compile-alignment
in: corpus-mml-postfilter=OR=word-alignment
out: hiero-alignment-array
ignore-unless: use-hiero
default-name: hiero-model/a.bin
template: $hiero-decode-dir/compile_bin.py -a IN.$alignment-symmetrization-method OUT
hiero-compile-lex
in: hiero-alignment-array hiero-source-suffix-array hiero-target-array
out: hiero-lex-array
ignore-unless: use-hiero
default-name: hiero-model/lex.bin
template: $hiero-decode-dir/compile_bin.py -x IN1 IN2 IN OUT
hiero-find-frequencies
in: hiero-source-suffix-array
out: hiero-topN
ignore-unless: use-hiero
default-name: hiero-model/f.topN
template: $hiero-decode-dir/lcp_ops.py -t 4 IN | sort -nr | head -100 > OUT
hiero-compile-precomputations
in: hiero-topN hiero-source-suffix-array
out: hiero-precomputation-array
ignore-unless: use-hiero
default-name: hiero-model/f.precomputations.bin
rerun-on-change: hiero-max-phrase-length hiero-max-nonterminals hiero-max-phrase-span hiero-min-gap-length hiero-freq-rank1 hiero-freq-rank2
template: $hiero-decode-dir/compile_bin.py -r max-len=$hiero-max-phrase-length max-nt=$hiero-max-nonterminals max-size=$hiero-max-phrase-span min-gap=$hiero-min-gap-length rank1=$hiero-freq-rank1 rank2=$hiero-freq-rank2 sa=IN1 IN OUT
hiero-create-config
in: hiero-source-suffix-array hiero-target-array hiero-alignment-array hiero-lex-array hiero-precomputation-array LM:lm
out: hiero-config
ignore-unless: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors
default-name: hiero-model/hiero.ini
template: $hiero-util-dir/generate-ini.pl IN IN1 IN2 IN3 IN4 IN5 $hiero-max-phrase-length $hiero-max-nonterminals $hiero-max-phrase-span $hiero-min-gap-length $hiero-freq-rank1 $hiero-freq-rank2 < $GENERAL:hiero-template-ini > OUT
thot-build-ttable
in: corpus
out: thot-ttable
default-name: model/phrase-table-thot
rerun-on-change: input-extension output-extension
template: $thot/thot_tm_train -sdir $working-dir -s IN.$input-extension -t IN.$output-extension -o OUT
thot-create-config
in: thot-ttable LM:lm
out: config
ignore-unless: thot
default-name: model/thot.ini
template: $thot/thot_gen_cfg_file IN1/lm_desc IN/tm_desc > OUT
[TUNING] single
input-from-sgm
in: input-sgm
out: raw-input
default-name: tuning/input.txt
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
input-devtest-from-sgm
in: input-devtest-sgm
out: raw-input-devtest
default-name: tuning/input.devtest.txt
ignore-unless: use-mira
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
tokenize-input
in: raw-input
out: tokenized-input
default-name: tuning/input.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
tokenize-input-devtest
in: raw-input-devtest
out: tokenized-input-devtest
default-name: tuning/input.devtest.tok
pass-unless: input-tokenizer
ignore-unless: use-mira
template: $input-tokenizer < IN > OUT
mock-parse-input
in: tokenized-input
out: mock-parsed-input
default-name: tuning/input.mock-parsed
pass-unless: mock-input-parser-devtesteval
template: $mock-input-parser-devtesteval < IN > OUT
mock-parse-input-devtest
in: tokenized-input-devtest
out: mock-parsed-input-devtest
default-name: tuning/input.devtest.mock-parsed
pass-unless: mock-input-parser-devtesteval
ignore-unless: use-mira
template: $mock-input-parser-devtesteval < IN > OUT
parse-input
in: mock-parsed-input
out: parsed-input
default-name: tuning/input.parsed
pass-unless: input-parser
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parser < IN > OUT
# Parse the devtest tuning input; only active when use-mira is set.
# Devtest counterpart of parse-input: it consumes the output of
# mock-parse-input-devtest (mock-parsed-input-devtest). The step is passed
# through when no input-parser is configured, or when parsing is skipped /
# replaced by the mock parser.
parse-input-devtest
	in: mock-parsed-input-devtest
	out: parsed-input-devtest
	default-name: tuning/input.devtest.parsed
	pass-unless: input-parser
	pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
	ignore-unless: use-mira
	template: $input-parser < IN > OUT
parse-relax-input
in: split-input
out: input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parse-relaxer < IN > OUT
parse-relax-input-devtest
in: split-input-devtest
out: input-devtest
default-name: tuning/input.devtest.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parse-relaxer < IN > OUT
factorize-input
in: parsed-input
out: factorized-input
default-name: tuning/input.factorized
rerun-on-change: TRAINING:input-factors
pass-unless: TRAINING:input-factors
error: can't open
error: incompatible number of words in factor
factorize-input-devtest
in: parsed-input-devtest
out: factorized-input-devtest
default-name: tuning/input.devtest.factorized
rerun-on-change: TRAINING:input-factors
pass-unless: TRAINING:input-factors
ignore-unless: use-mira
error: can't open
error: incompatible number of words in factor
source-label-input
in: factorized-input
out: source-labelled-input
default-name: tuning/input.labelled
pass-unless: source-labeller
template-if: source-labeller IN OUT
parallelizable: yes
source-label-input-devtest
in: factorized-input-devtest
out: source-labelled-input-devtest
default-name: tuning/input.devtest.labelled
pass-unless: source-labeller
template-if: source-labeller IN OUT
parallelizable: yes
lowercase-input
in: source-labelled-input
out: truecased-input
default-name: tuning/input.lc
pass-unless: input-lowercaser
ignore-if: input-truecaser
template: $input-lowercaser < IN > OUT
lowercase-input-devtest
in: source-labelled-input-devtest
out: truecased-input-devtest
default-name: tuning/input.devtest.lc
pass-unless: input-lowercaser
ignore-unless: use-mira
ignore-if: input-truecaser
template: $input-lowercaser < IN > OUT
truecase-input
in: source-labelled-input TRUECASER:truecase-model
out: truecased-input
rerun-on-change: input-truecaser
default-name: tuning/input.tc
ignore-unless: input-truecaser
template: $input-truecaser -model IN1.$input-extension < IN > OUT
truecase-input-devtest
in: source-labelled-input-devtest TRUECASER:truecase-model
out: truecased-input-devtest
rerun-on-change: input-truecaser
default-name: tuning/input.devtest.tc
ignore-unless: AND input-truecaser use-mira
template: $input-truecaser -model IN1.$input-extension < IN > OUT
split-input
in: truecased-input SPLITTER:splitter-model
out: split-input
rerun-on-change: input-splitter
default-name: tuning/input.split
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension < IN > OUT
split-input-devtest
in: truecased-input-devtest SPLITTER:splitter-model
out: split-input-devtest
rerun-on-change: input-splitter
default-name: tuning/input.devtest.split
pass-unless: input-splitter
ignore-unless: use-mira
template: $input-splitter -model IN1.$input-extension < IN > OUT
reference-from-sgm
in: reference-sgm input-sgm
out: raw-reference
default-name: tuning/reference.txt
template: $moses-script-dir/ems/support/reference-from-sgm.perl IN IN1 OUT
reference-devtest-from-sgm
in: reference-devtest-sgm input-devtest-sgm
out: raw-reference-devtest
default-name: tuning/reference.devtest.txt
ignore-unless: use-mira
template: $moses-script-dir/ems/support/reference-from-sgm.perl IN IN1 OUT
tokenize-reference
in: raw-reference
out: tokenized-reference
default-name: tuning/reference.tok
pass-unless: output-tokenizer
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
tokenize-reference-devtest
in: raw-reference-devtest
out: tokenized-reference-devtest
default-name: tuning/reference.devtest.tok
pass-unless: output-tokenizer
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
mock-parse-reference
in: tokenized-reference
out: mock-parsed-reference
default-name: tuning/reference.mock-parsed
pass-unless: mock-output-parser-references
template: $mock-output-parser-references < IN > OUT
# Mock-parse the devtest tuning reference.
# Devtest counterpart of mock-parse-reference: it must read the tokenized
# *reference* (output side) produced by tokenize-reference-devtest, not the
# tokenized input. Passed through unless mock-output-parser-references is set.
mock-parse-reference-devtest
	in: tokenized-reference-devtest
	out: mock-parsed-reference-devtest
	default-name: tuning/reference.devtest.mock-parsed
	pass-unless: mock-output-parser-references
	template: $mock-output-parser-references < IN > OUT
lowercase-reference
in: mock-parsed-reference
out: truecased-reference
default-name: tuning/reference.lc
pass-unless: output-lowercaser
ignore-if: output-truecaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
lowercase-reference-devtest
in: mock-parsed-reference-devtest
out: truecased-reference-devtest
default-name: tuning/reference.devtest.lc
pass-unless: output-lowercaser
ignore-if: output-truecaser
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
truecase-reference
in: mock-parsed-reference TRUECASER:truecase-model
out: truecased-reference
rerun-on-change: output-truecaser
default-name: tuning/reference.tc
ignore-unless: output-truecaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-truecaser -model IN1.$output-extension < IN > OUT
truecase-reference-devtest
in: mock-parsed-reference-devtest TRUECASER:truecase-model
out: truecased-reference-devtest
rerun-on-change: output-truecaser
default-name: tuning/reference.devtest.tc
ignore-unless: AND output-truecaser use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-truecaser -model IN1.$output-extension < IN > OUT
split-reference
in: truecased-reference SPLITTER:splitter-model
out: split-ref
default-name: tuning/reference.split
pass-unless: output-splitter
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
split-reference-devtest
in: truecased-reference-devtest SPLITTER:splitter-model
out: split-ref-devtest
default-name: tuning/reference.devtest.split
pass-unless: output-splitter
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
strip-reference
in: split-ref
out: reference
default-name: tuning/reference.stripped
pass-unless: mock-output-parser-references
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
# Strip XML markup from the devtest tuning reference; only active when
# use-mira is set.
# Devtest counterpart of strip-reference. Its output is reference-devtest,
# which the tune step lists among its inputs; it must not reuse the name
# "reference", which strip-reference already produces for the main tuning set.
# The template also writes a bracketed-tree version to OUT.trees.
strip-reference-devtest
	in: split-ref-devtest
	out: reference-devtest
	default-name: tuning/reference.devtest.stripped
	pass-unless: mock-output-parser-references
	ignore-unless: use-mira
	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
	template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
filter
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
out: filtered-dir
default-name: tuning/filtered
rerun-on-change: filter-settings ttable-binarizer TRAINING:no-glue-grammar TRAINING:dont-tune-glue-grammar TRAINING:use-syntax-input-weight-feature TRAINING:config
ignore-if: TRAINING:binarize-all
error: already exists. Please delete
filter-devtest
in: input-devtest TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table
out: filtered-dir-devtest
default-name: tuning/filtered.devtest
rerun-on-change: filter-settings ttable-binarizer TRAINING:no-glue-grammar TRAINING:dont-tune-glue-grammar TRAINING:use-syntax-input-weight-feature TRAINING:config
ignore-if: TRAINING:binarize-all
ignore-unless: use-mira
error: already exists. Please delete
apply-filter
in: TRAINING:bin-config filtered-dir
out: filtered-config
default-name: tuning/moses.filtered.ini
ignore-if: TRAINING:binarize-all
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
apply-filter-devtest
in: TRAINING:bin-config filtered-dir-devtest
out: filtered-config-devtest
default-name: tuning/moses.filtered.devtest.ini
pass-if: TRAINING:binarize-all
ignore-unless: use-mira
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
tune
in: TRAINING:bin-config input reference filtered-config-devtest input-devtest reference-devtest filtered-config
out: weight-config
ignore-if: use-hiero
qsub-script: yes
default-name: tuning/moses.ini
tmp-name: tuning/tmp
final-model: yes
rerun-on-change: decoder-settings tuning-settings nbest lambda async
not-error: trans: No such file or directory
thot-tune
in: TRAINING:config input reference
out: config-with-reused-weights
ignore-unless: thot
tmp-name: tuning/thot.tmp
default-name: tuning/thot.tuned.ini
template: mkdir -p TMP/home ; mkdir -p TMP/tdir ; mkdir -p TMP/sdir ; HOME=TMP/home $thot/thot_smt_tune -tdir TMP/tdir -sdir TMP/sdir -c IN -s IN1 -t IN2 -o OUT
apply-weights
in: TRAINING:bin-config weight-config
out: config-with-reused-weights
ignore-if: use-hiero thot
default-name: tuning/moses.tuned.ini
template: $moses-script-dir/ems/support/substitute-weights.perl IN IN1 OUT
error: cannot open
hiero-tune
in: TRAINING:hiero-config input reference
out: hiero-weight-config
ignore-unless: use-hiero
qsub-script: yes
default-name: hiero-tuning/mert
rerun-on-change: nbest
template: $hiero-mert --nbest $nbest --decoder $hiero-decoder --workdir OUT IN --source-file IN1 --ref-files "IN2*" --no-test
hiero-apply-weights
in: hiero-weight-config TRAINING:hiero-config
out: hiero-config-with-reused-weights
default-name: hiero-tuning/hiero.weight-reused.ini
ignore-unless: use-hiero
template: $hiero-util-dir/apply-weights.pl IN/best.weights < IN1 > OUT
[EVALUATION] multiple
input-from-sgm
in: input-sgm
out: raw-input
ignore-unless: input-sgm
default-name: evaluation/input.txt
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
get-input
in: get-corpus-script
out: raw-input
ignore-if: input-sgm
default-name: evaluation/input.txt
template: IN OUT
tokenize-input
in: raw-input
out: tokenized-input
default-name: evaluation/input.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
mock-parse-input
in: tokenized-input
out: mock-parsed-input
default-name: evaluation/input.mock-parsed
pass-unless: mock-input-parser-devtesteval
template: $mock-input-parser-devtesteval < IN > OUT
parse-input
in: mock-parsed-input
out: parsed-input
default-name: evaluation/input.parsed
pass-unless: input-parser
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parser < IN > OUT
parse-relax-input
in: split-input
out: input
default-name: evaluation/input.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parse-relaxer < IN > OUT
factorize-input
in: parsed-input
out: factorized-input
default-name: evaluation/input.factorized
rerun-on-change: TRAINING:input-factors
pass-unless: TRAINING:input-factors
error: can't open
error: incompatible number of words in factor
source-label-input
in: factorized-input
out: source-labelled-input
default-name: evaluation/input.labelled
pass-unless: source-labeller
template-if: source-labeller IN OUT
parallelizable: yes
lowercase-input
in: source-labelled-input
out: truecased-input
default-name: evaluation/input.lc
pass-unless: input-lowercaser
ignore-if: input-truecaser
template: $input-lowercaser < IN > OUT
truecase-input
in: source-labelled-input TRUECASER:truecase-model
out: truecased-input
default-name: evaluation/input.tc
rerun-on-change: input-truecaser
ignore-unless: input-truecaser
template: $input-truecaser -model IN1.$input-extension < IN > OUT
split-input
in: truecased-input SPLITTER:splitter-model
out: split-input
default-name: evaluation/input.split
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension < IN > OUT
filter
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
out: filtered-dir
default-name: evaluation/filtered
rerun-on-change: filter-settings report-precision-by-coverage ttable-binarizer TRAINING:no-glue-grammar TRAINING:dont-tune-glue-grammar TRAINING:use-syntax-input-weight-feature TRAINING:config
pass-if: TRAINING:binarize-all
ignore-if: use-hiero
error: already exists. Please delete
apply-filter
in: filtered-dir TRAINING:config TUNING:config-with-reused-weights
out: filtered-config
default-name: evaluation/filtered.ini
ignore-if: TRAINING:binarize-all thot
template: $moses-script-dir/ems/support/substitute-filtered-tables-and-weights.perl IN/moses.ini IN1 IN2 OUT
decode
in: TUNING:config-with-reused-weights input filtered-config
out: system-output
default-name: evaluation/output
qsub-script: yes
ignore-if: use-hiero thot
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade TRAINING:post-decoding-transliteration
error: Translation was not performed correctly
not-error: trans: No such file or directory
final-model: yes
hiero-decode
in: TUNING:hiero-config-with-reused-weights input
out: system-output
default-name: evaluation/output
qsub-script: yes
ignore-unless: use-hiero
template: $hiero-parallelizer -e OUT.edir -r -- $hiero-decoder -c IN < IN1 > OUT
rerun-on-change: hiero-decoder
thot-filter
in: TUNING:config-with-reused-weights input
out: filtered-config
ignore-unless: thot
default-name: evaluation/filtered
tmp-name: evaluation/filtered-tmp
template: mkdir -p TMP/home ; mkdir -p TMP/tdir ; mkdir -p TMP/sdir ; HOME=TMP/home $thot/thot_prepare_sys_for_test -sdir TMP/sdir -tdir TMP/tdir -t IN1 -c IN/tuned_for_dev.cfg -o OUT ; cp OUT/lm/main/* OUT/lm
thot-decode
in: input filtered-config
out: system-output
ignore-unless: thot
default-name: evaluation/output
template: $thot/thot_decoder -sdir $working-dir -c IN1/test_specific.cfg -t IN > OUT
not-error: Error in word penalty model file
remove-markup
in: system-output
out: cleaned-output
default-name: evaluation/cleaned
pass-if: TRAINING:hierarchical-rule-set
pass-unless: report-segmentation
template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT
post-decoding-transliteration
in: cleaned-output system-output TRAINING:transliteration-model INTERPOLATED-LM:binlm=OR=LM:binlm
out: transliterated-output
default-name: evaluation/transliterated
pass-unless: TRAINING:post-decoding-transliteration
template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model IN3 --input-file IN0 --output-file OUT --oov-file IN1.oov --decoder $decoder
recase-output
in: transliterated-output RECASING:recase-config
out: recased-output
default-name: evaluation/recased
pass-unless: recaser
ignore-if: output-truecaser
template: $recaser -moses $RECASING:decoder -in IN -model IN1 > OUT
detruecase-output
in: transliterated-output
out: recased-output
default-name: evaluation/truecased
ignore-unless: output-truecaser
template: $detruecaser < IN > OUT
detokenize-output
in: recased-output
out: detokenized-output
default-name: evaluation/detokenized
pass-unless: detokenizer
template: $detokenizer < IN > OUT
final-model: yes
wrap
in: detokenized-output
out: wrapped-output
default-name: evaluation/detokenized.sgm
rerun-on-change: wrapping-frame use-hiero
template: $wrapping-script $wrapping-frame < IN > OUT
error: Use of uninitialized value in pattern match
final-model: yes
reference-from-sgm
in: reference-sgm input-sgm
out: raw-reference
default-name: evaluation/reference.txt
template: $moses-script-dir/ems/support/reference-from-sgm.perl IN IN1 OUT
tokenize-reference
in: raw-reference
out: tokenized-reference
default-name: evaluation/reference.tok
pass-unless: output-tokenizer
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
mock-parse-reference
in: tokenized-reference
out: mock-parsed-reference
default-name: evaluation/reference.mock-parsed
pass-unless: mock-output-parser-references
template: $mock-output-parser-references < IN > OUT
lowercase-reference
in: mock-parsed-reference
out: lowercased-reference
default-name: evaluation/reference.lowercased
pass-unless: output-lowercaser
pass-if: recaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
strip-reference
in: lowercased-reference
out: reference
default-name: evaluation/reference
pass-unless: mock-output-parser-references
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
wade
in: filtered-dir truecased-input tokenized-reference alignment system-output
out: wade-analysis
default-name: evaluation/wade-analysis
ignore-unless: wade
rerun-on-change: wade
template: $moses-script-dir/ems/support/run-wade.perl $wade IN IN1 IN2 IN3 IN4 OUT
nist-bleu
in: wrapped-output reference-sgm
out: nist-bleu-score
default-name: evaluation/nist-bleu
ignore-unless: nist-bleu
rerun-on-change: nist-bleu
error: Illegal division by zero
template: $nist-bleu -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
nist-bleu-c
in: wrapped-output reference-sgm
out: nist-bleu-c-score
default-name: evaluation/nist-bleu-c
ignore-unless: nist-bleu-c
rerun-on-change: nist-bleu-c
error: Illegal division by zero
template: $nist-bleu-c -c -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
ibm-bleu
in: wrapped-output reference-sgm
out: ibm-bleu-score
default-name: evaluation/ibm-bleu
ignore-unless: ibm-bleu
rerun-on-change: ibm-bleu
template: $ibm-bleu -ci -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
ibm-bleu-c
in: wrapped-output reference-sgm
out: ibm-bleu-c-score
default-name: evaluation/ibm-bleu-c
ignore-unless: ibm-bleu-c
rerun-on-change: ibm-bleu-c
template: $ibm-bleu-c -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
bolt-bleu
in: detokenized-output
out: bolt-bleu-score
default-name: evaluation/bolt-bleu
ignore-unless: bolt-bleu
rerun-on-change: bolt-bleu
template: $bolt-bleu IN > OUT
final-model: yes
bolt-bleu-c
in: detokenized-output
out: bolt-bleu-c-score
default-name: evaluation/bolt-bleu-c
ignore-unless: bolt-bleu-c
rerun-on-change: bolt-bleu-c
template: $bolt-bleu-c IN > OUT
final-model: yes
multi-bleu
in: transliterated-output tokenized-reference
out: multi-bleu-score
default-name: evaluation/multi-bleu
ignore-unless: multi-bleu
rerun-on-change: multi-bleu
template: $multi-bleu IN1 < IN > OUT
final-model: yes
multi-bleu-c
in: recased-output tokenized-reference
out: multi-bleu-c-score
default-name: evaluation/multi-bleu-c
ignore-unless: multi-bleu-c
rerun-on-change: multi-bleu-c
template: $multi-bleu-c IN1 < IN > OUT
final-model: yes
multi-bleu-detok
in: detokenized-output tokenized-reference
out: multi-bleu-detok-score
default-name: evaluation/multi-bleu-detok
ignore-unless: multi-bleu-detok
rerun-on-change: multi-bleu-detok
template: $multi-bleu-detok IN1 < IN > OUT
final-model: yes
multi-bleu-c-detok
in: detokenized-output tokenized-reference
out: multi-bleu-c-detok-score
default-name: evaluation/multi-bleu-c-detok
ignore-unless: multi-bleu-c-detok
rerun-on-change: multi-bleu-c-detok
template: $multi-bleu-c-detok IN1 < IN > OUT
final-model: yes
ter
in: wrapped-output reference-sgm
out: ter-score
default-name: evaluation/detokenized.sgm.TER
ignore-unless: ter
rerun-on-change: ter
final-model: yes
wer
in: recased-output reference
out: wer-score
default-name: evaluation/wer
ignore-unless: wer
rerun-on-change: wer
template: $wer IN IN1 > OUT
final-model: yes
meteor
in: transliterated-output reference
out: meteor-score
default-name: evaluation/meteor
ignore-unless: meteor
rerun-on-change: meteor
template: $meteor IN IN1 $meteor-params > OUT
final-model: yes
analysis
in: recased-output reference input
out: analysis
default-name: evaluation/analysis
ignore-if: report-precision-by-coverage
ignore-unless: analysis
rerun-on-change: analyze-search-graph
analysis-coverage
in: input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table
out: analysis-coverage
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage
rerun-on-change: score-settings
final-model: yes
analysis-precision
in: recased-output reference input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table analysis-coverage
out: analysis
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage report-precision-by-coverage
rerun-on-change: precision-by-coverage-base
final-model: yes
[REPORTING] single
report
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis
out: report
default-name: evaluation/report
|