scripts/ems/example/config.factored


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554

################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]

### directory in which experiment is run
#
working-dir = /home/pkoehn/experiment

# specification of the language pair
input-extension = fr
output-extension = en
pair-extension = fr-en

### directories that contain tools and data
# 
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# data
wmt10-data = $working-dir/data

### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses

# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/misc/processPhraseTable

# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"

# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

### generic parallelizer for cluster and multi-core machines
# you may specify a script that allows the parallel execution
# parallizable steps (see meta file). you also need specify 
# the number of jobs (cluster) or cores (multicore)
#
#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl
#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl

### cluster settings (if run on a cluster machine)
# number of jobs to be submitted in parallel
#
#jobs = 10

# arguments to qsub when scheduling a job
#qsub-settings = ""

# project for priviledges and usage accounting 
#qsub-project = iccs_smt

# memory and time 
#qsub-memory = 4
#qsub-hours = 48

### multi-core settings
# when the generic parallelizer is used, the number of cores
# specified here 
cores = 8

#################################################################
# PARALLEL CORPUS PREPARATION: 
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

### long sentences are filtered out, since they slow down GIZA++ 
# and are a less reliable source of data. set here the maximum
# length of a sentence
#
max-sentence-length = 80

[CORPUS:europarl] IGNORE

### command to run to get raw corpus files
#
# get-corpus-script = 

### raw corpus files (untokenized, but sentence aligned)
# 
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem = 

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem = 

[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
# 
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
order = 5

### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
# 
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"

### each language model to be used has its own section here

[LM:europarl] IGNORE

### command to run to get raw corpus files
#
#get-corpus-script = ""

### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-corpus = 

### if corpus preparation should be skipped, 
# point to the prepared language model
#
#lm = 

[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled

[LM:nc=pos]
factors = "pos"
order = 7
settings = "-interpolate -unk"
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension

#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM] IGNORE

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance [Koehn and Schwenk, IJCNLP 2008]

### script to interpolate language models
# if commented out, no interpolation is performed
#
script = $moses-script-dir/ems/support/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning = 
#factored-tuning = 
#lowercased-tuning = 
#split-tuning = 

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
# 
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"

#################################################################
# FACTOR DEFINITION

[INPUT-FACTOR]

# also used for output factors
temp-dir = $working-dir/training/factor

[OUTPUT-FACTOR:pos]

### script that generates this factor
#
mxpost = /home/pkoehn/bin/mxpost
factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

### training script to be used: either a legacy script or 
# current moses training script (default) 
# 
script = $moses-script-dir/training/train-model.perl

### general options
#
#training-options = ""

### factored training: specify here which factors used
# if none specified, single factor training is assumed
# (one translation step, surface to surface)
#
input-factors = word
output-factors = word pos
alignment-factors = "word -> word"
translation-factors = "word -> word+pos"
reordering-factors = "word -> word"
#generation-factors = 
decoding-steps = "t0"

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduces
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and

### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process =  $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5

### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1

### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### hierarchical rule set
#
#hierarchical-rule-set = true

### settings for rule extraction
#
#extract-settings = ""

### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
#
#use-unknown-word-labels = true

### if phrase extraction should be skipped,
# point to stem for extract files
#
# extracted-phrases = 

### settings for rule scoring
#
score-settings = "--GoodTuring"

### include word alignment in phrase table
#
#include-word-alignment-in-rules = yes

### if phrase table training should be skipped,
# point to phrase translation table
#
# phrase-translation-table = 

### if reordering table training should be skipped,
# point to reordering table
#
# reordering-table = 

### if training should be skipped, 
# point to a configuration file that contains
# pointers to all relevant model files
#
#config = 

#####################################################
### TUNING: finding good weights for model components

[TUNING]

### instead of tuning with this setting, old weights may be recycled
# specify here an old configuration file with matching weights
#
#weight-config = $working-dir/tuning/moses.weight-reused.ini.1

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"

### specify the corpus used for tuning 
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
#raw-input = 
#tokenized-input = 
#factorized-input = 
#input =
# 
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
#raw-reference = 
#tokenized-reference = 
#factorized-reference = 
#reference = 

### size of n-best list used (typically 100)
#
nbest = 100

### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear, if this matters
#
# lambda = 

### additional flags for the filter script
#
filter-settings = ""

### additional flags for the decoder
#
decoder-settings = ""

### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config = 

#########################################################
## RECASER: restore case, this part only trains the model

[RECASING]

#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm

### training data
# raw input needs to be still tokenized,
# also also tokenized input may be specified
#
#tokenized = [LM:europarl:tokenized-corpus]

# recase-config = 

#lm-training = $srilm-dir/ngram-count

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# data on which truecaser is trained
# if no training data is specified, parallel corpus is used
#
# raw-stem = 
# tokenized-stem =

### trained model
#
# truecase-model = 

######################################################################
## EVALUATION: translating a test set using the tuned system and score it

[EVALUATION]

### number of jobs (if parallel execution on cluster)
#
#jobs = 10

### additional flags for the filter script
#
#filter-settings = ""

### additional decoder settings
# switches for the Moses decoder
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

### specify size of n-best list, if produced
#
#nbest = 100

### multiple reference translations
#
#multiref = yes

### prepare system output for scoring 
# this may include detokenization and wrapping output in sgm 
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm = 

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

### TER: translation error rate (BBN metric) based on edit distance
# not yet integrated
#
# ter = 

### METEOR: gives credit to stem / worknet synonym matches
# not yet integrated
#
# meteor = 

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
# raw-input = 
# tokenized-input = 
# factorized-input =
# input = 

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference = 
# reference = 

### analysis settings 
# may contain any of the general evaluation analysis settings
# specific setting: base coverage statistics on earlier run
#
#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5

### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped 
# in sgm markup (typically like the input sgm)
#
wrapping-frame = $input-sgm

##########################################
### REPORTING: summarize evaluation scores

[REPORTING]

### currently no parameters for reporting section