Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BUILD-INSTRUCTIONS.txt5
-rw-r--r--Jamroot57
-rw-r--r--biconcor/Vocabulary.cpp202
-rw-r--r--contrib/c++tokenizer/tokenizer.cpp254
-rw-r--r--contrib/c++tokenizer/tokenizer.h16
-rw-r--r--contrib/c++tokenizer/tokenizer_main.cpp18
-rw-r--r--contrib/eppex/ISS.h472
-rw-r--r--contrib/eppex/LossyCounter.h8
-rw-r--r--contrib/eppex/eppex.cpp6
-rw-r--r--contrib/eppex/phrase-extract.cpp20
-rw-r--r--contrib/mira/Main.cpp2
-rw-r--r--contrib/mira/Main.h4
-rw-r--r--contrib/other-builds/cmake/boost.example/main.cpp10
-rw-r--r--contrib/other-builds/moses-cmd/moses-cmd.project35
-rw-r--r--contrib/other-builds/moses/moses.project5
-rw-r--r--contrib/other-builds/util/util.project2
-rw-r--r--contrib/python/moses/dictree.cpp544
-rwxr-xr-xcontrib/relent-filter/sigtest-filter/WIN32_functions.cpp462
-rwxr-xr-xcontrib/relent-filter/sigtest-filter/WIN32_functions.h48
-rwxr-xr-xcontrib/relent-filter/sigtest-filter/filter-pt.cpp2
-rwxr-xr-xcontrib/relent-filter/src/IOWrapper.cpp10
-rwxr-xr-xcontrib/relent-filter/src/Main.cpp33
-rwxr-xr-xcontrib/relent-filter/src/RelativeEntropyCalc.cpp2
-rwxr-xr-xcontrib/relent-filter/src/TranslationAnalysis.cpp2
-rw-r--r--contrib/rephraser/paraphrase.cpp4
-rw-r--r--contrib/server/Jamfile2
-rw-r--r--contrib/server/mosesserver.cpp202
-rw-r--r--contrib/sigtest-filter/WIN32_functions.cpp462
-rw-r--r--contrib/sigtest-filter/WIN32_functions.h48
-rw-r--r--contrib/sigtest-filter/filter-pt.cpp84
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-archetypeset.h12
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-array.h2
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-beam.h4
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-cpt.h4
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-crf.h24
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-denot.h6
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-dtree-cont.h18
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-dtree.h44
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-fixedmatrix.h20
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-gauss.h20
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-hash.h2
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-hmm.h32
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-hmm2.h28
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-hmmloop.h2
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-linsep.h4
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-mixture.h16
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-modelfile.h10
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-oblidtree.h6
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-prob.h6
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-probmodel.h20
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-racpt.h32
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-randvar.h18
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-safeids.h12
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-stream.h14
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-string.h10
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-stringindex.h2
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-tetrahex.h2
-rw-r--r--contrib/synlm/hhmm/rvtl/include/nl-timer.h2
-rw-r--r--contrib/synlm/hhmm/wsjparse/include/HHMMLangModel-gf.h2
-rw-r--r--contrib/synlm/hhmm/wsjparse/include/TextObsModel.h12
-rw-r--r--jam-files/check-environment.jam42
-rw-r--r--jam-files/curlpp.jam123
-rw-r--r--jam-files/sanity.jam8
-rw-r--r--jam-files/server.jam86
-rw-r--r--jam-files/xmlrpc-c.jam100
-rw-r--r--lm/bhiksha.cc6
-rw-r--r--lm/bhiksha.hh6
-rw-r--r--lm/binary_format.cc2
-rw-r--r--lm/binary_format.hh12
-rw-r--r--lm/blank.hh6
-rw-r--r--lm/build_binary_main.cc2
-rw-r--r--lm/builder/adjust_counts.cc56
-rw-r--r--lm/builder/adjust_counts.hh8
-rw-r--r--lm/builder/adjust_counts_test.cc2
-rw-r--r--lm/builder/corpus_count.cc28
-rw-r--r--lm/builder/corpus_count.hh2
-rw-r--r--lm/builder/initial_probabilities.cc42
-rw-r--r--lm/builder/initial_probabilities.hh6
-rw-r--r--lm/builder/interpolate.cc6
-rw-r--r--lm/builder/interpolate.hh4
-rw-r--r--lm/builder/joint_order.hh10
-rw-r--r--lm/builder/lmplz_main.cc10
-rw-r--r--lm/builder/ngram.hh10
-rw-r--r--lm/builder/pipeline.cc18
-rw-r--r--lm/builder/pipeline.hh2
-rw-r--r--lm/builder/print.cc2
-rw-r--r--lm/builder/print.hh6
-rw-r--r--lm/builder/sort.hh40
-rw-r--r--lm/enumerate_vocab.hh2
-rw-r--r--lm/facade.hh8
-rw-r--r--lm/filter/count_io.hh4
-rw-r--r--lm/filter/filter_main.cc2
-rw-r--r--lm/filter/format.hh12
-rw-r--r--lm/filter/phrase.cc26
-rw-r--r--lm/filter/phrase.hh14
-rw-r--r--lm/filter/phrase_table_vocab_main.cc4
-rw-r--r--lm/filter/thread.hh14
-rw-r--r--lm/filter/vocab.cc2
-rw-r--r--lm/filter/vocab.hh2
-rw-r--r--lm/filter/wrapper.hh2
-rw-r--r--lm/left.hh44
-rw-r--r--lm/left_test.cc4
-rw-r--r--lm/lm_exception.hh2
-rw-r--r--lm/max_order.hh2
-rw-r--r--lm/model.hh40
-rw-r--r--lm/model_test.cc10
-rw-r--r--lm/model_type.hh2
-rw-r--r--lm/ngram_query.hh4
-rw-r--r--lm/partial.hh10
-rw-r--r--lm/partial_test.cc4
-rw-r--r--lm/quantize.cc4
-rw-r--r--lm/quantize.hh10
-rw-r--r--lm/return.hh10
-rw-r--r--lm/search_trie.cc2
-rw-r--r--lm/sizes.cc2
-rw-r--r--lm/state.hh12
-rw-r--r--lm/trie.cc8
-rw-r--r--lm/trie.hh18
-rw-r--r--lm/trie_sort.cc24
-rw-r--r--lm/trie_sort.hh4
-rw-r--r--lm/value.hh4
-rw-r--r--lm/value_build.cc8
-rw-r--r--lm/value_build.hh2
-rw-r--r--lm/virtual_interface.hh32
-rw-r--r--lm/vocab.cc24
-rw-r--r--lm/vocab.hh14
-rw-r--r--lm/weights.hh4
-rw-r--r--lm/wrappers/nplm.cc4
-rw-r--r--lm/wrappers/nplm.hh2
-rw-r--r--mert/Data.cpp3
-rw-r--r--mert/ForestRescoreTest.cpp2
-rw-r--r--mert/Point.cpp7
-rw-r--r--mert/TER/bestShiftStruct.cpp1
-rw-r--r--mert/TODO7
-rw-r--r--mert/evaluator.cpp7
-rw-r--r--mert/kbmira.cpp5
-rw-r--r--mert/mert.cpp5
-rw-r--r--mert/pro.cpp9
-rw-r--r--misc/merge-sorted.cc22
-rw-r--r--misc/pmoses/pmoses.cc34
-rw-r--r--moses-cmd/LatticeMBRGrid.cpp10
-rw-r--r--moses-cmd/MainVW.cpp7
-rw-r--r--moses/BaseManager.cpp14
-rw-r--r--moses/BaseManager.h16
-rw-r--r--moses/ChartCellCollection.cpp3
-rw-r--r--moses/ChartCellCollection.h7
-rw-r--r--moses/ChartManager.cpp19
-rw-r--r--moses/ChartManager.h4
-rw-r--r--moses/ChartParser.cpp19
-rw-r--r--moses/ChartParser.h7
-rw-r--r--moses/ConfusionNet.cpp4
-rw-r--r--moses/ConfusionNet.h3
-rw-r--r--moses/ContextScope.h97
-rw-r--r--moses/DecodeStepTranslation.cpp34
-rw-r--r--moses/ExportInterface.cpp82
-rw-r--r--moses/FF/BleuScoreFeature.cpp1
-rw-r--r--moses/FF/CountNonTerms.cpp2
-rw-r--r--moses/FF/DecodeFeature.cpp5
-rw-r--r--moses/FF/DecodeFeature.h2
-rw-r--r--moses/FF/Factory.cpp43
-rw-r--r--moses/FF/FeatureFunction.cpp50
-rw-r--r--moses/FF/FeatureFunction.h51
-rw-r--r--moses/FF/InputFeature.cpp2
-rw-r--r--moses/FF/LexicalReordering/LexicalReordering.cpp26
-rw-r--r--moses/FF/LexicalReordering/LexicalReordering.h4
-rw-r--r--moses/FF/LexicalReordering/LexicalReorderingState.cpp6
-rw-r--r--moses/FF/LexicalReordering/LexicalReorderingState.h2
-rw-r--r--moses/FF/LexicalReordering/SparseReordering.cpp66
-rw-r--r--moses/FF/LexicalReordering/SparseReordering.h6
-rw-r--r--moses/FF/Model1Feature.cpp32
-rw-r--r--moses/FF/Model1Feature.h4
-rw-r--r--moses/FF/NieceTerminal.cpp2
-rw-r--r--moses/FF/OSM-Feature/osmHyp.cpp11
-rw-r--r--moses/FF/PhraseOrientationFeature.cpp6
-rw-r--r--moses/FF/PhrasePairFeature.cpp18
-rw-r--r--moses/FF/RulePairUnlexicalizedSource.cpp8
-rw-r--r--moses/FF/RuleScope.cpp4
-rw-r--r--moses/FF/SoftSourceSyntacticConstraintsFeature.cpp8
-rw-r--r--moses/FF/SpanLength.cpp3
-rw-r--r--moses/FF/StatefulFeatureFunction.cpp8
-rw-r--r--moses/FF/StatefulFeatureFunction.h2
-rw-r--r--moses/FF/StatelessFeatureFunction.cpp10
-rw-r--r--moses/FF/StatelessFeatureFunction.h2
-rw-r--r--moses/FF/SyntaxRHS.cpp4
-rw-r--r--moses/FF/UnalignedWordCountFeature.cpp6
-rw-r--r--moses/FF/VW/VW.h10
-rw-r--r--moses/GenerationDictionary.cpp2
-rw-r--r--moses/Hypothesis.cpp122
-rw-r--r--moses/Hypothesis.h8
-rw-r--r--moses/IOWrapper.cpp4
-rw-r--r--moses/Incremental.cpp14
-rw-r--r--moses/Incremental.h3
-rw-r--r--moses/InputType.cpp2
-rw-r--r--moses/InputType.h4
-rw-r--r--moses/Jamfile20
-rw-r--r--moses/LM/BilingualLM.cpp2
-rw-r--r--moses/LM/RDLM.cpp40
-rw-r--r--moses/Manager.cpp35
-rw-r--r--moses/Manager.h8
-rw-r--r--moses/MockHypothesis.cpp33
-rw-r--r--moses/MockHypothesis.h11
-rw-r--r--moses/Parameter.cpp118
-rw-r--r--moses/Parameter.h22
-rw-r--r--moses/ScoreComponentCollection.cpp49
-rw-r--r--moses/ScoreComponentCollection.h77
-rw-r--r--moses/Sentence.cpp78
-rw-r--r--moses/Sentence.h25
-rw-r--r--moses/StaticData.cpp553
-rw-r--r--moses/StaticData.h55
-rw-r--r--moses/Syntax/F2S/Manager-inl.h11
-rw-r--r--moses/Syntax/F2S/Manager.h9
-rw-r--r--moses/Syntax/Manager.cpp7
-rw-r--r--moses/Syntax/Manager.h2
-rw-r--r--moses/Syntax/RuleTableFF.cpp2
-rw-r--r--moses/Syntax/S2T/Manager-inl.h12
-rw-r--r--moses/Syntax/S2T/Manager.h2
-rw-r--r--moses/Syntax/T2S/Manager-inl.h6
-rw-r--r--moses/Syntax/T2S/Manager.h2
-rw-r--r--moses/TargetPhrase.cpp12
-rw-r--r--moses/TargetPhrase.h12
-rw-r--r--moses/ThreadPool.h2
-rw-r--r--moses/TrainingTask.h44
-rw-r--r--moses/TranslationModel/CompactPT/BlockHashIndex.h4
-rw-r--r--moses/TranslationModel/CompactPT/MurmurHash3.cpp850
-rw-r--r--moses/TranslationModel/CompactPT/MurmurHash3.h74
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp2
-rw-r--r--moses/TranslationModel/DynSAInclude/hash.h14
-rw-r--r--moses/TranslationModel/DynSAInclude/utils.h16
-rw-r--r--moses/TranslationModel/DynSuffixArray.cpp38
-rw-r--r--moses/TranslationModel/PhraseDictionary.cpp10
-rw-r--r--moses/TranslationModel/PhraseDictionary.h23
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp2
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp2
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModel.cpp4
-rw-r--r--moses/TranslationModel/PhraseDictionaryTransliteration.cpp2
-rw-r--r--moses/TranslationModel/PhraseDictionaryTreeAdaptor.cpp2
-rw-r--r--moses/TranslationModel/ProbingPT/hash.hh2
-rw-r--r--moses/TranslationModel/ProbingPT/storing.hh2
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp7
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp2
-rw-r--r--moses/TranslationModel/RuleTable/Trie.h2
-rw-r--r--moses/TranslationModel/SkeletonPT.cpp2
-rw-r--r--moses/TranslationModel/UG/Jamfile15
-rw-r--r--moses/TranslationModel/UG/TargetPhraseCollectionCache.cc179
-rw-r--r--moses/TranslationModel/UG/TargetPhraseCollectionCache.h62
-rw-r--r--moses/TranslationModel/UG/bitext-find.cc149
-rw-r--r--moses/TranslationModel/UG/count-ptable-features.cc4
-rw-r--r--moses/TranslationModel/UG/generic/file_io/ug_stream.cpp8
-rw-r--r--moses/TranslationModel/UG/generic/file_io/ug_stream.h2
-rw-r--r--moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp8
-rw-r--r--moses/TranslationModel/UG/generic/program_options/ug_get_options.h12
-rw-r--r--moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc12
-rw-r--r--moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h8
-rw-r--r--moses/TranslationModel/UG/generic/sampling/Sampling.h17
-rw-r--r--moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h22
-rw-r--r--moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc78
-rw-r--r--moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h16
-rw-r--r--moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc16
-rw-r--r--moses/TranslationModel/UG/mm/calc-coverage.cc2
-rw-r--r--moses/TranslationModel/UG/mm/custom-pt.cc34
-rw-r--r--moses/TranslationModel/UG/mm/mam2symal.cc14
-rw-r--r--moses/TranslationModel/UG/mm/mam_verify.cc12
-rw-r--r--moses/TranslationModel/UG/mm/mmlex-build.cc123
-rw-r--r--moses/TranslationModel/UG/mm/mmlex-lookup.cc30
-rw-r--r--moses/TranslationModel/UG/mm/mtt-build.cc102
-rw-r--r--moses/TranslationModel/UG/mm/mtt-count-words.cc6
-rw-r--r--moses/TranslationModel/UG/mm/mtt-demo1.cc10
-rw-r--r--moses/TranslationModel/UG/mm/mtt-dump.cc30
-rw-r--r--moses/TranslationModel/UG/mm/mtt.count.cc8
-rw-r--r--moses/TranslationModel/UG/mm/num_read_write.cc30
-rw-r--r--moses/TranslationModel/UG/mm/num_read_write.h8
-rw-r--r--moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h36
-rw-r--r--moses/TranslationModel/UG/mm/symal2mam.cc48
-rw-r--r--moses/TranslationModel/UG/mm/tpt_pickler.cc110
-rw-r--r--moses/TranslationModel/UG/mm/tpt_pickler.h36
-rw-r--r--moses/TranslationModel/UG/mm/tpt_tightindex.cc212
-rw-r--r--moses/TranslationModel/UG/mm/tpt_tightindex.h36
-rw-r--r--moses/TranslationModel/UG/mm/tpt_tokenindex.cc90
-rw-r--r--moses/TranslationModel/UG/mm/tpt_tokenindex.h30
-rw-r--r--moses/TranslationModel/UG/mm/tpt_typedefs.h2
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext.cc297
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext.h1767
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_agenda.h186
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h240
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h102
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_jstats.cc90
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_jstats.h51
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_pstats.cc86
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_pstats.h63
-rw-r--r--moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h10
-rw-r--r--moses/TranslationModel/UG/mm/ug_conll_record.h16
-rw-r--r--moses/TranslationModel/UG/mm/ug_corpus_token.cc6
-rw-r--r--moses/TranslationModel/UG/mm/ug_corpus_token.h28
-rw-r--r--moses/TranslationModel/UG/mm/ug_deptree.cc68
-rw-r--r--moses/TranslationModel/UG/mm/ug_deptree.h26
-rw-r--r--moses/TranslationModel/UG/mm/ug_im_bitext.cc87
-rw-r--r--moses/TranslationModel/UG/mm/ug_im_bitext.h130
-rw-r--r--moses/TranslationModel/UG/mm/ug_im_tsa.h116
-rw-r--r--moses/TranslationModel/UG/mm/ug_im_ttrack.h56
-rw-r--r--moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h30
-rw-r--r--moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h42
-rw-r--r--moses/TranslationModel/UG/mm/ug_lexical_reordering.cc47
-rw-r--r--moses/TranslationModel/UG/mm/ug_lexical_reordering.h6
-rw-r--r--moses/TranslationModel/UG/mm/ug_load_primer.h4
-rw-r--r--moses/TranslationModel/UG/mm/ug_lru_cache.h22
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_2d_table.h26
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_bitext.h82
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_tsa.h34
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_ttrack.h44
-rw-r--r--moses/TranslationModel/UG/mm/ug_mmbitext.cc68
-rw-r--r--moses/TranslationModel/UG/mm/ug_mmbitext.h46
-rw-r--r--moses/TranslationModel/UG/mm/ug_phrasepair.cc117
-rw-r--r--moses/TranslationModel/UG/mm/ug_phrasepair.h338
-rw-r--r--moses/TranslationModel/UG/mm/ug_sampling_bias.cc178
-rw-r--r--moses/TranslationModel/UG/mm/ug_sampling_bias.h86
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_array_entry.h14
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_base.h220
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h12
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h210
-rw-r--r--moses/TranslationModel/UG/mm/ug_ttrack_base.cc4
-rw-r--r--moses/TranslationModel/UG/mm/ug_ttrack_base.h136
-rw-r--r--moses/TranslationModel/UG/mm/ug_ttrack_position.h32
-rw-r--r--moses/TranslationModel/UG/mm/ug_typedefs.h2
-rw-r--r--moses/TranslationModel/UG/mmsapt.cpp618
-rw-r--r--moses/TranslationModel/UG/mmsapt.h278
-rw-r--r--moses/TranslationModel/UG/mmsapt_align.cc44
-rw-r--r--moses/TranslationModel/UG/ptable-describe-features.cc6
-rw-r--r--moses/TranslationModel/UG/ptable-lookup.cc31
-rw-r--r--moses/TranslationModel/UG/sapt_phrase_key.h2
-rw-r--r--moses/TranslationModel/UG/sapt_phrase_scorers.h2
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_base.h57
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_coherence.h14
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_lex1.h47
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_logcnt.h26
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_pbwd.h18
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_pfwd.h32
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_phrasecount.h10
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_provenance.h18
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_rareness.h14
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_unaligned.h24
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_wordcount.h10
-rw-r--r--moses/TranslationModel/UG/sim-pe.cc18
-rw-r--r--moses/TranslationModel/UG/spe-check-coverage.cc44
-rw-r--r--moses/TranslationModel/UG/spe-check-coverage2.cc14
-rw-r--r--moses/TranslationModel/UG/spe-check-coverage3.cc46
-rw-r--r--moses/TranslationModel/UG/try-align.cc134
-rw-r--r--moses/TranslationModel/UG/try-align2.cc174
-rw-r--r--moses/TranslationModel/UG/util/ibm1-align.cc32
-rw-r--r--moses/TranslationModel/UG/util/tokenindex.dump.cc2
-rw-r--r--moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp4
-rw-r--r--moses/TranslationModel/fuzzy-match/Vocabulary.cpp142
-rw-r--r--moses/TranslationOption.cpp4
-rw-r--r--moses/TranslationOption.h8
-rw-r--r--moses/TranslationOptionCollection.cpp14
-rw-r--r--moses/TranslationOptionCollection.h9
-rw-r--r--moses/TranslationOptionCollectionConfusionNet.cpp16
-rw-r--r--moses/TranslationOptionCollectionConfusionNet.h2
-rw-r--r--moses/TranslationOptionCollectionLattice.cpp10
-rw-r--r--moses/TranslationOptionCollectionLattice.h2
-rw-r--r--moses/TranslationOptionCollectionText.cpp4
-rw-r--r--moses/TranslationOptionCollectionText.h2
-rw-r--r--moses/TranslationTask.cpp144
-rw-r--r--moses/TranslationTask.h57
-rw-r--r--moses/TrellisPath.cpp40
-rw-r--r--moses/TrellisPath.h14
-rw-r--r--moses/TypeDef.h11
-rw-r--r--moses/Util.h2
-rw-r--r--moses/WordLattice.cpp7
-rw-r--r--moses/WordLattice.h6
-rw-r--r--moses/mbr.cpp4
-rw-r--r--moses/server/Optimizer.cpp24
-rw-r--r--moses/server/Optimizer.h4
-rw-r--r--moses/server/TranslationRequest.cpp154
-rw-r--r--moses/server/TranslationRequest.h72
-rw-r--r--moses/server/Translator.cpp16
-rw-r--r--moses/server/Translator.h6
-rw-r--r--moses/server/Updater.cpp16
-rw-r--r--moses/server/Updater.h8
-rw-r--r--moses/thread_safe_container.h125
-rw-r--r--phrase-extract/ExtractionPhrasePair.cpp2
-rw-r--r--phrase-extract/XmlTree.h86
-rw-r--r--phrase-extract/consolidate-main.cpp14
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.cpp2
-rw-r--r--phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp16
-rw-r--r--phrase-extract/extract-mixed-syntax/pugiconfig.hpp2
-rw-r--r--phrase-extract/extract-mixed-syntax/pugixml.cpp2
-rw-r--r--phrase-extract/extract-mixed-syntax/pugixml.hpp96
-rw-r--r--phrase-extract/pcfg-common/pcfg.cc6
-rw-r--r--phrase-extract/pcfg-common/tool.cc6
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.cc6
-rw-r--r--phrase-extract/pcfg-extract/main.cc6
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.cc6
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.cc6
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.cc6
-rw-r--r--phrase-extract/pcfg-score/main.cc6
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.cc6
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.cc6
-rw-r--r--scripts/ems/experiment.meta7
-rwxr-xr-xscripts/other/buckwalter.perl33
-rw-r--r--scripts/tokenizer/pre_tokenize_cleaning.py78
-rwxr-xr-xscripts/training/filter-model-given-input.pl8
-rw-r--r--scripts/training/rdlm/README4
-rwxr-xr-xscripts/training/rdlm/extract_syntactic_ngrams.py5
-rwxr-xr-xscripts/training/rdlm/extract_vocab.py4
-rwxr-xr-xscripts/training/rdlm/train_rdlm.py17
-rwxr-xr-xscripts/training/train-model.perl2
-rwxr-xr-xscripts/training/wrappers/madamira-wrapper.perl67
-rw-r--r--search/applied.hh12
-rw-r--r--search/config.hh2
-rw-r--r--search/edge.hh8
-rw-r--r--search/edge_generator.cc12
-rw-r--r--search/header.hh2
-rw-r--r--search/nbest.cc4
-rw-r--r--search/nbest.hh6
-rw-r--r--search/rule.hh4
-rw-r--r--search/types.hh2
-rw-r--r--search/vertex.cc4
-rw-r--r--search/vertex.hh4
-rw-r--r--search/vertex_generator.hh6
-rw-r--r--symal/symal.cpp6
-rw-r--r--util/Jamfile2
-rw-r--r--util/bit_packing.cc4
-rw-r--r--util/bit_packing.hh24
-rw-r--r--util/cat_compressed_main.cc2
-rw-r--r--util/ersatz_progress.cc2
-rw-r--r--util/ersatz_progress.hh6
-rw-r--r--util/exception.cc2
-rw-r--r--util/exception.hh16
-rw-r--r--util/file.hh6
-rw-r--r--util/file_piece.cc32
-rw-r--r--util/fixed_array.hh32
-rw-r--r--util/getopt.c2
-rw-r--r--util/getopt.hh2
-rw-r--r--util/mmap.cc2
-rw-r--r--util/mmap.hh22
-rw-r--r--util/multi_intersection.hh4
-rw-r--r--util/multi_intersection_test.cc2
-rw-r--r--util/murmur_hash.cc22
-rw-r--r--util/pcqueue.hh6
-rw-r--r--util/pool.hh4
-rw-r--r--util/probing_hash_table.hh12
-rw-r--r--util/random.cc43
-rw-r--r--util/random.hh229
-rw-r--r--util/random_test.cc191
-rw-r--r--util/read_compressed.cc8
-rw-r--r--util/read_compressed.hh8
-rw-r--r--util/scoped.hh2
-rw-r--r--util/sorted_uniform.hh6
-rw-r--r--util/sorted_uniform_test.cc2
-rw-r--r--util/stream/block.hh28
-rw-r--r--util/stream/chain.cc12
-rw-r--r--util/stream/chain.hh70
-rw-r--r--util/stream/config.hh16
-rw-r--r--util/stream/io.cc2
-rw-r--r--util/stream/io.hh6
-rw-r--r--util/stream/line_input.cc4
-rw-r--r--util/stream/multi_progress.cc4
-rw-r--r--util/stream/multi_progress.hh12
-rw-r--r--util/stream/multi_stream.hh2
-rw-r--r--util/stream/sort.hh60
-rw-r--r--util/stream/sort_test.cc2
-rw-r--r--util/stream/stream.hh2
-rw-r--r--util/stream/timer.hh2
-rw-r--r--util/thread_pool.hh2
-rw-r--r--util/tokenize_piece.hh2
-rw-r--r--util/usage.cc6
466 files changed, 10578 insertions, 8164 deletions
diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index 9c0d237d6..c08a4bf2f 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -1,4 +1,3 @@
-Please see the Moses website on how to compile and run Moses
- http://www.statmt.org/moses/?n=Development.GetStarted
+Instructions for building and installing Moses are online:
-blah blah blah
+ http://www.statmt.org/moses/?n=Development.GetStarted
diff --git a/Jamroot b/Jamroot
index 8b20d1345..65282ff63 100644
--- a/Jamroot
+++ b/Jamroot
@@ -72,43 +72,37 @@
#--clean to clean
#--debug-build to build with Og. Only available with gcc 4.8+
+import os ;
import option ;
import modules ;
import path ;
path-constant TOP : . ;
+
include $(TOP)/jam-files/sanity.jam ;
-include $(TOP)/jam-files/server.jam ;
-# exit : 0 ;
+home = [ os.environ "HOME" ] ;
+if [ path.exists $(home)/moses-environment.jam ]
+{
+ # for those of use who don't like typing in command line bjam options all day long
+ include $(home)/moses-environment.jam ;
+}
+include $(TOP)/jam-files/check-environment.jam ; # get resource locations
+ # from environment variables
+include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server
+include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only)
+
+# exit "done" : 0 ;
-if [ build_server ] != no
+max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
+if ! [ option.get "max-kenlm-order" ]
{
- xmlrpc-c-prefix = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --prefix" ] ;
- echo "XMLRPC-C: BUILDING MOSES WITH XMLRPC_C LIBRARY VERSION $(xmlrpc-c-version) FROM $(xmlrpc-c-prefix)" ;
-
- xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --cflags" ] ;
- requirements += <define>HAVE_XMLRPC_C ;
- requirements += <cxxflags>$(xmlrpc-cxxflags) ;
-
- xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --libs" ] ;
- for local i in [ SPLIT_BY_CHARACTERS $(xmlrpc-linkflags) : " " ]
- {
- local libname = [ MATCH "-l(xmlrpc.*)" : $(i) ] ;
- if $(libname)
- {
- external-lib $(libname)
- : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
- requirements += <library>$(libname) ;
- }
- local pathname = [ MATCH "-L(.*)" : $(i) ] ;
- if $(pathname)
- {
- requirements += <library-path>$(pathname) ;
- }
- }
+ # some classes in Moses pull in header files from KenLM, so this needs to be
+ # defined here, not in moses/lm/Jamfile
+ option.set "max-kenlm-order" : 6 ;
+ requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
}
-# echo $(requirements) ;
-# exit 0 ;
+# exit "all done" : 0 ;
+
boost 104400 ;
external-lib z ;
@@ -139,6 +133,7 @@ if [ option.get "filter-warnings" : : "yes" ] {
requirements += <cxxflags>-Wno-unused-but-set-variable ;
requirements += <cxxflags>-Wno-unused-result ;
requirements += <cxxflags>-Wno-unused-variable ;
+ requirements += <cxxflags>-Wcomment ;
}
if [ option.get "debug-build" : : "yes" ] {
@@ -228,10 +223,11 @@ build-projects lm util phrase-extract phrase-extract/syntax-common search moses
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
+ moses/TranslationModel/UG//bitext-find
moses/TranslationModel/UG//ptable-describe-features
moses/TranslationModel/UG//count-ptable-features
moses/TranslationModel/UG//ptable-lookup
- moses/TranslationModel/UG//spe-check-coverage
+ # moses/TranslationModel/UG//spe-check-coverage
moses/TranslationModel/UG/mm//mtt-demo1
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
@@ -307,6 +303,3 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
local temp = [ _shell "mkdir -p $(TOP)/bin" ] ;
local temp = [ _shell "rm $(TOP)/bin/moses_chart" ] ;
local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ;
-
-
-
diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp
index 9d52ee44e..f0f07c97d 100644
--- a/biconcor/Vocabulary.cpp
+++ b/biconcor/Vocabulary.cpp
@@ -1,101 +1,101 @@
-// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
-#include "Vocabulary.h"
-#include <fstream>
-
-namespace
-{
-
-const int MAX_LENGTH = 10000;
-
-} // namespace
-
-using namespace std;
-
-// as in beamdecoder/tables.cpp
-vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
-{
- vector< WORD_ID > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- } else if (isSpace && !betweenWords) {
- token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
- return token;
-}
-
-WORD_ID Vocabulary::StoreIfNew( const WORD& word )
-{
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
-
- if( i != lookup.end() )
- return i->second;
-
- WORD_ID id = vocab.size();
- vocab.push_back( word );
- lookup[ word ] = id;
- return id;
-}
-
-WORD_ID Vocabulary::GetWordID( const WORD &word ) const
-{
- map<WORD, WORD_ID>::const_iterator i = lookup.find( word );
- if( i == lookup.end() )
- return 0;
- WORD_ID w= (WORD_ID) i->second;
- return w;
-}
-
-void Vocabulary::Save(const string& fileName ) const
-{
- ofstream vcbFile;
- vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
-
- if (!vcbFile) {
- cerr << "Failed to open " << vcbFile << endl;
- exit(1);
- }
-
- vector< WORD >::const_iterator i;
- for(i = vocab.begin(); i != vocab.end(); i++) {
- const string &word = *i;
- vcbFile << word << endl;
- }
- vcbFile.close();
-}
-
-void Vocabulary::Load(const string& fileName )
-{
- ifstream vcbFile;
- char line[MAX_LENGTH];
- vcbFile.open(fileName.c_str());
-
- if (!vcbFile) {
- cerr << "no such file or directory: " << vcbFile << endl;
- exit(1);
- }
-
- cerr << "loading from " << fileName << endl;
- istream *fileP = &vcbFile;
- int count = 0;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- int length = 0;
- for(; line[length] != '\0'; length++);
- StoreIfNew( string( line, length ) );
- count++;
- }
- vcbFile.close();
- cerr << count << " word read, vocabulary size " << vocab.size() << endl;
-}
+// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
+#include "Vocabulary.h"
+#include <fstream>
+
+namespace
+{
+
+const int MAX_LENGTH = 10000;
+
+} // namespace
+
+using namespace std;
+
+// as in beamdecoder/tables.cpp
+vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
+{
+ vector< WORD_ID > token;
+ bool betweenWords = true;
+ int start=0;
+ int i=0;
+ for(; input[i] != '\0'; i++) {
+ bool isSpace = (input[i] == ' ' || input[i] == '\t');
+
+ if (!isSpace && betweenWords) {
+ start = i;
+ betweenWords = false;
+ } else if (isSpace && !betweenWords) {
+ token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
+ betweenWords = true;
+ }
+ }
+ if (!betweenWords)
+ token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
+ return token;
+}
+
+WORD_ID Vocabulary::StoreIfNew( const WORD& word )
+{
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
+
+ if( i != lookup.end() )
+ return i->second;
+
+ WORD_ID id = vocab.size();
+ vocab.push_back( word );
+ lookup[ word ] = id;
+ return id;
+}
+
+WORD_ID Vocabulary::GetWordID( const WORD &word ) const
+{
+ map<WORD, WORD_ID>::const_iterator i = lookup.find( word );
+ if( i == lookup.end() )
+ return 0;
+ WORD_ID w= (WORD_ID) i->second;
+ return w;
+}
+
+void Vocabulary::Save(const string& fileName ) const
+{
+ ofstream vcbFile;
+ vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
+
+ if (!vcbFile) {
+ cerr << "Failed to open " << vcbFile << endl;
+ exit(1);
+ }
+
+ vector< WORD >::const_iterator i;
+ for(i = vocab.begin(); i != vocab.end(); i++) {
+ const string &word = *i;
+ vcbFile << word << endl;
+ }
+ vcbFile.close();
+}
+
+void Vocabulary::Load(const string& fileName )
+{
+ ifstream vcbFile;
+ char line[MAX_LENGTH];
+ vcbFile.open(fileName.c_str());
+
+ if (!vcbFile) {
+ cerr << "no such file or directory: " << vcbFile << endl;
+ exit(1);
+ }
+
+ cerr << "loading from " << fileName << endl;
+ istream *fileP = &vcbFile;
+ int count = 0;
+ while(!fileP->eof()) {
+ SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
+ if (fileP->eof()) break;
+ int length = 0;
+ for(; line[length] != '\0'; length++);
+ StoreIfNew( string( line, length ) );
+ count++;
+ }
+ vcbFile.close();
+ cerr << count << " word read, vocabulary size " << vocab.size() << endl;
+}
diff --git a/contrib/c++tokenizer/tokenizer.cpp b/contrib/c++tokenizer/tokenizer.cpp
index 035ba2e97..6d3dd7046 100644
--- a/contrib/c++tokenizer/tokenizer.cpp
+++ b/contrib/c++tokenizer/tokenizer.cpp
@@ -46,7 +46,7 @@ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
-// anything rarely used will just be given as a string and compiled on demand by RE2
+// anything rarely used will just be given as a string and compiled on demand by RE2
const char *
SPC_BYTE = " ";
@@ -85,8 +85,8 @@ const char *ESCAPE_MOSES[] = {
"&apos;", // ' 6 (27)
"&quot;", // " 7 (22)
};
-
-const std::set<std::string>
+
+const std::set<std::string>
ESCAPE_SET = {
std::string(ESCAPE_MOSES[0]),
std::string(ESCAPE_MOSES[1]),
@@ -98,7 +98,7 @@ ESCAPE_SET = {
std::string(ESCAPE_MOSES[7]),
};
-const std::map<std::wstring,gunichar>
+const std::map<std::wstring,gunichar>
ENTITY_MAP = {
{ std::wstring(L"&quot;"), L'"' },
{ std::wstring(L"&amp;"), L'&' },
@@ -355,7 +355,7 @@ ENTITY_MAP = {
{ std::wstring(L"&diams;"), L'\u2666' }
};
-inline gunichar
+inline gunichar
get_entity(gunichar *ptr, size_t len) {
// try hex, decimal entity first
gunichar ech(0);
@@ -380,16 +380,16 @@ get_entity(gunichar *ptr, size_t len) {
ech = 0;
}
}
- if (ech)
+ if (ech)
return ech;
- std::map<std::wstring,gunichar>::const_iterator it =
+ std::map<std::wstring,gunichar>::const_iterator it =
ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
-inline gunichar
+inline gunichar
get_entity(char *ptr, size_t len) {
glong ulen = 0;
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
@@ -399,7 +399,7 @@ get_entity(char *ptr, size_t len) {
}
-inline std::string
+inline std::string
trim(const std::string& in)
{
std::size_t start = 0;
@@ -413,7 +413,7 @@ trim(const std::string& in)
}
-inline std::vector<std::string>
+inline std::vector<std::string>
split(const std::string& in)
{
std::vector<std::string> outv;
@@ -476,7 +476,7 @@ Tokenizer::Tokenizer(const Parameters& _)
//
// dtor deletes dynamically allocated per-language RE2 compiled expressions
//
-Tokenizer::~Tokenizer()
+Tokenizer::~Tokenizer()
{
for (auto& ptr : prot_pat_vec) {
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
@@ -491,7 +491,7 @@ Tokenizer::~Tokenizer()
// others into nbpre_gen_set
//
std::pair<int,int>
-Tokenizer::load_prefixes(std::ifstream& ifs)
+Tokenizer::load_prefixes(std::ifstream& ifs)
{
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
std::string line;
@@ -547,7 +547,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
try {
std::pair<int,int> counts = load_prefixes(cfg);
if (verbose_p) {
- std::cerr << "loaded " << counts.first << " non-numeric, "
+ std::cerr << "loaded " << counts.first << " non-numeric, "
<< counts.second << " numeric prefixes from "
<< nbpre_path << std::endl;
}
@@ -570,7 +570,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
std::string protpat_path(cfg_dir);
protpat_path.append("/protected_pattern.").append(lang_iso);
// default to generic version
- if (::access(protpat_path.c_str(),R_OK))
+ if (::access(protpat_path.c_str(),R_OK))
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
prot_pat_vec.push_back(&numprefixed_x);
@@ -596,7 +596,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
throw std::runtime_error(ess.str());
}
if (verbose_p) {
- std::cerr << "loaded " << npat << " protected patterns from "
+ std::cerr << "loaded " << npat << " protected patterns from "
<< protpat_path << std::endl;
}
} else if (verbose_p) {
@@ -612,7 +612,7 @@ Tokenizer::reset() {
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
-// assumes protections are applied already, some invariants are in place,
+// assumes protections are applied already, some invariants are in place,
// e.g. that successive chars <= ' ' have been normalized to a single ' '
//
void
@@ -633,7 +633,7 @@ Tokenizer::protected_tokenize(std::string& text) {
}
if (pos < textpc.size() && textpc[pos] != ' ')
words.push_back(textpc.substr(pos,textpc.size()-pos));
-
+
// regurgitate words with look-ahead handling for tokens with final mumble
std::string outs;
std::size_t nwords(words.size());
@@ -659,7 +659,7 @@ Tokenizer::protected_tokenize(std::string& text) {
// lower-case look-ahead does not break
sentence_break_p = false;
}
- }
+ }
outs.append(words[ii].data(),len);
if (sentence_break_p)
@@ -671,15 +671,15 @@ Tokenizer::protected_tokenize(std::string& text) {
}
-bool
+bool
Tokenizer::unescape(std::string& word) {
std::ostringstream oss;
std::size_t was = 0; // last processed
std::size_t pos = 0; // last unprocessed
std::size_t len = 0; // processed length
bool hit = false;
- for (std::size_t endp=0;
- (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
+ for (std::size_t endp=0;
+ (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
was = endp == std::string::npos ? pos : 1+endp) {
len = endp - pos + 1;
glong ulen(0);
@@ -703,7 +703,7 @@ Tokenizer::unescape(std::string& word) {
}
g_free(gtmp);
}
- if (was < word.size())
+ if (was < word.size())
oss << word.substr(was);
if (hit)
word = oss.str();
@@ -727,7 +727,7 @@ Tokenizer::escape(std::string& text) {
if (mod_p)
outs.append(pp,pt-pp+1);
} else {
- if (mod_p)
+ if (mod_p)
outs.append(pp,mk-pp);
pt = --mk;
}
@@ -751,7 +751,7 @@ Tokenizer::escape(std::string& text) {
} else if (*pt > ']') {
if (*pt =='|') { // 7c
sequence_p = ESCAPE_MOSES[0];
- }
+ }
} else if (*pt > 'Z') {
if (*pt == '<') { // 3e
sequence_p = ESCAPE_MOSES[4];
@@ -761,11 +761,11 @@ Tokenizer::escape(std::string& text) {
sequence_p = ESCAPE_MOSES[1];
} else if (*pt == ']') { // 5d
sequence_p = ESCAPE_MOSES[2];
- }
+ }
}
if (sequence_p) {
- if (pt > pp)
+ if (pt > pp)
outs.append(pp,pt-pp);
outs.append(sequence_p);
mod_p = true;
@@ -774,7 +774,7 @@ Tokenizer::escape(std::string& text) {
++pt;
}
}
-
+
if (mod_p) {
if (pp < pt) {
outs.append(pp,pt-pp);
@@ -795,13 +795,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
std::string text(buf);
std::string outs;
- if (skip_alltags_p)
+ if (skip_alltags_p)
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
// directed quote patches
size_t len = text.size();
- if (len > 2 && text.substr(0,2) == "``")
- text.replace(0,2,"`` ",3);
+ if (len > 2 && text.substr(0,2) == "``")
+ text.replace(0,2,"`` ",3);
else if (text[0] == '"')
text.replace(0,1,"`` ",3);
else if (text[0] == '`' || text[0] == '\'')
@@ -811,9 +811,9 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&text,x1_v_gg,one_gg);
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
-
+
// protect ellipsis
- for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
+ for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
text.replace(pos,3,"MANYELIPSIS",11);
// numeric commas
@@ -826,13 +826,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
// isolable slash
RE2::GlobalReplace(&text,slash_x,special_refs);
-
+
// isolate final period
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
-
+
// isolate q.m., e.m.
RE2::GlobalReplace(&text,qx_x,isolate_ref);
-
+
// isolate braces
RE2::GlobalReplace(&text,braces_x,isolate_ref);
@@ -866,7 +866,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
}
std::string ntext(SPC_BYTE);
ntext.append(text);
-
+
// convert double quote to paired single-quotes
RE2::GlobalReplace(&ntext,"\""," '' ");
@@ -894,7 +894,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
protected_tokenize(ntext);
-
+
// restore ellipsis
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
@@ -919,7 +919,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
int num = 0;
// this is the main moses-compatible tokenizer
-
+
// push all the prefixes matching protected patterns
std::vector<std::string> prot_stack;
std::string match;
@@ -942,7 +942,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
}
-
+
const char *pt(text.c_str());
const char *ep(pt + text.size());
while (pt < ep && *pt >= 0 && *pt <= ' ')
@@ -990,8 +990,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (!since_start) {
if (std::isalpha(char(*ucs4)))
alpha_prefix++;
- } else if (alpha_prefix == since_start
- && char(*ucs4) == ':'
+ } else if (alpha_prefix == since_start
+ && char(*ucs4) == ':'
&& next_type != G_UNICODE_SPACE_SEPARATOR) {
in_url_p = true;
}
@@ -1018,7 +1018,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
// fallthough
case G_UNICODE_UPPERCASE_LETTER:
case G_UNICODE_LOWERCASE_LETTER:
- if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
+ if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
curr_uch = g_unichar_tolower(*ucs4);
break;
case G_UNICODE_SPACING_MARK:
@@ -1082,8 +1082,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
substitute_p = L"@-@";
post_break_p = pre_break_p = true;
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
- ( curr_uch > gunichar(L'\u2011')
- && curr_uch != gunichar(L'\u30A0')
+ ( curr_uch > gunichar(L'\u2011')
+ && curr_uch != gunichar(L'\u30A0')
&& curr_uch < gunichar(L'\uFE63') ) ) {
// dash, not a hyphen
post_break_p = pre_break_p = true;
@@ -1151,7 +1151,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
default:
post_break_p = pre_break_p = prev_uch != curr_uch;
break;
- }
+ }
}
}
break;
@@ -1159,8 +1159,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
switch (curr_uch) {
case gunichar(L':'):
case gunichar(L'/'):
- if (refined_p && !in_url_p
- && prev_type == G_UNICODE_DECIMAL_NUMBER
+ if (refined_p && !in_url_p
+ && prev_type == G_UNICODE_DECIMAL_NUMBER
&& next_type == G_UNICODE_DECIMAL_NUMBER) {
break;
}
@@ -1178,7 +1178,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
break;
case gunichar(L'&'):
if (unescape_p) {
- if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
+ if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
gunichar *eptr = nxt4;
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
@@ -1223,16 +1223,16 @@ Tokenizer::quik_tokenize(const std::string& buf)
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
-
+
}
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
- if (escape_p)
+ if (escape_p)
substitute_p = L"&amp;";
break;
case gunichar(L'\''):
if (english_p) {
if (!in_url_p) {
- bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
+ bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|| next_type == G_UNICODE_UPPERCASE_LETTER;
pre_break_p = true;
if (next_letter_p && refined_p) {
@@ -1241,9 +1241,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
*(uptr - 1) = gunichar(L' ');
*(uptr++) = prev_uch;
pre_break_p = false;
- }
+ }
}
- post_break_p = since_start == 0
+ post_break_p = since_start == 0
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
}
} else if (latin_p) {
@@ -1252,12 +1252,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = !in_url_p;
}
- if (escape_p)
+ if (escape_p)
substitute_p = L"&apos;";
break;
case gunichar(L'"'):
post_break_p = pre_break_p = true;
- if (escape_p)
+ if (escape_p)
substitute_p = L"&quot;";
break;
case gunichar(L','):
@@ -1303,7 +1303,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
// terminal isolated letter does not break
- } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
+ } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
// lower-case look-ahead does not break
} else {
@@ -1315,7 +1315,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
pre_break_p = true;
break;
}
- }
+ }
break;
}
} else {
@@ -1346,11 +1346,11 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L')'):
break;
case gunichar(L'['):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&#91;";
break;
case gunichar(L']'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&#93;";
break;
default:
@@ -1377,7 +1377,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (english_p) {
if (!in_url_p) {
pre_break_p = true;
- post_break_p = since_start == 0 ||
+ post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
}
} else if (latin_p) {
@@ -1386,23 +1386,23 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = !in_url_p;
}
- if (escape_p)
+ if (escape_p)
substitute_p = L"&apos;";
- else
+ else
curr_uch = gunichar(L'\'');
break;
case gunichar(L'|'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&#124;";
post_break_p = pre_break_p = true;
break;
case gunichar(L'<'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&lt;";
post_break_p = pre_break_p = true;
break;
case gunichar(L'>'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&gt;";
post_break_p = pre_break_p = true;
break;
@@ -1414,7 +1414,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L'='):
case gunichar(L'~'):
in_num_p = false;
- post_break_p = pre_break_p = !in_url_p;
+ post_break_p = pre_break_p = !in_url_p;
break;
case gunichar(L'+'):
post_break_p = pre_break_p = !in_url_p;
@@ -1444,12 +1444,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
curr_uch = gunichar(L' ');
} else if (curr_uch < gunichar(L' ')) {
curr_uch = gunichar(L' ');
- } else if (curr_uch == gunichar(L'\u0092') &&
+ } else if (curr_uch == gunichar(L'\u0092') &&
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
// observed corpus corruption case
if (english_p) {
pre_break_p = true;
- post_break_p = since_start == 0 ||
+ post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
} else if (latin_p) {
post_break_p = true;
@@ -1457,9 +1457,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = true;
}
- if (escape_p)
+ if (escape_p)
substitute_p = L"&apos;";
- else
+ else
curr_uch = gunichar(L'\'');
} else {
post_break_p = pre_break_p = true;
@@ -1491,7 +1491,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
in_url_p = in_num_p = false;
break;
}
-
+
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
if (since_start) {
// non-empty token emitted previously, so pre-break must emit token separator
@@ -1501,8 +1501,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (curr_uch == gunichar(L' '))
// suppress emission below, fall-through to substitute logic
curr_uch = 0;
- }
-
+ }
+
if (substitute_p) {
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
*uptr++ = *sptr;
@@ -1521,7 +1521,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
glong nbytes = 0;
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
- if (utf8[nbytes-1] == ' ')
+ if (utf8[nbytes-1] == ' ')
--nbytes;
text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
g_free(utf8);
@@ -1552,7 +1552,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
-std::size_t
+std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
std::size_t line_no = 0;
@@ -1561,10 +1561,10 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
std::vector< std::vector< std::string > > results(nthreads);
std::vector< boost::thread > workers(nthreads);
bool done_p = !(is.good() && os.good());
-
+
for (std::size_t tranche = 0; !done_p; ++tranche) {
-
+
// for loop starting threads for chunks of input
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
@@ -1589,19 +1589,19 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
results[ithread].resize(line_pos);
break;
}
- lines[ithread][line_pos].clear();
- } else if (skip_xml_p &&
- (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
- lines[ithread][line_pos].clear();
+ lines[ithread][line_pos].clear();
+ } else if (skip_xml_p &&
+ (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
+ lines[ithread][line_pos].clear();
} else {
- lines[ithread][line_pos] =
- std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
+ lines[ithread][line_pos] =
+ std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
}
- }
+ }
if (line_pos) {
- workers[ithread] =
- boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
+ workers[ithread] =
+ boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
}
} // end for loop starting threads
@@ -1616,22 +1616,22 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
if (nlin != nres) {
std::ostringstream emsg;
- emsg << "Tranche " << tranche
- << " worker " << ithread << "/" << nthreads
+ emsg << "Tranche " << tranche
+ << " worker " << ithread << "/" << nthreads
<< " |lines|==" << nlin << " != |results|==" << nres;
throw std::runtime_error(emsg.str());
}
- for (std::size_t ires = 0; ires < nres; ++ires)
+ for (std::size_t ires = 0; ires < nres; ++ires)
os << results[ithread][ires] << std::endl;
} // end loop over joined results
-
+
if (verbose_p) {
std::cerr << line_no << ' ';
std::cerr.flush();
}
-
+
} // end loop over chunks
return line_no;
@@ -1642,18 +1642,18 @@ std::string
Tokenizer::detokenize(const std::string& buf)
{
std::vector<std::string> words = split(trim(buf));
-
+
std::size_t squotes = 0;
std::size_t dquotes = 0;
std::string prepends("");
std::ostringstream oss;
-
+
std::size_t nwords = words.size();
std::size_t iword = 0;
- if (unescape_p)
- for (auto &word: words)
+ if (unescape_p)
+ for (auto &word: words)
unescape(word);
for (auto &word: words) {
@@ -1665,13 +1665,13 @@ Tokenizer::detokenize(const std::string& buf)
} else if (RE2::FullMatch(word,left_x)) {
oss << word;
prepends = SPC_BYTE;
- } else if (english_p && iword
- && RE2::FullMatch(word,curr_en_x)
+ } else if (english_p && iword
+ && RE2::FullMatch(word,curr_en_x)
&& RE2::FullMatch(words[iword-1],pre_en_x)) {
oss << word;
prepends = SPC_BYTE;
- } else if (latin_p && iword < nwords - 2
- && RE2::FullMatch(word,curr_fr_x)
+ } else if (latin_p && iword < nwords - 2
+ && RE2::FullMatch(word,curr_fr_x)
&& RE2::FullMatch(words[iword+1],post_fr_x)) {
oss << prepends << word;
prepends.clear();
@@ -1679,7 +1679,7 @@ Tokenizer::detokenize(const std::string& buf)
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
if (english_p && iword
- && word.at(0) == '\''
+ && word.at(0) == '\''
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
oss << word;
prepends = SPC_BYTE;
@@ -1698,7 +1698,7 @@ Tokenizer::detokenize(const std::string& buf)
prepends = SPC_BYTE;
if (word.at(0) == '\'')
squotes++;
- else if (word.at(0) == '"')
+ else if (word.at(0) == '"')
dquotes++;
}
} else {
@@ -1707,8 +1707,8 @@ Tokenizer::detokenize(const std::string& buf)
}
iword++;
}
-
-
+
+
std::string text(oss.str());
RE2::GlobalReplace(&text," +",SPC_BYTE);
RE2::GlobalReplace(&text,"\n ","\n");
@@ -1718,14 +1718,14 @@ Tokenizer::detokenize(const std::string& buf)
std::size_t
-Tokenizer::detokenize(std::istream& is, std::ostream& os)
+Tokenizer::detokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
- if (istr.empty())
+ if (istr.empty())
continue;
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << istr << std::endl;
@@ -1749,7 +1749,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
return parts;
}
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
-
+
const wchar_t GENL_HYPH = L'\u2010';
const wchar_t IDEO_STOP = L'\u3002';
const wchar_t KANA_MDOT = L'\u30FB';
@@ -1786,7 +1786,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
std::vector<std::size_t> breaks;
std::set<std::size_t> suppress;
-
+
for (; icp <= ncp; ++icp) {
currwc = wchar_t(ucs4[icp]);
curr_type = g_unichar_type(currwc);
@@ -1798,7 +1798,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
case G_UNICODE_OTHER_NUMBER:
curr_class = numba;
curr_word_p = true;
- break;
+ break;
case G_UNICODE_LOWERCASE_LETTER:
case G_UNICODE_MODIFIER_LETTER:
case G_UNICODE_OTHER_LETTER:
@@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (currwc >= SMAL_HYPH) {
curr_word_p = true;
} else {
- curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
+ curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
}
break;
case G_UNICODE_CLOSE_PUNCTUATION:
@@ -1860,7 +1860,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
curr_word_p = false;
break;
}
-
+
// # condition for prefix test
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
@@ -1875,7 +1875,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (curr_word_p) {
if (!fini_word) {
init_word = ocp;
- }
+ }
fini_word = ocp+1;
dotslen = finilen = 0;
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
@@ -1893,7 +1893,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else {
init_word = fini_word = 0;
}
-
+
if (check_abbr_p) {
// not a valid word character or post-word punctuation character: check word
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
@@ -1986,7 +1986,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
init_word = fini_word = 0;
}
-
+
if (seqpos >= SEQ_LIM) {
seqpos = 0;
}
@@ -2015,7 +2015,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
continue;
}
}
-
+
if (!seqpos) {
if (curr_class != blank) {
uout[ocp++] = gunichar(currwc);
@@ -2024,7 +2024,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
continue;
}
-
+
if (curr_class == blank) {
if (prev_class != blank) {
seq[seqpos] = blank;
@@ -2034,7 +2034,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
if (icp < ncp)
continue;
- }
+ }
if (curr_class >= quote && curr_class <= pfini) {
if (prev_class < quote || prev_class > pfini) {
@@ -2158,8 +2158,8 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
endpos = chkpos;
continue;
- }
- if (g_unichar_isgraph(uout[chkpos]))
+ }
+ if (g_unichar_isgraph(uout[chkpos]))
break;
endpos = chkpos;
}
@@ -2171,17 +2171,17 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (continuation_ptr)
*continuation_ptr = endpos > iop;
iop = nextpos;
- }
-
+ }
+
g_free(uout);
g_free(ucs4);
-
+
return parts;
}
std::pair<std::size_t,std::size_t>
-Tokenizer::splitter(std::istream& is, std::ostream& os)
+Tokenizer::splitter(std::istream& is, std::ostream& os)
{
std::pair<std::size_t,std::size_t> counts = { 0, 0 };
bool continuation_p = false;
@@ -2197,7 +2197,7 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
if (istr.empty() && (is.eof() ||!para_marks_p))
continue;
- if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
continue;
std::vector<std::string> sentences(splitter(istr,&continuation_p));
@@ -2221,13 +2221,13 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
os << " ";
pending_gap = false;
}
-
- for (std::size_t ii = 0; ii < nsents-1; ++ii)
+
+ for (std::size_t ii = 0; ii < nsents-1; ++ii)
os << sentences[ii] << std::endl;
-
+
os << sentences[nsents-1];
- if (continuation_p)
+ if (continuation_p)
pending_gap = !split_breaks_p;
if (!pending_gap)
os << std::endl;
diff --git a/contrib/c++tokenizer/tokenizer.h b/contrib/c++tokenizer/tokenizer.h
index cc1de2770..978f20197 100644
--- a/contrib/c++tokenizer/tokenizer.h
+++ b/contrib/c++tokenizer/tokenizer.h
@@ -26,7 +26,7 @@ class Tokenizer {
private:
- typedef enum {
+ typedef enum {
empty = 0,
blank,
upper, // upper case
@@ -56,7 +56,7 @@ private:
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
- // compiled protected patterns
+ // compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@@ -96,10 +96,10 @@ protected:
Tokenizer *tokenizer;
std::vector<std::string>& in;
std::vector<std::string>& out;
-
- VectorTokenizerCallable(Tokenizer *_tokenizer,
- std::vector<std::string>& _in,
- std::vector<std::string>& _out)
+
+ VectorTokenizerCallable(Tokenizer *_tokenizer,
+ std::vector<std::string>& _in,
+ std::vector<std::string>& _out)
: tokenizer(_tokenizer)
, in(_in)
, out(_out) {
@@ -107,10 +107,10 @@ protected:
void operator()() {
out.resize(in.size());
- for (std::size_t ii = 0; ii < in.size(); ++ii)
+ for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty())
out[ii] = in[ii];
- else if (tokenizer->penn_p)
+ else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]);
else
out[ii] = tokenizer->quik_tokenize(in[ii]);
diff --git a/contrib/c++tokenizer/tokenizer_main.cpp b/contrib/c++tokenizer/tokenizer_main.cpp
index 7adb599e7..358a68cc3 100644
--- a/contrib/c++tokenizer/tokenizer_main.cpp
+++ b/contrib/c++tokenizer/tokenizer_main.cpp
@@ -10,8 +10,8 @@ using namespace TOKENIZER_NAMESPACE ;
#endif
-void
-usage(const char *path)
+void
+usage(const char *path)
{
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
@@ -89,7 +89,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
- if (line.empty())
+ if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
@@ -127,7 +127,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
}
-int main(int ac, char **av)
+int main(int ac, char **av)
{
int rc = 0;
Parameters params;
@@ -140,7 +140,7 @@ int main(int ac, char **av)
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
- while (++av,--ac) {
+ while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
case 'a':
@@ -244,7 +244,7 @@ int main(int ac, char **av)
if (comma) {
*comma++ = 0;
params.chunksize = std::strtoul(comma,0,0);
- }
+ }
params.nthreads = std::strtoul(*av,0,0);
} else {
params.args.push_back(std::string(*av));
@@ -275,7 +275,7 @@ int main(int ac, char **av)
cfg_mos_str.append("/moses");
if (!::access(cfg_mos_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_mos_str.c_str());
- } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
+ } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_shr_str.c_str());
} else if (!::access(cfg_dir_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_dir_str.c_str());
@@ -287,7 +287,7 @@ int main(int ac, char **av)
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
- }
+ }
std::unique_ptr<std::ofstream> pofs = 0;
if (!params.out_path.empty()) {
@@ -345,7 +345,7 @@ int main(int ac, char **av)
if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
}
- }
+ }
return rc;
}
diff --git a/contrib/eppex/ISS.h b/contrib/eppex/ISS.h
index 7921fcbf8..9c4e1fc22 100644
--- a/contrib/eppex/ISS.h
+++ b/contrib/eppex/ISS.h
@@ -1,236 +1,236 @@
-/**
- * ISS (Indexed Strings Storage) - memory efficient storage for permanent strings.
- *
- * Implementation note: use #define USE_HASHSET to switch between implementation
- * using __gnu_cxx::hash_set and implementation using std::set.
- *
- * (C) Ceslav Przywara, UFAL MFF UK, 2011
- *
- * $Id$
- */
-
-#ifndef _ISS_H
-#define _ISS_H
-
-#include <limits>
-#include <vector>
-#include <string.h>
-
-// Use hashset instead of std::set for string-to-number indexing?
-#ifdef USE_HASHSET
-#include <ext/hash_set>
-#else
-#include <set>
-#endif
-
-#include <boost/pool/pool.hpp>
-
-#ifdef USE_HASHSET
-// Forward declaration of comparator functor.
-template<class IndType>
-class StringsEqualComparator;
-
-template<class IndType>
-class Hasher;
-#else
-// Forward declaration of comparator functor.
-template<class IndType>
-class StringsLessComparator;
-#endif
-
-/**
- */
-template<class IndType>
-class IndexedStringsStorage {
-
-public:
-
- typedef IndType index_type;
-
-#ifdef USE_HASHSET
- typedef StringsEqualComparator<IndType> equality_comparator_t;
-
- typedef Hasher<IndType> hasher_t;
-
- /** @typedef Hash set used as lookup table (string -> numeric index). */
- typedef __gnu_cxx::hash_set<IndType, hasher_t, equality_comparator_t> index_t;
-#else
- typedef StringsLessComparator<IndType> less_comparator_t;
-
- /** @typedef Set used as lookup table (string -> numeric index). */
- typedef std::set<IndType, less_comparator_t> index_t;
-#endif
- /** @typedef Container of pointers to stored C-strings. Acts as
- * conversion table: numeric index -> string.
- */
- typedef std::vector<const char*> table_t;
-
-private:
-
- /** @var memory pool used to store C-strings */
- boost::pool<> _storage;
-
- /** @var index-to-string conversion table */
- table_t _table;
-
- /** @var index lookup table */
- index_t _index;
-
-public:
- /** Default constructor.
- */
- IndexedStringsStorage(void);
-
- /** @return True, if the indices are exhausted (new strings cannot be stored).
- */
- inline bool is_full(void) const { return _table.size() == std::numeric_limits<IndType>::max(); }
-
- /** Retrieves pointer to C-string instance represented by given index.
- * Note: No range checks are performed!
- * @param index Index of C-string to retrieve.
- * @return Pointer to stored C-string instance.
- */
- inline const char* get(IndType index) const { return _table[index]; }
-
- /** Stores the string and returns its numeric index.
- * @param str Pointer to C-string to store.
- * @return Index of stored copy of str.
- * @throw std::bad_alloc When insertion of new string would cause
- * overflow of indices datatype.
- */
- IndType put(const char* str);
-
- /** @return Number of unique strings stored so far.
- */
- inline table_t::size_type size(void) const { return _table.size(); }
-};
-
-
-/** Functor designed for less than comparison of C-strings stored within StringStore.
- * @param IndType Type of numerical indices of strings within given StringStore.
- */
-#ifdef USE_HASHSET
-template<class IndType>
-class StringsEqualComparator: public std::binary_function<IndType, IndType, bool> {
-#else
-template<class IndType>
-class StringsLessComparator: public std::binary_function<IndType, IndType, bool> {
-#endif
- /** @var conversion table: index -> string (necessary for indices comparison) */
- const typename IndexedStringsStorage<IndType>::table_t& _table;
-public:
-#ifdef USE_HASHSET
- StringsEqualComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
-#else
- StringsLessComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
-#endif
-
- /** Comparison of two pointers to C-strings.
- * @param lhs Pointer to 1st C-string.
- * @param rhs Pointer to 2nd C-string.
- * @return True, if 1st argument is equal/less than 2nd argument.
- */
- inline bool operator()(IndType lhs, IndType rhs) const {
-#ifdef USE_HASHSET
- return strcmp(_table[lhs], _table[rhs]) == 0;
-#else
- return strcmp(_table[lhs], _table[rhs]) < 0;
-#endif
- }
-};
-
-#ifdef USE_HASHSET
-/** Functor... TODO.
- */
-template<class IndType>
-class Hasher: public std::unary_function<IndType, size_t> {
-
- __gnu_cxx::hash<const char*> _hash;
-
- /** @var conversion table: index -> string (necessary for indices comparison) */
- const typename IndexedStringsStorage<IndType>::table_t& _table;
-
-public:
- /** */
- Hasher<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _hash(), _table(table) {}
-
- /** Hashing function.
- * @param index
- * @return Counted hash.
- */
- inline size_t operator()(const IndType index) const {
- return _hash(_table[index]);
- }
-};
-#endif
-
-template <class IndType>
-#ifdef USE_HASHSET
-IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(100, hasher_t(_table), equality_comparator_t(_table)) {}
-#else
-IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(less_comparator_t(_table)) {}
-#endif
-
-template <class IndType>
-IndType IndexedStringsStorage<IndType>::put(const char* str) {
-
- if ( this->is_full() ) {
- // What a pity, not a single index left to spend.
- throw std::bad_alloc();
- }
-
- // To use the index for lookup we first have to store passed string
- // in conversion table (cause during lookup we compare the strings indirectly
- // by using their indices).
- // Note: thread unsafe! TODO: Redesing.
- IndType index = static_cast<IndType>(_table.size());
- _table.push_back(str);
-
-#ifdef USE_HASHSET
- //
- typename index_t::iterator iIndex = _index.find(index);
-#else
- // A lower_bound() search enables us to use "found" iterator as a hint for
- // eventual insertion.
- typename index_t::iterator iIndex = _index.lower_bound(index);
-#endif
-
- if ( (iIndex != _index.end())
-#ifndef USE_HASHSET
- // In case of lower_bound() search we have to also compare found item
- // with passed string.
- && (strcmp(_table[*iIndex], str) == 0)
-#endif
- ) {
- // String is already present in storage!
- // Pop back temporary stored pointer...
- _table.pop_back();
- // ...and return numeric index to already stored copy of `str`.
- return static_cast<IndType>(*iIndex);
- }
-
- // String not found within storage.
-
- // Allocate memory required for string storage...
- char* mem = static_cast<char*>(_storage.ordered_malloc(strlen(str) + 1));
- // ...and fill it with copy of passed string.
- strcpy(mem, str);
-
- // Overwrite temporary stored pointer to `str` with pointer to freshly
- // saved copy.
- _table[index] = mem;
-
-#ifdef USE_HASHSET
- // Insert the index into lookup table.
- _index.insert(index);
-#else
- // Insert the index into lookup table (use previously retrieved iterator
- // as a hint).
- _index.insert(iIndex, index);
-#endif
-
- // Finally.
- return index;
-}
-
-#endif
+/**
+ * ISS (Indexed Strings Storage) - memory efficient storage for permanent strings.
+ *
+ * Implementation note: use #define USE_HASHSET to switch between implementation
+ * using __gnu_cxx::hash_set and implementation using std::set.
+ *
+ * (C) Ceslav Przywara, UFAL MFF UK, 2011
+ *
+ * $Id$
+ */
+
+#ifndef _ISS_H
+#define _ISS_H
+
+#include <limits>
+#include <vector>
+#include <string.h>
+
+// Use hashset instead of std::set for string-to-number indexing?
+#ifdef USE_HASHSET
+#include <ext/hash_set>
+#else
+#include <set>
+#endif
+
+#include <boost/pool/pool.hpp>
+
+#ifdef USE_HASHSET
+// Forward declaration of comparator functor.
+template<class IndType>
+class StringsEqualComparator;
+
+template<class IndType>
+class Hasher;
+#else
+// Forward declaration of comparator functor.
+template<class IndType>
+class StringsLessComparator;
+#endif
+
+/**
+ */
+template<class IndType>
+class IndexedStringsStorage {
+
+public:
+
+ typedef IndType index_type;
+
+#ifdef USE_HASHSET
+ typedef StringsEqualComparator<IndType> equality_comparator_t;
+
+ typedef Hasher<IndType> hasher_t;
+
+ /** @typedef Hash set used as lookup table (string -> numeric index). */
+ typedef __gnu_cxx::hash_set<IndType, hasher_t, equality_comparator_t> index_t;
+#else
+ typedef StringsLessComparator<IndType> less_comparator_t;
+
+ /** @typedef Set used as lookup table (string -> numeric index). */
+ typedef std::set<IndType, less_comparator_t> index_t;
+#endif
+ /** @typedef Container of pointers to stored C-strings. Acts as
+ * conversion table: numeric index -> string.
+ */
+ typedef std::vector<const char*> table_t;
+
+private:
+
+ /** @var memory pool used to store C-strings */
+ boost::pool<> _storage;
+
+ /** @var index-to-string conversion table */
+ table_t _table;
+
+ /** @var index lookup table */
+ index_t _index;
+
+public:
+ /** Default constructor.
+ */
+ IndexedStringsStorage(void);
+
+ /** @return True, if the indices are exhausted (new strings cannot be stored).
+ */
+ inline bool is_full(void) const { return _table.size() == std::numeric_limits<IndType>::max(); }
+
+ /** Retrieves pointer to C-string instance represented by given index.
+ * Note: No range checks are performed!
+ * @param index Index of C-string to retrieve.
+ * @return Pointer to stored C-string instance.
+ */
+ inline const char* get(IndType index) const { return _table[index]; }
+
+ /** Stores the string and returns its numeric index.
+ * @param str Pointer to C-string to store.
+ * @return Index of stored copy of str.
+ * @throw std::bad_alloc When insertion of new string would cause
+ * overflow of indices datatype.
+ */
+ IndType put(const char* str);
+
+ /** @return Number of unique strings stored so far.
+ */
+ inline table_t::size_type size(void) const { return _table.size(); }
+};
+
+
+/** Functor designed for less than comparison of C-strings stored within StringStore.
+ * @param IndType Type of numerical indices of strings within given StringStore.
+ */
+#ifdef USE_HASHSET
+template<class IndType>
+class StringsEqualComparator: public std::binary_function<IndType, IndType, bool> {
+#else
+template<class IndType>
+class StringsLessComparator: public std::binary_function<IndType, IndType, bool> {
+#endif
+ /** @var conversion table: index -> string (necessary for indices comparison) */
+ const typename IndexedStringsStorage<IndType>::table_t& _table;
+public:
+#ifdef USE_HASHSET
+ StringsEqualComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
+#else
+ StringsLessComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
+#endif
+
+ /** Comparison of two pointers to C-strings.
+ * @param lhs Pointer to 1st C-string.
+ * @param rhs Pointer to 2nd C-string.
+ * @return True, if 1st argument is equal/less than 2nd argument.
+ */
+ inline bool operator()(IndType lhs, IndType rhs) const {
+#ifdef USE_HASHSET
+ return strcmp(_table[lhs], _table[rhs]) == 0;
+#else
+ return strcmp(_table[lhs], _table[rhs]) < 0;
+#endif
+ }
+};
+
+#ifdef USE_HASHSET
+/** Functor... TODO.
+ */
+template<class IndType>
+class Hasher: public std::unary_function<IndType, size_t> {
+
+ __gnu_cxx::hash<const char*> _hash;
+
+ /** @var conversion table: index -> string (necessary for indices comparison) */
+ const typename IndexedStringsStorage<IndType>::table_t& _table;
+
+public:
+ /** */
+ Hasher<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _hash(), _table(table) {}
+
+ /** Hashing function.
+ * @param index
+ * @return Counted hash.
+ */
+ inline size_t operator()(const IndType index) const {
+ return _hash(_table[index]);
+ }
+};
+#endif
+
+template <class IndType>
+#ifdef USE_HASHSET
+IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(100, hasher_t(_table), equality_comparator_t(_table)) {}
+#else
+IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(less_comparator_t(_table)) {}
+#endif
+
+template <class IndType>
+IndType IndexedStringsStorage<IndType>::put(const char* str) {
+
+ if ( this->is_full() ) {
+ // What a pity, not a single index left to spend.
+ throw std::bad_alloc();
+ }
+
+ // To use the index for lookup we first have to store passed string
+ // in conversion table (cause during lookup we compare the strings indirectly
+ // by using their indices).
+ // Note: thread unsafe! TODO: Redesing.
+ IndType index = static_cast<IndType>(_table.size());
+ _table.push_back(str);
+
+#ifdef USE_HASHSET
+ //
+ typename index_t::iterator iIndex = _index.find(index);
+#else
+ // A lower_bound() search enables us to use "found" iterator as a hint for
+ // eventual insertion.
+ typename index_t::iterator iIndex = _index.lower_bound(index);
+#endif
+
+ if ( (iIndex != _index.end())
+#ifndef USE_HASHSET
+ // In case of lower_bound() search we have to also compare found item
+ // with passed string.
+ && (strcmp(_table[*iIndex], str) == 0)
+#endif
+ ) {
+ // String is already present in storage!
+ // Pop back temporary stored pointer...
+ _table.pop_back();
+ // ...and return numeric index to already stored copy of `str`.
+ return static_cast<IndType>(*iIndex);
+ }
+
+ // String not found within storage.
+
+ // Allocate memory required for string storage...
+ char* mem = static_cast<char*>(_storage.ordered_malloc(strlen(str) + 1));
+ // ...and fill it with copy of passed string.
+ strcpy(mem, str);
+
+ // Overwrite temporary stored pointer to `str` with pointer to freshly
+ // saved copy.
+ _table[index] = mem;
+
+#ifdef USE_HASHSET
+ // Insert the index into lookup table.
+ _index.insert(index);
+#else
+ // Insert the index into lookup table (use previously retrieved iterator
+ // as a hint).
+ _index.insert(iIndex, index);
+#endif
+
+ // Finally.
+ return index;
+}
+
+#endif
diff --git a/contrib/eppex/LossyCounter.h b/contrib/eppex/LossyCounter.h
index 2796c8090..a3cf3339f 100644
--- a/contrib/eppex/LossyCounter.h
+++ b/contrib/eppex/LossyCounter.h
@@ -83,7 +83,7 @@ public:
const counter_t bucketWidth; // ceil(1/error)
private:
-
+
/** @var Current epoch bucket ID (b-current) */
counter_t _bucketId;
@@ -182,7 +182,7 @@ class LossyCounterIterator: public std::iterator<std::forward_iterator_tag, type
public:
typedef LossyCounterIterator<T> self_type;
-
+
typedef typename LossyCounter<T>::storage_t::const_iterator const_iterator;
protected:
@@ -288,7 +288,7 @@ protected:
template<class T>
void LossyCounter<T>::add(const T& item) {
-
+
typename storage_t::iterator iter = _storage.find(item);
if ( iter == _storage.end() ) {
@@ -330,7 +330,7 @@ void LossyCounter<T>::prune(void) {
////////////////////////////////////////////////////////////////////////////////
template<class T>
-LossyCounterIterator<T> LossyCounterIterator<T>::operator++(void) {
+LossyCounterIterator<T> LossyCounterIterator<T>::operator++(void) {
this->forward();
return *this;
}
diff --git a/contrib/eppex/eppex.cpp b/contrib/eppex/eppex.cpp
index d382890d2..76490d9d2 100644
--- a/contrib/eppex/eppex.cpp
+++ b/contrib/eppex/eppex.cpp
@@ -92,7 +92,7 @@ int main(int argc, char* argv[]) {
// Init lossy counters.
std::string lossyCountersParams;
int paramIdx = 5;
-
+
while ( (argc > paramIdx) && (*argv[paramIdx] != '-') ) {
std::string param = std::string(argv[paramIdx]);
if ( !parse_lossy_counting_params(param) ) {
@@ -113,7 +113,7 @@ int main(int argc, char* argv[]) {
usage(argv[0]);
}
}
-
+
if ( (argc > paramIdx) && (strcmp(argv[paramIdx], "--compact") == 0) ) {
compactOutputFlag = true;
++paramIdx;
@@ -154,7 +154,7 @@ int main(int argc, char* argv[]) {
readInput(eFile, fFile, aFile);
std::cerr << std::endl; // Leave the progress bar end on previous line.
-
+
// close input files
eFile.close();
fFile.close();
diff --git a/contrib/eppex/phrase-extract.cpp b/contrib/eppex/phrase-extract.cpp
index 5dff43b78..46337a8b7 100644
--- a/contrib/eppex/phrase-extract.cpp
+++ b/contrib/eppex/phrase-extract.cpp
@@ -32,14 +32,14 @@ typedef std::vector<output_pair_t> output_vector_t;
class PhraseComp {
/** @var If true, sort by target phrase first. */
bool _inverted;
-
+
bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b);
int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b);
-
+
public:
PhraseComp(bool inverted): _inverted(inverted) {}
-
+
bool operator()(const output_pair_t& a, const output_pair_t& b);
};
@@ -448,9 +448,9 @@ void extract(SentenceAlignment &sentence) {
((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
}
-
+
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
-
+
} // end of for loop through inbound phrases
} // end if buildExtraStructure
@@ -567,7 +567,7 @@ bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) {
else {
return cmp < 0;
}
-
+
}
@@ -607,7 +607,7 @@ bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexe
return cmp < 0;
}
}
-
+
// Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one.
return (cmp == 0) ? (aSize < bSize) : (cmp < 0);
@@ -685,7 +685,7 @@ void processSortedOutput(OutputProcessor& processor) {
void processUnsortedOutput(OutputProcessor& processor) {
-
+
LossyCountersVector::value_type current = NULL, prev = NULL;
for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0.
@@ -759,7 +759,7 @@ void printStats(void) {
if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) {
// Time to print.
to = i-1;
-
+
// Increment overall stats.
outputMass += prev->outputMass;
outputSize += prev->outputSize;
@@ -787,7 +787,7 @@ void printStats(void) {
from = i;
}
-
+
prev = current;
}
diff --git a/contrib/mira/Main.cpp b/contrib/mira/Main.cpp
index abf92b598..acc2f8886 100644
--- a/contrib/mira/Main.cpp
+++ b/contrib/mira/Main.cpp
@@ -46,6 +46,7 @@ namespace mpi = boost::mpi;
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/LM/Base.h"
+#include "util/random.hh"
using namespace Mira;
using namespace std;
@@ -54,6 +55,7 @@ namespace po = boost::program_options;
int main(int argc, char** argv)
{
+ util::rand_init();
size_t rank = 0;
size_t size = 1;
#ifdef MPI_ENABLE
diff --git a/contrib/mira/Main.h b/contrib/mira/Main.h
index 8736257f6..b8faedae7 100644
--- a/contrib/mira/Main.h
+++ b/contrib/mira/Main.h
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/Word.h"
#include "moses/FF/FeatureFunction.h"
#include "Decoder.h"
+#include "util/random.hh"
typedef std::map<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightMap;
typedef std::pair<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightPair;
@@ -37,8 +38,7 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (
struct RandomIndex {
ptrdiff_t operator()(ptrdiff_t max) {
- srand(time(0)); // Initialize random number generator with current time.
- return static_cast<ptrdiff_t> (rand() % max);
+ return util::rand_excl(max);
}
};
diff --git a/contrib/other-builds/cmake/boost.example/main.cpp b/contrib/other-builds/cmake/boost.example/main.cpp
index 7b95fb2a9..b77388e46 100644
--- a/contrib/other-builds/cmake/boost.example/main.cpp
+++ b/contrib/other-builds/cmake/boost.example/main.cpp
@@ -10,15 +10,15 @@ int main(int argc, char* argv[])
using namespace boost::locale;
using namespace std;
-
+
generator gen;
locale loc=gen("");
-
+
cout.imbue(loc);
-
+
cout << "Hello, World" << endl;
-
+
cout << "This is how we show currency in this locale " << as::currency << 103.34 << endl;
-
+
return 0;
}
diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project
index b978b451e..ecef4038b 100644
--- a/contrib/other-builds/moses-cmd/moses-cmd.project
+++ b/contrib/other-builds/moses-cmd/moses-cmd.project
@@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="moses-cmd" InternalType="Console">
+ <Plugins>
+ <Plugin Name="CMakePlugin">
+ <![CDATA[[{
+ "name": "Debug",
+ "enabled": false,
+ "buildDirectory": "build",
+ "sourceDirectory": "$(ProjectPath)",
+ "generator": "",
+ "buildType": "",
+ "arguments": [],
+ "parentProject": ""
+ }]]]>
+ </Plugin>
+ <Plugin Name="qmake">
+ <![CDATA[00010001N0005Debug000000000000]]>
+ </Plugin>
+ </Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@@ -9,6 +26,14 @@
<File Name="../../../moses-cmd/MainVW.cpp" ExcludeProjConfig="Debug"/>
<File Name="../../../moses-cmd/MainVW.h" ExcludeProjConfig="Debug"/>
</VirtualDirectory>
+ <Dependencies Name="Release"/>
+ <Dependencies Name="Debug">
+ <Project Name="OnDiskPt"/>
+ <Project Name="lm"/>
+ <Project Name="moses"/>
+ <Project Name="search"/>
+ <Project Name="util"/>
+ </Dependencies>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@@ -53,7 +78,7 @@
<Library Value="rt"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
- <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
+ <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="-f /var/folders/c4/2p48fcwx611dmkdqq44mbblm0000gn/T/ZVd8xvuJAR.ini -i /Users/hieu/workspace/github/moses-regression-tests/tests/phrase.basic-surface-binptable.oldformat/to-translate.txt" UseSeparateDebugArgs="yes" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
@@ -125,12 +150,4 @@
</Completion>
</Configuration>
</Settings>
- <Dependencies Name="Release"/>
- <Dependencies Name="Debug">
- <Project Name="OnDiskPt"/>
- <Project Name="lm"/>
- <Project Name="moses"/>
- <Project Name="search"/>
- <Project Name="util"/>
- </Dependencies>
</CodeLite_Project>
diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project
index 55bf4e8f1..adebcdfb4 100644
--- a/contrib/other-builds/moses/moses.project
+++ b/contrib/other-builds/moses/moses.project
@@ -778,6 +778,8 @@
<File Name="../../../moses/PP/SpanLengthPhraseProperty.h"/>
<File Name="../../../moses/PP/TreeStructurePhraseProperty.h"/>
</VirtualDirectory>
+ <Dependencies Name="Debug"/>
+ <Dependencies Name="Release"/>
<Settings Type="Static Library">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@@ -796,6 +798,7 @@
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<Preprocessor Value="MAX_NUM_FACTORS=4"/>
<Preprocessor Value="KENLM_MAX_ORDER=7"/>
+ <Preprocessor Value="WITH_THREADS"/>
</Compiler>
<Linker Options="" Required="yes"/>
<ResourceCompiler Options="" Required="no"/>
@@ -870,6 +873,4 @@
</Completion>
</Configuration>
</Settings>
- <Dependencies Name="Debug"/>
- <Dependencies Name="Release"/>
</CodeLite_Project>
diff --git a/contrib/other-builds/util/util.project b/contrib/other-builds/util/util.project
index 573c78296..1006ddb52 100644
--- a/contrib/other-builds/util/util.project
+++ b/contrib/other-builds/util/util.project
@@ -30,6 +30,8 @@
<File Name="../../../util/string_piece.cc"/>
<File Name="../../../util/tokenize_piece_test.cc" ExcludeProjConfig="Debug"/>
<File Name="../../../util/usage.cc"/>
+ <File Name="../../../util/random.cc"/>
+ <File Name="../../../util/random.hh"/>
</VirtualDirectory>
<VirtualDirectory Name="double-conversion">
<File Name="../../../util/double-conversion/bignum-dtoa.cc"/>
diff --git a/contrib/python/moses/dictree.cpp b/contrib/python/moses/dictree.cpp
index 207d7c3f7..d9008f6e3 100644
--- a/contrib/python/moses/dictree.cpp
+++ b/contrib/python/moses/dictree.cpp
@@ -557,7 +557,7 @@ static const char *__pyx_f[] = {
* ctypedef vector[const_str_pointer] Tokens
* ctypedef float FValue # <<<<<<<<<<<<<<
* ctypedef vector[FValue] Scores
- *
+ *
*/
typedef float __pyx_t_5moses_8cdictree_FValue;
@@ -582,7 +582,7 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_8_genexpr;
/* "cdictree.pxd":4
* from libcpp.vector cimport vector
- *
+ *
* ctypedef string* str_pointer # <<<<<<<<<<<<<<
* ctypedef string* const_str_pointer "const str_pointer"
* ctypedef vector[const_str_pointer] Tokens
@@ -590,7 +590,7 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_8_genexpr;
typedef std::string *__pyx_t_5moses_8cdictree_str_pointer;
/* "cdictree.pxd":5
- *
+ *
* ctypedef string* str_pointer
* ctypedef string* const_str_pointer "const str_pointer" # <<<<<<<<<<<<<<
* ctypedef vector[const_str_pointer] Tokens
@@ -611,7 +611,7 @@ typedef std::vector<const str_pointer> __pyx_t_5moses_8cdictree_Tokens;
* ctypedef vector[const_str_pointer] Tokens
* ctypedef float FValue
* ctypedef vector[FValue] Scores # <<<<<<<<<<<<<<
- *
+ *
* cdef extern from 'PhraseDictionaryTree.h' namespace 'Moses':
*/
typedef std::vector<__pyx_t_5moses_8cdictree_FValue> __pyx_t_5moses_8cdictree_Scores;
@@ -619,7 +619,7 @@ struct __pyx_opt_args_5moses_7dictree_20PhraseDictionaryTree_getTargetProduction
/* "moses/dictree.pyx":252
* and os.path.isfile(stem + ".binphr.tgtvoc")
- *
+ *
* cdef TargetProduction getTargetProduction(self, cdictree.StringTgtCand& cand, wa = None, converter = None): # <<<<<<<<<<<<<<
* """Converts a StringTgtCandidate (c++ object) and possibly a word-alignment info (string) to a TargetProduction (python object)."""
* cdef list words = [cand.tokens[i].c_str() for i in xrange(cand.tokens.size())]
@@ -632,7 +632,7 @@ struct __pyx_opt_args_5moses_7dictree_20PhraseDictionaryTree_getTargetProduction
/* "moses/dictree.pyx":23
* raise TypeError('Cannot convert %s to string' % type(data))
- *
+ *
* cdef class Production(object): # <<<<<<<<<<<<<<
* """
* General class that represents a context-free production or a flat contiguous phrase.
@@ -646,7 +646,7 @@ struct __pyx_obj_5moses_7dictree_Production {
/* "moses/dictree.pyx":104
* return x >= y
- *
+ *
* cdef class Alignment(list): # <<<<<<<<<<<<<<
* """
* This represents a list of alignment points (pairs of integers).
@@ -658,7 +658,7 @@ struct __pyx_obj_5moses_7dictree_Alignment {
/* "moses/dictree.pyx":125
* return ' '.join('%d-%d' % (s, t) for s, t in self)
- *
+ *
* cdef class FValues(list): # <<<<<<<<<<<<<<
* """
* This represents a list of feature values (floats).
@@ -670,7 +670,7 @@ struct __pyx_obj_5moses_7dictree_FValues {
/* "moses/dictree.pyx":137
* return ' '.join(str(x) for x in self)
- *
+ *
* cdef class TargetProduction(Production): # <<<<<<<<<<<<<<
* """This class specializes production making it the target side of a translation rule.
* On top of lhs and rhs it comes with alignment information a tuple of real-valued features.
@@ -684,9 +684,9 @@ struct __pyx_obj_5moses_7dictree_TargetProduction {
/* "moses/dictree.pyx":175
* return repr((repr(self.rhs), repr(self.lhs), repr(self.scores), repr(self.alignment)))
- *
+ *
* cdef class QueryResult(list): # <<<<<<<<<<<<<<
- *
+ *
* cdef readonly Production source
*/
struct __pyx_obj_5moses_7dictree_QueryResult {
@@ -696,10 +696,10 @@ struct __pyx_obj_5moses_7dictree_QueryResult {
/* "moses/dictree.pyx":184
- *
- *
+ *
+ *
* cdef class DictionaryTree(object): # <<<<<<<<<<<<<<
- *
+ *
* @classmethod
*/
struct __pyx_obj_5moses_7dictree_DictionaryTree {
@@ -709,7 +709,7 @@ struct __pyx_obj_5moses_7dictree_DictionaryTree {
/* "moses/dictree.pyx":202
* raise NotImplementedError
- *
+ *
* cdef class PhraseDictionaryTree(DictionaryTree): # <<<<<<<<<<<<<<
* """This class encapsulates a Moses::PhraseDictionaryTree for operations over
* binary phrase tables."""
@@ -728,9 +728,9 @@ struct __pyx_obj_5moses_7dictree_PhraseDictionaryTree {
/* "moses/dictree.pyx":290
* return results
- *
+ *
* cdef class OnDiskWrapper(DictionaryTree): # <<<<<<<<<<<<<<
- *
+ *
* cdef condiskpt.OnDiskWrapper *wrapper
*/
struct __pyx_obj_5moses_7dictree_OnDiskWrapper {
@@ -745,7 +745,7 @@ struct __pyx_obj_5moses_7dictree_OnDiskWrapper {
/* "moses/dictree.pyx":50
* return IndexError, 'Index %s out of range' % str(key)
- *
+ *
* def __iter__(self): # <<<<<<<<<<<<<<
* for x in self.rhs:
* yield x
@@ -761,10 +761,10 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct____iter__ {
/* "moses/dictree.pyx":122
* ValueError, 'Cannot figure out pairs from: %s' % type(alignment)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join('%d-%d' % (s, t) for s, t in self)
- *
+ *
*/
struct __pyx_obj_5moses_7dictree___pyx_scope_struct_1___str__ {
PyObject_HEAD
@@ -773,10 +773,10 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_1___str__ {
/* "moses/dictree.pyx":123
- *
+ *
* def __str__(self):
* return ' '.join('%d-%d' % (s, t) for s, t in self) # <<<<<<<<<<<<<<
- *
+ *
* cdef class FValues(list):
*/
struct __pyx_obj_5moses_7dictree___pyx_scope_struct_2_genexpr {
@@ -792,10 +792,10 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_2_genexpr {
/* "moses/dictree.pyx":134
* super(FValues, self).__init__(values)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join(str(x) for x in self)
- *
+ *
*/
struct __pyx_obj_5moses_7dictree___pyx_scope_struct_3___str__ {
PyObject_HEAD
@@ -804,10 +804,10 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_3___str__ {
/* "moses/dictree.pyx":135
- *
+ *
* def __str__(self):
* return ' '.join(str(x) for x in self) # <<<<<<<<<<<<<<
- *
+ *
* cdef class TargetProduction(Production):
*/
struct __pyx_obj_5moses_7dictree___pyx_scope_struct_4_genexpr {
@@ -822,7 +822,7 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_4_genexpr {
/* "moses/dictree.pyx":258
* return TargetProduction(words, scores, wa)
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -838,7 +838,7 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_5_query {
* cdef vector[string]* wa = NULL
* cdef Production source = Production(f.c_str() for f in fphrase) # <<<<<<<<<<<<<<
* cdef QueryResult results = QueryResult(source)
- *
+ *
*/
struct __pyx_obj_5moses_7dictree___pyx_scope_struct_6_genexpr {
PyObject_HEAD
@@ -850,7 +850,7 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_6_genexpr {
/* "moses/dictree.pyx":316
* return Production(tokens[:-1], tokens[-1])
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -881,7 +881,7 @@ struct __pyx_obj_5moses_7dictree___pyx_scope_struct_8_genexpr {
/* "moses/dictree.pyx":202
* raise NotImplementedError
- *
+ *
* cdef class PhraseDictionaryTree(DictionaryTree): # <<<<<<<<<<<<<<
* """This class encapsulates a Moses::PhraseDictionaryTree for operations over
* binary phrase tables."""
@@ -895,9 +895,9 @@ static struct __pyx_vtabstruct_5moses_7dictree_PhraseDictionaryTree *__pyx_vtabp
/* "moses/dictree.pyx":290
* return results
- *
+ *
* cdef class OnDiskWrapper(DictionaryTree): # <<<<<<<<<<<<<<
- *
+ *
* cdef condiskpt.OnDiskWrapper *wrapper
*/
@@ -1608,7 +1608,7 @@ static PyObject *__pyx_codeobj__13;
static PyObject *__pyx_codeobj__15;
/* "moses/dictree.pyx":156
- *
+ *
* @staticmethod
* def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<<
* """Returns the sign of key(y) - key(x).
@@ -1662,7 +1662,7 @@ static PyObject *__pyx_lambda_funcdef_5moses_7dictree_lambda1(CYTHON_UNUSED PyOb
/* "moses/dictree.pyx":258
* return TargetProduction(words, scores, wa)
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -1832,7 +1832,7 @@ static PyObject *__pyx_lambda_funcdef_5moses_7dictree_lambda3(CYTHON_UNUSED PyOb
/* "moses/dictree.pyx":12
* from math import log
- *
+ *
* cpdef int fsign(float x): # <<<<<<<<<<<<<<
* """Simply returns the sign of float x (zero is assumed +), it's defined here just so one gains a little bit with static typing"""
* return 1 if x >= 0 else -1
@@ -1849,7 +1849,7 @@ static int __pyx_f_5moses_7dictree_fsign(float __pyx_v_x, CYTHON_UNUSED int __py
* cpdef int fsign(float x):
* """Simply returns the sign of float x (zero is assumed +), it's defined here just so one gains a little bit with static typing"""
* return 1 if x >= 0 else -1 # <<<<<<<<<<<<<<
- *
+ *
* cdef bytes as_str(data):
*/
if (((__pyx_v_x >= 0.0) != 0)) {
@@ -1862,7 +1862,7 @@ static int __pyx_f_5moses_7dictree_fsign(float __pyx_v_x, CYTHON_UNUSED int __py
/* "moses/dictree.pyx":12
* from math import log
- *
+ *
* cpdef int fsign(float x): # <<<<<<<<<<<<<<
* """Simply returns the sign of float x (zero is assumed +), it's defined here just so one gains a little bit with static typing"""
* return 1 if x >= 0 else -1
@@ -1929,7 +1929,7 @@ static PyObject *__pyx_pf_5moses_7dictree_fsign(CYTHON_UNUSED PyObject *__pyx_se
/* "moses/dictree.pyx":16
* return 1 if x >= 0 else -1
- *
+ *
* cdef bytes as_str(data): # <<<<<<<<<<<<<<
* if isinstance(data, bytes):
* return data
@@ -1948,13 +1948,13 @@ static PyObject *__pyx_f_5moses_7dictree_as_str(PyObject *__pyx_v_data) {
__Pyx_RefNannySetupContext("as_str", 0);
/* "moses/dictree.pyx":17
- *
+ *
* cdef bytes as_str(data):
* if isinstance(data, bytes): # <<<<<<<<<<<<<<
* return data
* elif isinstance(data, unicode):
*/
- __pyx_t_1 = PyBytes_Check(__pyx_v_data);
+ __pyx_t_1 = PyBytes_Check(__pyx_v_data);
__pyx_t_2 = (__pyx_t_1 != 0);
if (__pyx_t_2) {
@@ -1979,7 +1979,7 @@ static PyObject *__pyx_f_5moses_7dictree_as_str(PyObject *__pyx_v_data) {
* return data.encode('UTF-8')
* raise TypeError('Cannot convert %s to string' % type(data))
*/
- __pyx_t_2 = PyUnicode_Check(__pyx_v_data);
+ __pyx_t_2 = PyUnicode_Check(__pyx_v_data);
__pyx_t_1 = (__pyx_t_2 != 0);
if (__pyx_t_1) {
@@ -1988,7 +1988,7 @@ static PyObject *__pyx_f_5moses_7dictree_as_str(PyObject *__pyx_v_data) {
* elif isinstance(data, unicode):
* return data.encode('UTF-8') # <<<<<<<<<<<<<<
* raise TypeError('Cannot convert %s to string' % type(data))
- *
+ *
*/
__Pyx_XDECREF(__pyx_r);
__pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_data, __pyx_n_s_encode); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -2006,7 +2006,7 @@ static PyObject *__pyx_f_5moses_7dictree_as_str(PyObject *__pyx_v_data) {
* elif isinstance(data, unicode):
* return data.encode('UTF-8')
* raise TypeError('Cannot convert %s to string' % type(data)) # <<<<<<<<<<<<<<
- *
+ *
* cdef class Production(object):
*/
__pyx_t_4 = __Pyx_PyString_Format(__pyx_kp_s_Cannot_convert_s_to_string, ((PyObject *)Py_TYPE(__pyx_v_data))); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -2025,7 +2025,7 @@ static PyObject *__pyx_f_5moses_7dictree_as_str(PyObject *__pyx_v_data) {
/* "moses/dictree.pyx":16
* return 1 if x >= 0 else -1
- *
+ *
* cdef bytes as_str(data): # <<<<<<<<<<<<<<
* if isinstance(data, bytes):
* return data
@@ -2045,7 +2045,7 @@ static PyObject *__pyx_f_5moses_7dictree_as_str(PyObject *__pyx_v_data) {
/* "moses/dictree.pyx":33
* cdef readonly tuple rhs
- *
+ *
* def __init__(self, rhs, lhs = None): # <<<<<<<<<<<<<<
* """
* :rhs right-hand side of the production (or the flat contiguous phrase) - sequence of strings
@@ -2134,7 +2134,7 @@ static int __pyx_pf_5moses_7dictree_10Production___init__(struct __pyx_obj_5mose
* """
* self.rhs = tuple(rhs) # <<<<<<<<<<<<<<
* self.lhs = lhs
- *
+ *
*/
__pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -2154,7 +2154,7 @@ static int __pyx_pf_5moses_7dictree_10Production___init__(struct __pyx_obj_5mose
* """
* self.rhs = tuple(rhs)
* self.lhs = lhs # <<<<<<<<<<<<<<
- *
+ *
* def __len__(self):
*/
if (!(likely(PyBytes_CheckExact(__pyx_v_lhs))||((__pyx_v_lhs) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_v_lhs)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -2168,7 +2168,7 @@ static int __pyx_pf_5moses_7dictree_10Production___init__(struct __pyx_obj_5mose
/* "moses/dictree.pyx":33
* cdef readonly tuple rhs
- *
+ *
* def __init__(self, rhs, lhs = None): # <<<<<<<<<<<<<<
* """
* :rhs right-hand side of the production (or the flat contiguous phrase) - sequence of strings
@@ -2189,10 +2189,10 @@ static int __pyx_pf_5moses_7dictree_10Production___init__(struct __pyx_obj_5mose
/* "moses/dictree.pyx":41
* self.lhs = lhs
- *
+ *
* def __len__(self): # <<<<<<<<<<<<<<
* return len(self.rhs)
- *
+ *
*/
/* Python wrapper */
@@ -2219,10 +2219,10 @@ static Py_ssize_t __pyx_pf_5moses_7dictree_10Production_2__len__(struct __pyx_ob
__Pyx_RefNannySetupContext("__len__", 0);
/* "moses/dictree.pyx":42
- *
+ *
* def __len__(self):
* return len(self.rhs) # <<<<<<<<<<<<<<
- *
+ *
* def __getitem__(self, key):
*/
__pyx_t_1 = __pyx_v_self->rhs;
@@ -2238,10 +2238,10 @@ static Py_ssize_t __pyx_pf_5moses_7dictree_10Production_2__len__(struct __pyx_ob
/* "moses/dictree.pyx":41
* self.lhs = lhs
- *
+ *
* def __len__(self): # <<<<<<<<<<<<<<
* return len(self.rhs)
- *
+ *
*/
/* function exit code */
@@ -2256,7 +2256,7 @@ static Py_ssize_t __pyx_pf_5moses_7dictree_10Production_2__len__(struct __pyx_ob
/* "moses/dictree.pyx":44
* return len(self.rhs)
- *
+ *
* def __getitem__(self, key): # <<<<<<<<<<<<<<
* if 0 <= key < len(self.rhs):
* return self.rhs[key]
@@ -2288,7 +2288,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_4__getitem__(struct __pyx
__Pyx_RefNannySetupContext("__getitem__", 0);
/* "moses/dictree.pyx":45
- *
+ *
* def __getitem__(self, key):
* if 0 <= key < len(self.rhs): # <<<<<<<<<<<<<<
* return self.rhs[key]
@@ -2338,7 +2338,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_4__getitem__(struct __pyx
* return self.rhs[key]
* else:
* return IndexError, 'Index %s out of range' % str(key) # <<<<<<<<<<<<<<
- *
+ *
* def __iter__(self):
*/
__Pyx_XDECREF(__pyx_r);
@@ -2368,7 +2368,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_4__getitem__(struct __pyx
/* "moses/dictree.pyx":44
* return len(self.rhs)
- *
+ *
* def __getitem__(self, key): # <<<<<<<<<<<<<<
* if 0 <= key < len(self.rhs):
* return self.rhs[key]
@@ -2389,7 +2389,7 @@ static PyObject *__pyx_gb_5moses_7dictree_10Production_8generator(__pyx_Generato
/* "moses/dictree.pyx":50
* return IndexError, 'Index %s out of range' % str(key)
- *
+ *
* def __iter__(self): # <<<<<<<<<<<<<<
* for x in self.rhs:
* yield x
@@ -2468,11 +2468,11 @@ static PyObject *__pyx_gb_5moses_7dictree_10Production_8generator(__pyx_Generato
if (unlikely(!__pyx_sent_value)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
/* "moses/dictree.pyx":51
- *
+ *
* def __iter__(self):
* for x in self.rhs: # <<<<<<<<<<<<<<
* yield x
- *
+ *
*/
if (unlikely(__pyx_cur_scope->__pyx_v_self->rhs == Py_None)) {
PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
@@ -2495,7 +2495,7 @@ static PyObject *__pyx_gb_5moses_7dictree_10Production_8generator(__pyx_Generato
* def __iter__(self):
* for x in self.rhs:
* yield x # <<<<<<<<<<<<<<
- *
+ *
* def __contains__(self, item):
*/
__Pyx_INCREF(__pyx_cur_scope->__pyx_v_x);
@@ -2519,7 +2519,7 @@ static PyObject *__pyx_gb_5moses_7dictree_10Production_8generator(__pyx_Generato
/* "moses/dictree.pyx":50
* return IndexError, 'Index %s out of range' % str(key)
- *
+ *
* def __iter__(self): # <<<<<<<<<<<<<<
* for x in self.rhs:
* yield x
@@ -2542,10 +2542,10 @@ static PyObject *__pyx_gb_5moses_7dictree_10Production_8generator(__pyx_Generato
/* "moses/dictree.pyx":54
* yield x
- *
+ *
* def __contains__(self, item): # <<<<<<<<<<<<<<
* return item in self.rhs
- *
+ *
*/
/* Python wrapper */
@@ -2571,10 +2571,10 @@ static int __pyx_pf_5moses_7dictree_10Production_9__contains__(struct __pyx_obj_
__Pyx_RefNannySetupContext("__contains__", 0);
/* "moses/dictree.pyx":55
- *
+ *
* def __contains__(self, item):
* return item in self.rhs # <<<<<<<<<<<<<<
- *
+ *
* def __reversed__(self):
*/
__pyx_t_1 = (__Pyx_PySequence_Contains(__pyx_v_item, __pyx_v_self->rhs, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 55; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -2583,10 +2583,10 @@ static int __pyx_pf_5moses_7dictree_10Production_9__contains__(struct __pyx_obj_
/* "moses/dictree.pyx":54
* yield x
- *
+ *
* def __contains__(self, item): # <<<<<<<<<<<<<<
* return item in self.rhs
- *
+ *
*/
/* function exit code */
@@ -2600,10 +2600,10 @@ static int __pyx_pf_5moses_7dictree_10Production_9__contains__(struct __pyx_obj_
/* "moses/dictree.pyx":57
* return item in self.rhs
- *
+ *
* def __reversed__(self): # <<<<<<<<<<<<<<
* return reversed(self.rhs)
- *
+ *
*/
/* Python wrapper */
@@ -2630,10 +2630,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_11__reversed__(struct __p
__Pyx_RefNannySetupContext("__reversed__", 0);
/* "moses/dictree.pyx":58
- *
+ *
* def __reversed__(self):
* return reversed(self.rhs) # <<<<<<<<<<<<<<
- *
+ *
* def __hash__(self):
*/
__Pyx_XDECREF(__pyx_r);
@@ -2651,10 +2651,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_11__reversed__(struct __p
/* "moses/dictree.pyx":57
* return item in self.rhs
- *
+ *
* def __reversed__(self): # <<<<<<<<<<<<<<
* return reversed(self.rhs)
- *
+ *
*/
/* function exit code */
@@ -2671,10 +2671,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_11__reversed__(struct __p
/* "moses/dictree.pyx":60
* return reversed(self.rhs)
- *
+ *
* def __hash__(self): # <<<<<<<<<<<<<<
* return hash(self.rhs)
- *
+ *
*/
/* Python wrapper */
@@ -2701,10 +2701,10 @@ static Py_hash_t __pyx_pf_5moses_7dictree_10Production_13__hash__(struct __pyx_o
__Pyx_RefNannySetupContext("__hash__", 0);
/* "moses/dictree.pyx":61
- *
+ *
* def __hash__(self):
* return hash(self.rhs) # <<<<<<<<<<<<<<
- *
+ *
* def __str__(self):
*/
__pyx_t_1 = __pyx_v_self->rhs;
@@ -2716,10 +2716,10 @@ static Py_hash_t __pyx_pf_5moses_7dictree_10Production_13__hash__(struct __pyx_o
/* "moses/dictree.pyx":60
* return reversed(self.rhs)
- *
+ *
* def __hash__(self): # <<<<<<<<<<<<<<
* return hash(self.rhs)
- *
+ *
*/
/* function exit code */
@@ -2735,7 +2735,7 @@ static Py_hash_t __pyx_pf_5moses_7dictree_10Production_13__hash__(struct __pyx_o
/* "moses/dictree.pyx":63
* return hash(self.rhs)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* if self.lhs:
* return '%s -> %s' % (self.lhs, ' '.join(self.rhs))
@@ -2766,7 +2766,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_15__str__(struct __pyx_ob
__Pyx_RefNannySetupContext("__str__", 0);
/* "moses/dictree.pyx":64
- *
+ *
* def __str__(self):
* if self.lhs: # <<<<<<<<<<<<<<
* return '%s -> %s' % (self.lhs, ' '.join(self.rhs))
@@ -2809,7 +2809,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_15__str__(struct __pyx_ob
* return '%s -> %s' % (self.lhs, ' '.join(self.rhs))
* else:
* return ' '.join(self.rhs) # <<<<<<<<<<<<<<
- *
+ *
* def __repr__(self):
*/
__Pyx_XDECREF(__pyx_r);
@@ -2825,7 +2825,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_15__str__(struct __pyx_ob
/* "moses/dictree.pyx":63
* return hash(self.rhs)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* if self.lhs:
* return '%s -> %s' % (self.lhs, ' '.join(self.rhs))
@@ -2845,10 +2845,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_15__str__(struct __pyx_ob
/* "moses/dictree.pyx":69
* return ' '.join(self.rhs)
- *
+ *
* def __repr__(self): # <<<<<<<<<<<<<<
* return repr(self.as_tuple())
- *
+ *
*/
/* Python wrapper */
@@ -2875,10 +2875,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_17__repr__(struct __pyx_o
__Pyx_RefNannySetupContext("__repr__", 0);
/* "moses/dictree.pyx":70
- *
+ *
* def __repr__(self):
* return repr(self.as_tuple()) # <<<<<<<<<<<<<<
- *
+ *
* def as_tuple(self, lhs_first = False):
*/
__Pyx_XDECREF(__pyx_r);
@@ -2896,10 +2896,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_17__repr__(struct __pyx_o
/* "moses/dictree.pyx":69
* return ' '.join(self.rhs)
- *
+ *
* def __repr__(self): # <<<<<<<<<<<<<<
* return repr(self.as_tuple())
- *
+ *
*/
/* function exit code */
@@ -2916,7 +2916,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_17__repr__(struct __pyx_o
/* "moses/dictree.pyx":72
* return repr(self.as_tuple())
- *
+ *
* def as_tuple(self, lhs_first = False): # <<<<<<<<<<<<<<
* """
* Returns a tuple (lhs) + rhs or rhs + (lhs) depending on the flag 'lhs_first'.
@@ -3066,7 +3066,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_19as_tuple(struct __pyx_o
* return self.rhs + tuple([self.lhs])
* else:
* return self.rhs # <<<<<<<<<<<<<<
- *
+ *
* def __richcmp__(self, other, op):
*/
__Pyx_XDECREF(__pyx_r);
@@ -3077,7 +3077,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_19as_tuple(struct __pyx_o
/* "moses/dictree.pyx":72
* return repr(self.as_tuple())
- *
+ *
* def as_tuple(self, lhs_first = False): # <<<<<<<<<<<<<<
* """
* Returns a tuple (lhs) + rhs or rhs + (lhs) depending on the flag 'lhs_first'.
@@ -3097,7 +3097,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_19as_tuple(struct __pyx_o
/* "moses/dictree.pyx":84
* return self.rhs
- *
+ *
* def __richcmp__(self, other, op): # <<<<<<<<<<<<<<
* """
* The comparison uses 'as_tuple()', therefore in the CFG case, the lhs will be part of the production and it will be placed in the end
@@ -3307,7 +3307,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_21__richcmp__(PyObject *_
* return x > y
* elif op == 5: # <<<<<<<<<<<<<<
* return x >= y
- *
+ *
*/
__pyx_t_1 = PyObject_RichCompare(__pyx_v_op, __pyx_int_5, Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_3 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -3318,7 +3318,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_21__richcmp__(PyObject *_
* return x > y
* elif op == 5:
* return x >= y # <<<<<<<<<<<<<<
- *
+ *
* cdef class Alignment(list):
*/
__Pyx_XDECREF(__pyx_r);
@@ -3330,7 +3330,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_21__richcmp__(PyObject *_
/* "moses/dictree.pyx":84
* return self.rhs
- *
+ *
* def __richcmp__(self, other, op): # <<<<<<<<<<<<<<
* """
* The comparison uses 'as_tuple()', therefore in the CFG case, the lhs will be part of the production and it will be placed in the end
@@ -3354,10 +3354,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_21__richcmp__(PyObject *_
/* "moses/dictree.pyx":30
* """
- *
+ *
* cdef readonly bytes lhs # <<<<<<<<<<<<<<
* cdef readonly tuple rhs
- *
+ *
*/
/* Python wrapper */
@@ -3390,10 +3390,10 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_3lhs___get__(struct __pyx
}
/* "moses/dictree.pyx":31
- *
+ *
* cdef readonly bytes lhs
* cdef readonly tuple rhs # <<<<<<<<<<<<<<
- *
+ *
* def __init__(self, rhs, lhs = None):
*/
@@ -3428,7 +3428,7 @@ static PyObject *__pyx_pf_5moses_7dictree_10Production_3rhs___get__(struct __pyx
/* "moses/dictree.pyx":110
* """
- *
+ *
* def __init__(self, alignment): # <<<<<<<<<<<<<<
* if type(alignment) is str:
* pairs = []
@@ -3511,7 +3511,7 @@ static int __pyx_pf_5moses_7dictree_9Alignment___init__(struct __pyx_obj_5moses_
__Pyx_RefNannySetupContext("__init__", 0);
/* "moses/dictree.pyx":111
- *
+ *
* def __init__(self, alignment):
* if type(alignment) is str: # <<<<<<<<<<<<<<
* pairs = []
@@ -3610,11 +3610,11 @@ static int __pyx_pf_5moses_7dictree_9Alignment___init__(struct __pyx_obj_5moses_
}
#if CYTHON_COMPILING_IN_CPYTHON
if (likely(PyTuple_CheckExact(sequence))) {
- __pyx_t_4 = PyTuple_GET_ITEM(sequence, 0);
- __pyx_t_8 = PyTuple_GET_ITEM(sequence, 1);
+ __pyx_t_4 = PyTuple_GET_ITEM(sequence, 0);
+ __pyx_t_8 = PyTuple_GET_ITEM(sequence, 1);
} else {
- __pyx_t_4 = PyList_GET_ITEM(sequence, 0);
- __pyx_t_8 = PyList_GET_ITEM(sequence, 1);
+ __pyx_t_4 = PyList_GET_ITEM(sequence, 0);
+ __pyx_t_8 = PyList_GET_ITEM(sequence, 1);
}
__Pyx_INCREF(__pyx_t_4);
__Pyx_INCREF(__pyx_t_8);
@@ -3772,7 +3772,7 @@ static int __pyx_pf_5moses_7dictree_9Alignment___init__(struct __pyx_obj_5moses_
* super(Alignment, self).__init__(alignment)
* else:
* ValueError, 'Cannot figure out pairs from: %s' % type(alignment) # <<<<<<<<<<<<<<
- *
+ *
* def __str__(self):
*/
__pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_Cannot_figure_out_pairs_from_s, ((PyObject *)Py_TYPE(__pyx_v_alignment))); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 120; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -3791,7 +3791,7 @@ static int __pyx_pf_5moses_7dictree_9Alignment___init__(struct __pyx_obj_5moses_
/* "moses/dictree.pyx":110
* """
- *
+ *
* def __init__(self, alignment): # <<<<<<<<<<<<<<
* if type(alignment) is str:
* pairs = []
@@ -3819,10 +3819,10 @@ static int __pyx_pf_5moses_7dictree_9Alignment___init__(struct __pyx_obj_5moses_
/* "moses/dictree.pyx":122
* ValueError, 'Cannot figure out pairs from: %s' % type(alignment)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join('%d-%d' % (s, t) for s, t in self)
- *
+ *
*/
/* Python wrapper */
@@ -3840,10 +3840,10 @@ static PyObject *__pyx_pw_5moses_7dictree_9Alignment_3__str__(PyObject *__pyx_v_
static PyObject *__pyx_gb_5moses_7dictree_9Alignment_7__str___2generator1(__pyx_GeneratorObject *__pyx_generator, PyObject *__pyx_sent_value); /* proto */
/* "moses/dictree.pyx":123
- *
+ *
* def __str__(self):
* return ' '.join('%d-%d' % (s, t) for s, t in self) # <<<<<<<<<<<<<<
- *
+ *
* cdef class FValues(list):
*/
@@ -3960,11 +3960,11 @@ static PyObject *__pyx_gb_5moses_7dictree_9Alignment_7__str___2generator1(__pyx_
}
#if CYTHON_COMPILING_IN_CPYTHON
if (likely(PyTuple_CheckExact(sequence))) {
- __pyx_t_5 = PyTuple_GET_ITEM(sequence, 0);
- __pyx_t_6 = PyTuple_GET_ITEM(sequence, 1);
+ __pyx_t_5 = PyTuple_GET_ITEM(sequence, 0);
+ __pyx_t_6 = PyTuple_GET_ITEM(sequence, 1);
} else {
- __pyx_t_5 = PyList_GET_ITEM(sequence, 0);
- __pyx_t_6 = PyList_GET_ITEM(sequence, 1);
+ __pyx_t_5 = PyList_GET_ITEM(sequence, 0);
+ __pyx_t_6 = PyList_GET_ITEM(sequence, 1);
}
__Pyx_INCREF(__pyx_t_5);
__Pyx_INCREF(__pyx_t_6);
@@ -4056,10 +4056,10 @@ static PyObject *__pyx_gb_5moses_7dictree_9Alignment_7__str___2generator1(__pyx_
/* "moses/dictree.pyx":122
* ValueError, 'Cannot figure out pairs from: %s' % type(alignment)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join('%d-%d' % (s, t) for s, t in self)
- *
+ *
*/
static PyObject *__pyx_pf_5moses_7dictree_9Alignment_2__str__(struct __pyx_obj_5moses_7dictree_Alignment *__pyx_v_self) {
@@ -4083,10 +4083,10 @@ static PyObject *__pyx_pf_5moses_7dictree_9Alignment_2__str__(struct __pyx_obj_5
__Pyx_GIVEREF((PyObject *)__pyx_cur_scope->__pyx_v_self);
/* "moses/dictree.pyx":123
- *
+ *
* def __str__(self):
* return ' '.join('%d-%d' % (s, t) for s, t in self) # <<<<<<<<<<<<<<
- *
+ *
* cdef class FValues(list):
*/
__Pyx_XDECREF(__pyx_r);
@@ -4101,10 +4101,10 @@ static PyObject *__pyx_pf_5moses_7dictree_9Alignment_2__str__(struct __pyx_obj_5
/* "moses/dictree.pyx":122
* ValueError, 'Cannot figure out pairs from: %s' % type(alignment)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join('%d-%d' % (s, t) for s, t in self)
- *
+ *
*/
/* function exit code */
@@ -4122,10 +4122,10 @@ static PyObject *__pyx_pf_5moses_7dictree_9Alignment_2__str__(struct __pyx_obj_5
/* "moses/dictree.pyx":131
* """
- *
+ *
* def __init__(self, values): # <<<<<<<<<<<<<<
* super(FValues, self).__init__(values)
- *
+ *
*/
/* Python wrapper */
@@ -4192,10 +4192,10 @@ static int __pyx_pf_5moses_7dictree_7FValues___init__(struct __pyx_obj_5moses_7d
__Pyx_RefNannySetupContext("__init__", 0);
/* "moses/dictree.pyx":132
- *
+ *
* def __init__(self, values):
* super(FValues, self).__init__(values) # <<<<<<<<<<<<<<
- *
+ *
* def __str__(self):
*/
__pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 132; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -4225,10 +4225,10 @@ static int __pyx_pf_5moses_7dictree_7FValues___init__(struct __pyx_obj_5moses_7d
/* "moses/dictree.pyx":131
* """
- *
+ *
* def __init__(self, values): # <<<<<<<<<<<<<<
* super(FValues, self).__init__(values)
- *
+ *
*/
/* function exit code */
@@ -4247,10 +4247,10 @@ static int __pyx_pf_5moses_7dictree_7FValues___init__(struct __pyx_obj_5moses_7d
/* "moses/dictree.pyx":134
* super(FValues, self).__init__(values)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join(str(x) for x in self)
- *
+ *
*/
/* Python wrapper */
@@ -4268,10 +4268,10 @@ static PyObject *__pyx_pw_5moses_7dictree_7FValues_3__str__(PyObject *__pyx_v_se
static PyObject *__pyx_gb_5moses_7dictree_7FValues_7__str___2generator2(__pyx_GeneratorObject *__pyx_generator, PyObject *__pyx_sent_value); /* proto */
/* "moses/dictree.pyx":135
- *
+ *
* def __str__(self):
* return ' '.join(str(x) for x in self) # <<<<<<<<<<<<<<
- *
+ *
* cdef class TargetProduction(Production):
*/
@@ -4422,10 +4422,10 @@ static PyObject *__pyx_gb_5moses_7dictree_7FValues_7__str___2generator2(__pyx_Ge
/* "moses/dictree.pyx":134
* super(FValues, self).__init__(values)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join(str(x) for x in self)
- *
+ *
*/
static PyObject *__pyx_pf_5moses_7dictree_7FValues_2__str__(struct __pyx_obj_5moses_7dictree_FValues *__pyx_v_self) {
@@ -4449,10 +4449,10 @@ static PyObject *__pyx_pf_5moses_7dictree_7FValues_2__str__(struct __pyx_obj_5mo
__Pyx_GIVEREF((PyObject *)__pyx_cur_scope->__pyx_v_self);
/* "moses/dictree.pyx":135
- *
+ *
* def __str__(self):
* return ' '.join(str(x) for x in self) # <<<<<<<<<<<<<<
- *
+ *
* cdef class TargetProduction(Production):
*/
__Pyx_XDECREF(__pyx_r);
@@ -4467,10 +4467,10 @@ static PyObject *__pyx_pf_5moses_7dictree_7FValues_2__str__(struct __pyx_obj_5mo
/* "moses/dictree.pyx":134
* super(FValues, self).__init__(values)
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* return ' '.join(str(x) for x in self)
- *
+ *
*/
/* function exit code */
@@ -4488,7 +4488,7 @@ static PyObject *__pyx_pf_5moses_7dictree_7FValues_2__str__(struct __pyx_obj_5mo
/* "moses/dictree.pyx":144
* cdef readonly FValues scores
- *
+ *
* def __init__(self, rhs, scores, alignment = [], lhs = None): # <<<<<<<<<<<<<<
* """
* :rhs right-hand side tokens (sequence of terminals and nonterminals)
@@ -4632,7 +4632,7 @@ static int __pyx_pf_5moses_7dictree_16TargetProduction___init__(struct __pyx_obj
* super(TargetProduction, self).__init__(rhs, lhs)
* self.scores = FValues(scores) # <<<<<<<<<<<<<<
* self.alignment = Alignment(alignment)
- *
+ *
*/
__pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_3);
@@ -4652,7 +4652,7 @@ static int __pyx_pf_5moses_7dictree_16TargetProduction___init__(struct __pyx_obj
* super(TargetProduction, self).__init__(rhs, lhs)
* self.scores = FValues(scores)
* self.alignment = Alignment(alignment) # <<<<<<<<<<<<<<
- *
+ *
* @staticmethod
*/
__pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 153; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -4671,7 +4671,7 @@ static int __pyx_pf_5moses_7dictree_16TargetProduction___init__(struct __pyx_obj
/* "moses/dictree.pyx":144
* cdef readonly FValues scores
- *
+ *
* def __init__(self, rhs, scores, alignment = [], lhs = None): # <<<<<<<<<<<<<<
* """
* :rhs right-hand side tokens (sequence of terminals and nonterminals)
@@ -4692,7 +4692,7 @@ static int __pyx_pf_5moses_7dictree_16TargetProduction___init__(struct __pyx_obj
}
/* "moses/dictree.pyx":156
- *
+ *
* @staticmethod
* def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<<
* """Returns the sign of key(y) - key(x).
@@ -4790,7 +4790,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_2desc(PyObject *__p
* Can only be used if scores is not an empty vector as
* keys defaults to scores[0]"""
* return fsign(key(y) - key(x)) # <<<<<<<<<<<<<<
- *
+ *
* def __str__(self):
*/
__Pyx_XDECREF(__pyx_r);
@@ -4823,7 +4823,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_2desc(PyObject *__p
goto __pyx_L0;
/* "moses/dictree.pyx":156
- *
+ *
* @staticmethod
* def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<<
* """Returns the sign of key(y) - key(x).
@@ -4845,7 +4845,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_2desc(PyObject *__p
/* "moses/dictree.pyx":162
* return fsign(key(y) - key(x))
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* """Returns a string such as: <words> ||| <scores> [||| word-alignment info]"""
* if self.lhs:
@@ -4955,7 +4955,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_4__str__(struct __p
* return ' ||| '.join((' '.join(chain(self.rhs, lhs)),
* str(self.scores), # <<<<<<<<<<<<<<
* str(self.alignment)))
- *
+ *
*/
__pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 169; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_4);
@@ -4970,7 +4970,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_4__str__(struct __p
* return ' ||| '.join((' '.join(chain(self.rhs, lhs)),
* str(self.scores),
* str(self.alignment))) # <<<<<<<<<<<<<<
- *
+ *
* def __repr__(self):
*/
__pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -5009,7 +5009,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_4__str__(struct __p
/* "moses/dictree.pyx":162
* return fsign(key(y) - key(x))
- *
+ *
* def __str__(self): # <<<<<<<<<<<<<<
* """Returns a string such as: <words> ||| <scores> [||| word-alignment info]"""
* if self.lhs:
@@ -5032,10 +5032,10 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_4__str__(struct __p
/* "moses/dictree.pyx":172
* str(self.alignment)))
- *
+ *
* def __repr__(self): # <<<<<<<<<<<<<<
* return repr((repr(self.rhs), repr(self.lhs), repr(self.scores), repr(self.alignment)))
- *
+ *
*/
/* Python wrapper */
@@ -5065,10 +5065,10 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_6__repr__(struct __
__Pyx_RefNannySetupContext("__repr__", 0);
/* "moses/dictree.pyx":173
- *
+ *
* def __repr__(self):
* return repr((repr(self.rhs), repr(self.lhs), repr(self.scores), repr(self.alignment))) # <<<<<<<<<<<<<<
- *
+ *
* cdef class QueryResult(list):
*/
__Pyx_XDECREF(__pyx_r);
@@ -5115,10 +5115,10 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_6__repr__(struct __
/* "moses/dictree.pyx":172
* str(self.alignment)))
- *
+ *
* def __repr__(self): # <<<<<<<<<<<<<<
* return repr((repr(self.rhs), repr(self.lhs), repr(self.scores), repr(self.alignment)))
- *
+ *
*/
/* function exit code */
@@ -5141,7 +5141,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_6__repr__(struct __
* """
* cdef readonly Alignment alignment # <<<<<<<<<<<<<<
* cdef readonly FValues scores
- *
+ *
*/
/* Python wrapper */
@@ -5177,7 +5177,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_9alignment___get__(
* """
* cdef readonly Alignment alignment
* cdef readonly FValues scores # <<<<<<<<<<<<<<
- *
+ *
* def __init__(self, rhs, scores, alignment = [], lhs = None):
*/
@@ -5212,7 +5212,7 @@ static PyObject *__pyx_pf_5moses_7dictree_16TargetProduction_6scores___get__(str
/* "moses/dictree.pyx":179
* cdef readonly Production source
- *
+ *
* def __init__(self, source, targets = []): # <<<<<<<<<<<<<<
* super(QueryResult, self).__init__(targets)
* self.source = source
@@ -5294,11 +5294,11 @@ static int __pyx_pf_5moses_7dictree_11QueryResult___init__(struct __pyx_obj_5mos
__Pyx_RefNannySetupContext("__init__", 0);
/* "moses/dictree.pyx":180
- *
+ *
* def __init__(self, source, targets = []):
* super(QueryResult, self).__init__(targets) # <<<<<<<<<<<<<<
* self.source = source
- *
+ *
*/
__pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 180; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -5329,8 +5329,8 @@ static int __pyx_pf_5moses_7dictree_11QueryResult___init__(struct __pyx_obj_5mos
* def __init__(self, source, targets = []):
* super(QueryResult, self).__init__(targets)
* self.source = source # <<<<<<<<<<<<<<
- *
- *
+ *
+ *
*/
if (!(likely(((__pyx_v_source) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_source, __pyx_ptype_5moses_7dictree_Production))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 181; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__pyx_t_3 = __pyx_v_source;
@@ -5343,7 +5343,7 @@ static int __pyx_pf_5moses_7dictree_11QueryResult___init__(struct __pyx_obj_5mos
/* "moses/dictree.pyx":179
* cdef readonly Production source
- *
+ *
* def __init__(self, source, targets = []): # <<<<<<<<<<<<<<
* super(QueryResult, self).__init__(targets)
* self.source = source
@@ -5365,9 +5365,9 @@ static int __pyx_pf_5moses_7dictree_11QueryResult___init__(struct __pyx_obj_5mos
/* "moses/dictree.pyx":177
* cdef class QueryResult(list):
- *
+ *
* cdef readonly Production source # <<<<<<<<<<<<<<
- *
+ *
* def __init__(self, source, targets = []):
*/
@@ -5401,7 +5401,7 @@ static PyObject *__pyx_pf_5moses_7dictree_11QueryResult_6source___get__(struct _
}
/* "moses/dictree.pyx":187
- *
+ *
* @classmethod
* def canLoad(cls, path, bint wa = False): # <<<<<<<<<<<<<<
* """Whether or not the path represents a valid table for that class."""
@@ -5488,14 +5488,14 @@ static PyObject *__pyx_pf_5moses_7dictree_14DictionaryTree_canLoad(CYTHON_UNUSED
* def canLoad(cls, path, bint wa = False):
* """Whether or not the path represents a valid table for that class."""
* raise NotImplementedError # <<<<<<<<<<<<<<
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None):
*/
__Pyx_Raise(__pyx_builtin_NotImplementedError, 0, 0, 0);
{__pyx_filename = __pyx_f[0]; __pyx_lineno = 189; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
/* "moses/dictree.pyx":187
- *
+ *
* @classmethod
* def canLoad(cls, path, bint wa = False): # <<<<<<<<<<<<<<
* """Whether or not the path represents a valid table for that class."""
@@ -5513,7 +5513,7 @@ static PyObject *__pyx_pf_5moses_7dictree_14DictionaryTree_canLoad(CYTHON_UNUSED
/* "moses/dictree.pyx":191
* raise NotImplementedError
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -5616,7 +5616,7 @@ static PyObject *__pyx_pf_5moses_7dictree_14DictionaryTree_2query(CYTHON_UNUSED
* :return QueryResult
* """
* raise NotImplementedError # <<<<<<<<<<<<<<
- *
+ *
* cdef class PhraseDictionaryTree(DictionaryTree):
*/
__Pyx_Raise(__pyx_builtin_NotImplementedError, 0, 0, 0);
@@ -5624,7 +5624,7 @@ static PyObject *__pyx_pf_5moses_7dictree_14DictionaryTree_2query(CYTHON_UNUSED
/* "moses/dictree.pyx":191
* raise NotImplementedError
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -5641,7 +5641,7 @@ static PyObject *__pyx_pf_5moses_7dictree_14DictionaryTree_2query(CYTHON_UNUSED
/* "moses/dictree.pyx":213
* cdef readonly unsigned tableLimit
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, unsigned nscores = 5, bint wa = False, delimiters = ' \t'): # <<<<<<<<<<<<<<
* """
* :path stem of the table, e.g europarl.fr-en is the stem for europar.fr-en.binphr.*
@@ -5771,7 +5771,7 @@ static int __pyx_pf_5moses_7dictree_20PhraseDictionaryTree___cinit__(struct __py
/* "moses/dictree.pyx":221
* """
- *
+ *
* if not PhraseDictionaryTree.canLoad(path, wa): # <<<<<<<<<<<<<<
* raise ValueError, "'%s' doesn't seem a valid binary table." % path
* self.path = path
@@ -5798,7 +5798,7 @@ static int __pyx_pf_5moses_7dictree_20PhraseDictionaryTree___cinit__(struct __py
if (__pyx_t_5) {
/* "moses/dictree.pyx":222
- *
+ *
* if not PhraseDictionaryTree.canLoad(path, wa):
* raise ValueError, "'%s' doesn't seem a valid binary table." % path # <<<<<<<<<<<<<<
* self.path = path
@@ -5881,7 +5881,7 @@ static int __pyx_pf_5moses_7dictree_20PhraseDictionaryTree___cinit__(struct __py
* self.tree = new cdictree.PhraseDictionaryTree()
* self.tree.NeedAlignmentInfo(wa) # <<<<<<<<<<<<<<
* self.tree.Read(path)
- *
+ *
*/
__pyx_v_self->tree->NeedAlignmentInfo(__pyx_v_wa);
@@ -5889,7 +5889,7 @@ static int __pyx_pf_5moses_7dictree_20PhraseDictionaryTree___cinit__(struct __py
* self.tree = new cdictree.PhraseDictionaryTree()
* self.tree.NeedAlignmentInfo(wa)
* self.tree.Read(path) # <<<<<<<<<<<<<<
- *
+ *
* def __dealloc__(self):
*/
__pyx_t_6 = __pyx_convert_string_from_py_(__pyx_v_path); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 230; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -5897,7 +5897,7 @@ static int __pyx_pf_5moses_7dictree_20PhraseDictionaryTree___cinit__(struct __py
/* "moses/dictree.pyx":213
* cdef readonly unsigned tableLimit
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, unsigned nscores = 5, bint wa = False, delimiters = ' \t'): # <<<<<<<<<<<<<<
* """
* :path stem of the table, e.g europarl.fr-en is the stem for europar.fr-en.binphr.*
@@ -5919,10 +5919,10 @@ static int __pyx_pf_5moses_7dictree_20PhraseDictionaryTree___cinit__(struct __py
/* "moses/dictree.pyx":232
* self.tree.Read(path)
- *
+ *
* def __dealloc__(self): # <<<<<<<<<<<<<<
* del self.tree
- *
+ *
*/
/* Python wrapper */
@@ -5941,20 +5941,20 @@ static void __pyx_pf_5moses_7dictree_20PhraseDictionaryTree_2__dealloc__(struct
__Pyx_RefNannySetupContext("__dealloc__", 0);
/* "moses/dictree.pyx":233
- *
+ *
* def __dealloc__(self):
* del self.tree # <<<<<<<<<<<<<<
- *
+ *
* @classmethod
*/
delete __pyx_v_self->tree;
/* "moses/dictree.pyx":232
* self.tree.Read(path)
- *
+ *
* def __dealloc__(self): # <<<<<<<<<<<<<<
* del self.tree
- *
+ *
*/
/* function exit code */
@@ -5962,7 +5962,7 @@ static void __pyx_pf_5moses_7dictree_20PhraseDictionaryTree_2__dealloc__(struct
}
/* "moses/dictree.pyx":236
- *
+ *
* @classmethod
* def canLoad(cls, stem, bint wa = False): # <<<<<<<<<<<<<<
* """This sanity check was added to the constructor, but you can access it from outside this class
@@ -6368,7 +6368,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_4canLoad(CYTHON
* and os.path.isfile(stem + ".binphr.srcvoc") \
* and os.path.isfile(stem + ".binphr.tgtdata") \ # <<<<<<<<<<<<<<
* and os.path.isfile(stem + ".binphr.tgtvoc")
- *
+ *
*/
__pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_os); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 249; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_8);
@@ -6397,7 +6397,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_4canLoad(CYTHON
* and os.path.isfile(stem + ".binphr.srcvoc") \
* and os.path.isfile(stem + ".binphr.tgtdata") \
* and os.path.isfile(stem + ".binphr.tgtvoc") # <<<<<<<<<<<<<<
- *
+ *
* cdef TargetProduction getTargetProduction(self, cdictree.StringTgtCand& cand, wa = None, converter = None):
*/
__pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_os); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 250; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -6449,7 +6449,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_4canLoad(CYTHON
}
/* "moses/dictree.pyx":236
- *
+ *
* @classmethod
* def canLoad(cls, stem, bint wa = False): # <<<<<<<<<<<<<<
* """This sanity check was added to the constructor, but you can access it from outside this class
@@ -6475,7 +6475,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_4canLoad(CYTHON
/* "moses/dictree.pyx":252
* and os.path.isfile(stem + ".binphr.tgtvoc")
- *
+ *
* cdef TargetProduction getTargetProduction(self, cdictree.StringTgtCand& cand, wa = None, converter = None): # <<<<<<<<<<<<<<
* """Converts a StringTgtCandidate (c++ object) and possibly a word-alignment info (string) to a TargetProduction (python object)."""
* cdef list words = [cand.tokens[i].c_str() for i in xrange(cand.tokens.size())]
@@ -6538,7 +6538,7 @@ static struct __pyx_obj_5moses_7dictree_TargetProduction *__pyx_f_5moses_7dictre
* cdef list words = [cand.tokens[i].c_str() for i in xrange(cand.tokens.size())]
* cdef list scores = [score for score in cand.scores] if converter is None else [converter(score) for score in cand.scores] # <<<<<<<<<<<<<<
* return TargetProduction(words, scores, wa)
- *
+ *
*/
__pyx_t_5 = (__pyx_v_converter == Py_None);
if ((__pyx_t_5 != 0)) {
@@ -6592,7 +6592,7 @@ static struct __pyx_obj_5moses_7dictree_TargetProduction *__pyx_f_5moses_7dictre
* cdef list words = [cand.tokens[i].c_str() for i in xrange(cand.tokens.size())]
* cdef list scores = [score for score in cand.scores] if converter is None else [converter(score) for score in cand.scores]
* return TargetProduction(words, scores, wa) # <<<<<<<<<<<<<<
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None):
*/
__Pyx_XDECREF(((PyObject *)__pyx_r));
@@ -6616,7 +6616,7 @@ static struct __pyx_obj_5moses_7dictree_TargetProduction *__pyx_f_5moses_7dictre
/* "moses/dictree.pyx":252
* and os.path.isfile(stem + ".binphr.tgtvoc")
- *
+ *
* cdef TargetProduction getTargetProduction(self, cdictree.StringTgtCand& cand, wa = None, converter = None): # <<<<<<<<<<<<<<
* """Converts a StringTgtCandidate (c++ object) and possibly a word-alignment info (string) to a TargetProduction (python object)."""
* cdef list words = [cand.tokens[i].c_str() for i in xrange(cand.tokens.size())]
@@ -6641,7 +6641,7 @@ static struct __pyx_obj_5moses_7dictree_TargetProduction *__pyx_f_5moses_7dictre
/* "moses/dictree.pyx":258
* return TargetProduction(words, scores, wa)
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -6738,7 +6738,7 @@ static PyObject *__pyx_gb_5moses_7dictree_20PhraseDictionaryTree_5query_2generat
* cdef vector[string]* wa = NULL
* cdef Production source = Production(f.c_str() for f in fphrase) # <<<<<<<<<<<<<<
* cdef QueryResult results = QueryResult(source)
- *
+ *
*/
static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_5query_genexpr(PyObject *__pyx_self) {
@@ -6836,7 +6836,7 @@ static PyObject *__pyx_gb_5moses_7dictree_20PhraseDictionaryTree_5query_2generat
/* "moses/dictree.pyx":258
* return TargetProduction(words, scores, wa)
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -6930,7 +6930,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
* cdef vector[string]* wa = NULL
* cdef Production source = Production(f.c_str() for f in fphrase) # <<<<<<<<<<<<<<
* cdef QueryResult results = QueryResult(source)
- *
+ *
*/
__pyx_t_1 = __pyx_pf_5moses_7dictree_20PhraseDictionaryTree_5query_genexpr(((PyObject*)__pyx_cur_scope)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 271; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -6949,7 +6949,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
* cdef vector[string]* wa = NULL
* cdef Production source = Production(f.c_str() for f in fphrase)
* cdef QueryResult results = QueryResult(source) # <<<<<<<<<<<<<<
- *
+ *
* if not self.wa:
*/
__pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 272; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -6965,7 +6965,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
/* "moses/dictree.pyx":274
* cdef QueryResult results = QueryResult(source)
- *
+ *
* if not self.wa: # <<<<<<<<<<<<<<
* self.tree.GetTargetCandidates(fphrase, rv[0])
* results.extend([self.getTargetProduction(candidate, None, converter) for candidate in rv[0]])
@@ -6974,7 +6974,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
if (__pyx_t_6) {
/* "moses/dictree.pyx":275
- *
+ *
* if not self.wa:
* self.tree.GetTargetCandidates(fphrase, rv[0]) # <<<<<<<<<<<<<<
* results.extend([self.getTargetProduction(candidate, None, converter) for candidate in rv[0]])
@@ -7175,7 +7175,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
* return QueryResult(source, results[0:self.tableLimit])
* else:
* return results # <<<<<<<<<<<<<<
- *
+ *
* cdef class OnDiskWrapper(DictionaryTree):
*/
__Pyx_XDECREF(__pyx_r);
@@ -7186,7 +7186,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
/* "moses/dictree.pyx":258
* return TargetProduction(words, scores, wa)
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -7211,7 +7211,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_6query(struct _
}
/* "moses/dictree.pyx":207
- *
+ *
* cdef cdictree.PhraseDictionaryTree* tree
* cdef readonly bytes path # <<<<<<<<<<<<<<
* cdef readonly unsigned nscores
@@ -7346,7 +7346,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_2wa___get__(str
* cdef readonly bint wa
* cdef readonly bytes delimiters # <<<<<<<<<<<<<<
* cdef readonly unsigned tableLimit
- *
+ *
*/
/* Python wrapper */
@@ -7382,7 +7382,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_10delimiters___
* cdef readonly bint wa
* cdef readonly bytes delimiters
* cdef readonly unsigned tableLimit # <<<<<<<<<<<<<<
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, unsigned nscores = 5, bint wa = False, delimiters = ' \t'):
*/
@@ -7427,7 +7427,7 @@ static PyObject *__pyx_pf_5moses_7dictree_20PhraseDictionaryTree_10tableLimit___
/* "moses/dictree.pyx":297
* cdef readonly unsigned tableLimit
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, delimiters = ' \t'): # <<<<<<<<<<<<<<
* self.delimiters = delimiters
* self.tableLimit = tableLimit
@@ -7527,7 +7527,7 @@ static int __pyx_pf_5moses_7dictree_13OnDiskWrapper___cinit__(struct __pyx_obj_5
__Pyx_RefNannySetupContext("__cinit__", 0);
/* "moses/dictree.pyx":298
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, delimiters = ' \t'):
* self.delimiters = delimiters # <<<<<<<<<<<<<<
* self.tableLimit = tableLimit
@@ -7565,7 +7565,7 @@ static int __pyx_pf_5moses_7dictree_13OnDiskWrapper___cinit__(struct __pyx_obj_5
* self.wrapper = new condiskpt.OnDiskWrapper()
* self.wrapper.BeginLoad(string(path)) # <<<<<<<<<<<<<<
* self.finder = new condiskpt.OnDiskQuery(self.wrapper[0])
- *
+ *
*/
__pyx_t_2 = __Pyx_PyObject_AsString(__pyx_v_path); if (unlikely((!__pyx_t_2) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 301; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
try {
@@ -7580,14 +7580,14 @@ static int __pyx_pf_5moses_7dictree_13OnDiskWrapper___cinit__(struct __pyx_obj_5
* self.wrapper = new condiskpt.OnDiskWrapper()
* self.wrapper.BeginLoad(string(path))
* self.finder = new condiskpt.OnDiskQuery(self.wrapper[0]) # <<<<<<<<<<<<<<
- *
+ *
* @classmethod
*/
__pyx_v_self->finder = new OnDiskPt::OnDiskQuery((__pyx_v_self->wrapper[0]));
/* "moses/dictree.pyx":297
* cdef readonly unsigned tableLimit
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, delimiters = ' \t'): # <<<<<<<<<<<<<<
* self.delimiters = delimiters
* self.tableLimit = tableLimit
@@ -7606,7 +7606,7 @@ static int __pyx_pf_5moses_7dictree_13OnDiskWrapper___cinit__(struct __pyx_obj_5
}
/* "moses/dictree.pyx":305
- *
+ *
* @classmethod
* def canLoad(cls, stem, bint wa = False): # <<<<<<<<<<<<<<
* return os.path.isfile(stem + "/Misc.dat") \
@@ -7808,7 +7808,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_2canLoad(CYTHON_UNUSED
* and os.path.isfile(stem + "/TargetColl.dat") \
* and os.path.isfile(stem + "/TargetInd.dat") \ # <<<<<<<<<<<<<<
* and os.path.isfile(stem + "/Vocab.dat")
- *
+ *
*/
__pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_os); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 309; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_6);
@@ -7837,7 +7837,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_2canLoad(CYTHON_UNUSED
* and os.path.isfile(stem + "/TargetColl.dat") \
* and os.path.isfile(stem + "/TargetInd.dat") \
* and os.path.isfile(stem + "/Vocab.dat") # <<<<<<<<<<<<<<
- *
+ *
* cdef Production getSourceProduction(self, vector[string] ftokens):
*/
__pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_os); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 310; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -7888,7 +7888,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_2canLoad(CYTHON_UNUSED
goto __pyx_L0;
/* "moses/dictree.pyx":305
- *
+ *
* @classmethod
* def canLoad(cls, stem, bint wa = False): # <<<<<<<<<<<<<<
* return os.path.isfile(stem + "/Misc.dat") \
@@ -7914,7 +7914,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_2canLoad(CYTHON_UNUSED
/* "moses/dictree.pyx":312
* and os.path.isfile(stem + "/Vocab.dat")
- *
+ *
* cdef Production getSourceProduction(self, vector[string] ftokens): # <<<<<<<<<<<<<<
* cdef list tokens = [f.c_str() for f in ftokens]
* return Production(tokens[:-1], tokens[-1])
@@ -7936,11 +7936,11 @@ static struct __pyx_obj_5moses_7dictree_Production *__pyx_f_5moses_7dictree_13On
__Pyx_RefNannySetupContext("getSourceProduction", 0);
/* "moses/dictree.pyx":313
- *
+ *
* cdef Production getSourceProduction(self, vector[string] ftokens):
* cdef list tokens = [f.c_str() for f in ftokens] # <<<<<<<<<<<<<<
* return Production(tokens[:-1], tokens[-1])
- *
+ *
*/
__pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 313; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -7962,7 +7962,7 @@ static struct __pyx_obj_5moses_7dictree_Production *__pyx_f_5moses_7dictree_13On
* cdef Production getSourceProduction(self, vector[string] ftokens):
* cdef list tokens = [f.c_str() for f in ftokens]
* return Production(tokens[:-1], tokens[-1]) # <<<<<<<<<<<<<<
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None):
*/
__Pyx_XDECREF(((PyObject *)__pyx_r));
@@ -7987,7 +7987,7 @@ static struct __pyx_obj_5moses_7dictree_Production *__pyx_f_5moses_7dictree_13On
/* "moses/dictree.pyx":312
* and os.path.isfile(stem + "/Vocab.dat")
- *
+ *
* cdef Production getSourceProduction(self, vector[string] ftokens): # <<<<<<<<<<<<<<
* cdef list tokens = [f.c_str() for f in ftokens]
* return Production(tokens[:-1], tokens[-1])
@@ -8009,7 +8009,7 @@ static struct __pyx_obj_5moses_7dictree_Production *__pyx_f_5moses_7dictree_13On
/* "moses/dictree.pyx":316
* return Production(tokens[:-1], tokens[-1])
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -8231,7 +8231,7 @@ static PyObject *__pyx_gb_5moses_7dictree_13OnDiskWrapper_5query_2generator4(__p
/* "moses/dictree.pyx":316
* return Production(tokens[:-1], tokens[-1])
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -8549,7 +8549,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_4query(struct __pyx_ob
* if cmp:
* results.sort(cmp=cmp, key=key) # <<<<<<<<<<<<<<
* return results
- *
+ *
*/
__pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_results), __pyx_n_s_sort); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 346; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -8570,7 +8570,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_4query(struct __pyx_ob
* if cmp:
* results.sort(cmp=cmp, key=key)
* return results # <<<<<<<<<<<<<<
- *
+ *
* def load(path, nscores, limit):
*/
__Pyx_XDECREF(__pyx_r);
@@ -8580,7 +8580,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_4query(struct __pyx_ob
/* "moses/dictree.pyx":316
* return Production(tokens[:-1], tokens[-1])
- *
+ *
* def query(self, line, converter = None, cmp = None, key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -8611,7 +8611,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_4query(struct __pyx_ob
* cdef condiskpt.OnDiskQuery *finder
* cdef readonly bytes delimiters # <<<<<<<<<<<<<<
* cdef readonly unsigned tableLimit
- *
+ *
*/
/* Python wrapper */
@@ -8647,7 +8647,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_10delimiters___get__(s
* cdef condiskpt.OnDiskQuery *finder
* cdef readonly bytes delimiters
* cdef readonly unsigned tableLimit # <<<<<<<<<<<<<<
- *
+ *
* def __cinit__(self, bytes path, unsigned tableLimit = 20, delimiters = ' \t'):
*/
@@ -8692,7 +8692,7 @@ static PyObject *__pyx_pf_5moses_7dictree_13OnDiskWrapper_10tableLimit___get__(s
/* "moses/dictree.pyx":349
* return results
- *
+ *
* def load(path, nscores, limit): # <<<<<<<<<<<<<<
* """Finds out the correct implementation depending on the content of 'path' and returns the appropriate dictionary tree."""
* if PhraseDictionaryTree.canLoad(path, False):
@@ -8953,7 +8953,7 @@ static PyObject *__pyx_pf_5moses_7dictree_2load(CYTHON_UNUSED PyObject *__pyx_se
/* "moses/dictree.pyx":349
* return results
- *
+ *
* def load(path, nscores, limit): # <<<<<<<<<<<<<<
* """Finds out the correct implementation depending on the content of 'path' and returns the appropriate dictionary tree."""
* if PhraseDictionaryTree.canLoad(path, False):
@@ -8973,7 +8973,7 @@ static PyObject *__pyx_pf_5moses_7dictree_2load(CYTHON_UNUSED PyObject *__pyx_se
}
/* "string.from_py":13
- *
+ *
* @cname("__pyx_convert_string_from_py_")
* cdef string __pyx_convert_string_from_py_(object o) except *: # <<<<<<<<<<<<<<
* cdef Py_ssize_t length
@@ -8996,7 +8996,7 @@ static std::string __pyx_convert_string_from_py_(PyObject *__pyx_v_o) {
* cdef Py_ssize_t length
* cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length) # <<<<<<<<<<<<<<
* return string(data, length)
- *
+ *
*/
__pyx_t_1 = __Pyx_PyObject_AsStringAndSize(__pyx_v_o, (&__pyx_v_length)); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__pyx_v_data = __pyx_t_1;
@@ -9005,14 +9005,14 @@ static std::string __pyx_convert_string_from_py_(PyObject *__pyx_v_o) {
* cdef Py_ssize_t length
* cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length)
* return string(data, length) # <<<<<<<<<<<<<<
- *
- *
+ *
+ *
*/
__pyx_r = std::string(__pyx_v_data, __pyx_v_length);
goto __pyx_L0;
/* "string.from_py":13
- *
+ *
* @cname("__pyx_convert_string_from_py_")
* cdef string __pyx_convert_string_from_py_(object o) except *: # <<<<<<<<<<<<<<
* cdef Py_ssize_t length
@@ -9028,11 +9028,11 @@ static std::string __pyx_convert_string_from_py_(PyObject *__pyx_v_o) {
}
/* "vector.to_py":63
- *
+ *
* @cname("__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue")
* cdef object __pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(vector[X]& v): # <<<<<<<<<<<<<<
* return [X_to_py(v[i]) for i in range(v.size())]
- *
+ *
*/
static PyObject *__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(const std::vector<__pyx_t_5moses_8cdictree_FValue> &__pyx_v_v) {
@@ -9052,8 +9052,8 @@ static PyObject *__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(cons
* @cname("__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue")
* cdef object __pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(vector[X]& v):
* return [X_to_py(v[i]) for i in range(v.size())] # <<<<<<<<<<<<<<
- *
- *
+ *
+ *
*/
__Pyx_XDECREF(__pyx_r);
__pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -9071,11 +9071,11 @@ static PyObject *__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(cons
goto __pyx_L0;
/* "vector.to_py":63
- *
+ *
* @cname("__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue")
* cdef object __pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(vector[X]& v): # <<<<<<<<<<<<<<
* return [X_to_py(v[i]) for i in range(v.size())]
- *
+ *
*/
/* function exit code */
@@ -9091,11 +9091,11 @@ static PyObject *__pyx_convert_vector_to_py___pyx_t_5moses_8cdictree_FValue(cons
}
/* "pair.to_py":180
- *
+ *
* @cname("__pyx_convert_pair_to_py_int____int")
* cdef object __pyx_convert_pair_to_py_int____int(pair[X,Y]& p): # <<<<<<<<<<<<<<
* return X_to_py(p.first), Y_to_py(p.second)
- *
+ *
*/
static PyObject *__pyx_convert_pair_to_py_int____int(const std::pair<int,int> &__pyx_v_p) {
@@ -9113,8 +9113,8 @@ static PyObject *__pyx_convert_pair_to_py_int____int(const std::pair<int,int> &_
* @cname("__pyx_convert_pair_to_py_int____int")
* cdef object __pyx_convert_pair_to_py_int____int(pair[X,Y]& p):
* return X_to_py(p.first), Y_to_py(p.second) # <<<<<<<<<<<<<<
- *
- *
+ *
+ *
*/
__Pyx_XDECREF(__pyx_r);
__pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_p.first); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 181; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -9134,11 +9134,11 @@ static PyObject *__pyx_convert_pair_to_py_int____int(const std::pair<int,int> &_
goto __pyx_L0;
/* "pair.to_py":180
- *
+ *
* @cname("__pyx_convert_pair_to_py_int____int")
* cdef object __pyx_convert_pair_to_py_int____int(pair[X,Y]& p): # <<<<<<<<<<<<<<
* return X_to_py(p.first), Y_to_py(p.second)
- *
+ *
*/
/* function exit code */
@@ -9155,11 +9155,11 @@ static PyObject *__pyx_convert_pair_to_py_int____int(const std::pair<int,int> &_
}
/* "vector.to_py":63
- *
+ *
* @cname("__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair")
* cdef object __pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(vector[X]& v): # <<<<<<<<<<<<<<
* return [X_to_py(v[i]) for i in range(v.size())]
- *
+ *
*/
static PyObject *__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(const std::vector<OnDiskPt::AlignPair> &__pyx_v_v) {
@@ -9179,8 +9179,8 @@ static PyObject *__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(const std:
* @cname("__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair")
* cdef object __pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(vector[X]& v):
* return [X_to_py(v[i]) for i in range(v.size())] # <<<<<<<<<<<<<<
- *
- *
+ *
+ *
*/
__Pyx_XDECREF(__pyx_r);
__pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -9198,11 +9198,11 @@ static PyObject *__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(const std:
goto __pyx_L0;
/* "vector.to_py":63
- *
+ *
* @cname("__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair")
* cdef object __pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(vector[X]& v): # <<<<<<<<<<<<<<
* return [X_to_py(v[i]) for i in range(v.size())]
- *
+ *
*/
/* function exit code */
@@ -11255,7 +11255,7 @@ static int __Pyx_InitCachedConstants(void) {
* elif isinstance(data, unicode):
* return data.encode('UTF-8') # <<<<<<<<<<<<<<
* raise TypeError('Cannot convert %s to string' % type(data))
- *
+ *
*/
__pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_UTF_8); if (unlikely(!__pyx_tuple_)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_tuple_);
@@ -11273,7 +11273,7 @@ static int __Pyx_InitCachedConstants(void) {
__Pyx_GIVEREF(__pyx_tuple__4);
/* "moses/dictree.pyx":156
- *
+ *
* @staticmethod
* def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<<
* """Returns the sign of key(y) - key(x).
@@ -11286,7 +11286,7 @@ static int __Pyx_InitCachedConstants(void) {
/* "moses/dictree.pyx":349
* return results
- *
+ *
* def load(path, nscores, limit): # <<<<<<<<<<<<<<
* """Finds out the correct implementation depending on the content of 'path' and returns the appropriate dictionary tree."""
* if PhraseDictionaryTree.canLoad(path, False):
@@ -11539,7 +11539,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
* cimport cdictree
* cimport condiskpt
* from math import log # <<<<<<<<<<<<<<
- *
+ *
* cpdef int fsign(float x):
*/
__pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -11558,7 +11558,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":144
* cdef readonly FValues scores
- *
+ *
* def __init__(self, rhs, scores, alignment = [], lhs = None): # <<<<<<<<<<<<<<
* """
* :rhs right-hand side tokens (sequence of terminals and nonterminals)
@@ -11570,7 +11570,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
__pyx_t_1 = 0;
/* "moses/dictree.pyx":156
- *
+ *
* @staticmethod
* def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<<
* """Returns the sign of key(y) - key(x).
@@ -11584,7 +11584,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":155
* self.alignment = Alignment(alignment)
- *
+ *
* @staticmethod # <<<<<<<<<<<<<<
* def desc(x, y, key = lambda r: r.scores[0]):
* """Returns the sign of key(y) - key(x).
@@ -11604,7 +11604,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
PyType_Modified(__pyx_ptype_5moses_7dictree_TargetProduction);
/* "moses/dictree.pyx":156
- *
+ *
* @staticmethod
* def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<<
* """Returns the sign of key(y) - key(x).
@@ -11615,7 +11615,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":155
* self.alignment = Alignment(alignment)
- *
+ *
* @staticmethod # <<<<<<<<<<<<<<
* def desc(x, y, key = lambda r: r.scores[0]):
* """Returns the sign of key(y) - key(x).
@@ -11634,7 +11634,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":179
* cdef readonly Production source
- *
+ *
* def __init__(self, source, targets = []): # <<<<<<<<<<<<<<
* super(QueryResult, self).__init__(targets)
* self.source = source
@@ -11646,7 +11646,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
__pyx_t_1 = 0;
/* "moses/dictree.pyx":187
- *
+ *
* @classmethod
* def canLoad(cls, path, bint wa = False): # <<<<<<<<<<<<<<
* """Whether or not the path represents a valid table for that class."""
@@ -11657,7 +11657,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":186
* cdef class DictionaryTree(object):
- *
+ *
* @classmethod # <<<<<<<<<<<<<<
* def canLoad(cls, path, bint wa = False):
* """Whether or not the path represents a valid table for that class."""
@@ -11670,7 +11670,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
PyType_Modified(__pyx_ptype_5moses_7dictree_DictionaryTree);
/* "moses/dictree.pyx":236
- *
+ *
* @classmethod
* def canLoad(cls, stem, bint wa = False): # <<<<<<<<<<<<<<
* """This sanity check was added to the constructor, but you can access it from outside this class
@@ -11681,7 +11681,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":235
* del self.tree
- *
+ *
* @classmethod # <<<<<<<<<<<<<<
* def canLoad(cls, stem, bint wa = False):
* """This sanity check was added to the constructor, but you can access it from outside this class
@@ -11695,7 +11695,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":258
* return TargetProduction(words, scores, wa)
- *
+ *
* def query(self, line, converter = lambda x: log(x), cmp = lambda x, y: fsign(y.scores[2] - x.scores[2]), key = None): # <<<<<<<<<<<<<<
* """
* Returns a list of target productions that translate a given source production
@@ -11712,7 +11712,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
__pyx_t_1 = 0;
/* "moses/dictree.pyx":305
- *
+ *
* @classmethod
* def canLoad(cls, stem, bint wa = False): # <<<<<<<<<<<<<<
* return os.path.isfile(stem + "/Misc.dat") \
@@ -11723,7 +11723,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":304
* self.finder = new condiskpt.OnDiskQuery(self.wrapper[0])
- *
+ *
* @classmethod # <<<<<<<<<<<<<<
* def canLoad(cls, stem, bint wa = False):
* return os.path.isfile(stem + "/Misc.dat") \
@@ -11737,7 +11737,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":349
* return results
- *
+ *
* def load(path, nscores, limit): # <<<<<<<<<<<<<<
* """Finds out the correct implementation depending on the content of 'path' and returns the appropriate dictionary tree."""
* if PhraseDictionaryTree.canLoad(path, False):
@@ -11749,7 +11749,7 @@ PyMODINIT_FUNC PyInit_dictree(void)
/* "moses/dictree.pyx":1
* # This module wraps phrase/rule tables # <<<<<<<<<<<<<<
- *
+ *
* from libcpp.string cimport string
*/
__pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -11758,11 +11758,11 @@ PyMODINIT_FUNC PyInit_dictree(void)
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
/* "vector.to_py":63
- *
+ *
* @cname("__pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair")
* cdef object __pyx_convert_vector_to_py_OnDiskPt_3a__3a_AlignPair(vector[X]& v): # <<<<<<<<<<<<<<
* return [X_to_py(v[i]) for i in range(v.size())]
- *
+ *
*/
goto __pyx_L0;
__pyx_L1_error:;
diff --git a/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp b/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp
index cfc15d592..989c727a2 100755
--- a/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp
+++ b/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp
@@ -1,231 +1,231 @@
-// XGetopt.cpp Version 1.2
-//
-// Author: Hans Dietrich
-// hdietrich2@hotmail.com
-//
-// Description:
-// XGetopt.cpp implements getopt(), a function to parse command lines.
-//
-// History
-// Version 1.2 - 2003 May 17
-// - Added Unicode support
-//
-// Version 1.1 - 2002 March 10
-// - Added example to XGetopt.cpp module header
-//
-// This software is released into the public domain.
-// You are free to use it in any way you like.
-//
-// This software is provided "as is" with no expressed
-// or implied warranty. I accept no liability for any
-// damage or loss of business that this software may cause.
-//
-///////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////
-// if you are using precompiled headers then include this line:
-///////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////
-// if you are not using precompiled headers then include these lines:
-//#include <windows.h>
-//#include <cstdio>
-//#include <tchar.h>
-///////////////////////////////////////////////////////////////////////////////
-
-
-#include <cstdio>
-#include <cstring>
-#include <cmath>
-#include "WIN32_functions.h"
-
-
-///////////////////////////////////////////////////////////////////////////////
-//
-// X G e t o p t . c p p
-//
-//
-// NAME
-// getopt -- parse command line options
-//
-// SYNOPSIS
-// int getopt(int argc, char *argv[], char *optstring)
-//
-// extern char *optarg;
-// extern int optind;
-//
-// DESCRIPTION
-// The getopt() function parses the command line arguments. Its
-// arguments argc and argv are the argument count and array as
-// passed into the application on program invocation. In the case
-// of Visual C++ programs, argc and argv are available via the
-// variables __argc and __argv (double underscores), respectively.
-// getopt returns the next option letter in argv that matches a
-// letter in optstring. (Note: Unicode programs should use
-// __targv instead of __argv. Also, all character and string
-// literals should be enclosed in ( ) ).
-//
-// optstring is a string of recognized option letters; if a letter
-// is followed by a colon, the option is expected to have an argument
-// that may or may not be separated from it by white space. optarg
-// is set to point to the start of the option argument on return from
-// getopt.
-//
-// Option letters may be combined, e.g., "-ab" is equivalent to
-// "-a -b". Option letters are case sensitive.
-//
-// getopt places in the external variable optind the argv index
-// of the next argument to be processed. optind is initialized
-// to 0 before the first call to getopt.
-//
-// When all options have been processed (i.e., up to the first
-// non-option argument), getopt returns EOF, optarg will point
-// to the argument, and optind will be set to the argv index of
-// the argument. If there are no non-option arguments, optarg
-// will be set to NULL.
-//
-// The special option "--" may be used to delimit the end of the
-// options; EOF will be returned, and "--" (and everything after it)
-// will be skipped.
-//
-// RETURN VALUE
-// For option letters contained in the string optstring, getopt
-// will return the option letter. getopt returns a question mark (?)
-// when it encounters an option letter not included in optstring.
-// EOF is returned when processing is finished.
-//
-// BUGS
-// 1) Long options are not supported.
-// 2) The GNU double-colon extension is not supported.
-// 3) The environment variable POSIXLY_CORRECT is not supported.
-// 4) The + syntax is not supported.
-// 5) The automatic permutation of arguments is not supported.
-// 6) This implementation of getopt() returns EOF if an error is
-// encountered, instead of -1 as the latest standard requires.
-//
-// EXAMPLE
-// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
-// {
-// int c;
-//
-// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
-// {
-// switch (c)
-// {
-// case ('a'):
-// TRACE(("option a\n"));
-// //
-// // set some flag here
-// //
-// break;
-//
-// case ('B'):
-// TRACE( ("option B\n"));
-// //
-// // set some other flag here
-// //
-// break;
-//
-// case ('n'):
-// TRACE(("option n: value=%d\n"), atoi(optarg));
-// //
-// // do something with value here
-// //
-// break;
-//
-// case ('?'):
-// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
-// return FALSE;
-// break;
-//
-// default:
-// TRACE(("WARNING: no handler for option %c\n"), c);
-// return FALSE;
-// break;
-// }
-// }
-// //
-// // check for non-option args here
-// //
-// return TRUE;
-// }
-//
-///////////////////////////////////////////////////////////////////////////////
-
-char *optarg; // global argument pointer
-int optind = 0; // global argv index
-
-int getopt(int argc, char *argv[], char *optstring)
-{
- static char *next = NULL;
- if (optind == 0)
- next = NULL;
-
- optarg = NULL;
-
- if (next == NULL || *next =='\0') {
- if (optind == 0)
- optind++;
-
- if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
- optarg = NULL;
- if (optind < argc)
- optarg = argv[optind];
- return EOF;
- }
-
- if (strcmp(argv[optind], "--") == 0) {
- optind++;
- optarg = NULL;
- if (optind < argc)
- optarg = argv[optind];
- return EOF;
- }
-
- next = argv[optind];
- next++; // skip past -
- optind++;
- }
-
- char c = *next++;
- char *cp = strchr(optstring, c);
-
- if (cp == NULL || c == (':'))
- return ('?');
-
- cp++;
- if (*cp == (':')) {
- if (*next != ('\0')) {
- optarg = next;
- next = NULL;
- } else if (optind < argc) {
- optarg = argv[optind];
- optind++;
- } else {
- return ('?');
- }
- }
-
- return c;
-}
-
-// for an overview, see
-// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
-double lgamma(int x)
-{
- // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
- if (x <= 2) {
- return 0.0;
- }
- static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
- double tmp=(double)x+5.5;
- tmp -= (((double)x)+0.5)*log(tmp);
- double y=(double)x;
- double sum = 1.000000000190015;
- for (size_t j=0; j<6; ++j) {
- sum += coefs[j]/++y;
- }
- return -tmp+log(2.5066282746310005*sum/(double)x);
-}
+// XGetopt.cpp Version 1.2
+//
+// Author: Hans Dietrich
+// hdietrich2@hotmail.com
+//
+// Description:
+// XGetopt.cpp implements getopt(), a function to parse command lines.
+//
+// History
+// Version 1.2 - 2003 May 17
+// - Added Unicode support
+//
+// Version 1.1 - 2002 March 10
+// - Added example to XGetopt.cpp module header
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty. I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are using precompiled headers then include this line:
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are not using precompiled headers then include these lines:
+//#include <windows.h>
+//#include <cstdio>
+//#include <tchar.h>
+///////////////////////////////////////////////////////////////////////////////
+
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "WIN32_functions.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// X G e t o p t . c p p
+//
+//
+// NAME
+// getopt -- parse command line options
+//
+// SYNOPSIS
+// int getopt(int argc, char *argv[], char *optstring)
+//
+// extern char *optarg;
+// extern int optind;
+//
+// DESCRIPTION
+// The getopt() function parses the command line arguments. Its
+// arguments argc and argv are the argument count and array as
+// passed into the application on program invocation. In the case
+// of Visual C++ programs, argc and argv are available via the
+// variables __argc and __argv (double underscores), respectively.
+// getopt returns the next option letter in argv that matches a
+// letter in optstring. (Note: Unicode programs should use
+// __targv instead of __argv. Also, all character and string
+// literals should be enclosed in ( ) ).
+//
+// optstring is a string of recognized option letters; if a letter
+// is followed by a colon, the option is expected to have an argument
+// that may or may not be separated from it by white space. optarg
+// is set to point to the start of the option argument on return from
+// getopt.
+//
+// Option letters may be combined, e.g., "-ab" is equivalent to
+// "-a -b". Option letters are case sensitive.
+//
+// getopt places in the external variable optind the argv index
+// of the next argument to be processed. optind is initialized
+// to 0 before the first call to getopt.
+//
+// When all options have been processed (i.e., up to the first
+// non-option argument), getopt returns EOF, optarg will point
+// to the argument, and optind will be set to the argv index of
+// the argument. If there are no non-option arguments, optarg
+// will be set to NULL.
+//
+// The special option "--" may be used to delimit the end of the
+// options; EOF will be returned, and "--" (and everything after it)
+// will be skipped.
+//
+// RETURN VALUE
+// For option letters contained in the string optstring, getopt
+// will return the option letter. getopt returns a question mark (?)
+// when it encounters an option letter not included in optstring.
+// EOF is returned when processing is finished.
+//
+// BUGS
+// 1) Long options are not supported.
+// 2) The GNU double-colon extension is not supported.
+// 3) The environment variable POSIXLY_CORRECT is not supported.
+// 4) The + syntax is not supported.
+// 5) The automatic permutation of arguments is not supported.
+// 6) This implementation of getopt() returns EOF if an error is
+// encountered, instead of -1 as the latest standard requires.
+//
+// EXAMPLE
+// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
+// {
+// int c;
+//
+// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
+// {
+// switch (c)
+// {
+// case ('a'):
+// TRACE(("option a\n"));
+// //
+// // set some flag here
+// //
+// break;
+//
+// case ('B'):
+// TRACE( ("option B\n"));
+// //
+// // set some other flag here
+// //
+// break;
+//
+// case ('n'):
+// TRACE(("option n: value=%d\n"), atoi(optarg));
+// //
+// // do something with value here
+// //
+// break;
+//
+// case ('?'):
+// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
+// return FALSE;
+// break;
+//
+// default:
+// TRACE(("WARNING: no handler for option %c\n"), c);
+// return FALSE;
+// break;
+// }
+// }
+// //
+// // check for non-option args here
+// //
+// return TRUE;
+// }
+//
+///////////////////////////////////////////////////////////////////////////////
+
+char *optarg; // global argument pointer
+int optind = 0; // global argv index
+
+int getopt(int argc, char *argv[], char *optstring)
+{
+ static char *next = NULL;
+ if (optind == 0)
+ next = NULL;
+
+ optarg = NULL;
+
+ if (next == NULL || *next =='\0') {
+ if (optind == 0)
+ optind++;
+
+ if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
+ optarg = NULL;
+ if (optind < argc)
+ optarg = argv[optind];
+ return EOF;
+ }
+
+ if (strcmp(argv[optind], "--") == 0) {
+ optind++;
+ optarg = NULL;
+ if (optind < argc)
+ optarg = argv[optind];
+ return EOF;
+ }
+
+ next = argv[optind];
+ next++; // skip past -
+ optind++;
+ }
+
+ char c = *next++;
+ char *cp = strchr(optstring, c);
+
+ if (cp == NULL || c == (':'))
+ return ('?');
+
+ cp++;
+ if (*cp == (':')) {
+ if (*next != ('\0')) {
+ optarg = next;
+ next = NULL;
+ } else if (optind < argc) {
+ optarg = argv[optind];
+ optind++;
+ } else {
+ return ('?');
+ }
+ }
+
+ return c;
+}
+
+// for an overview, see
+// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
+double lgamma(int x)
+{
+ // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
+ if (x <= 2) {
+ return 0.0;
+ }
+ static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
+ double tmp=(double)x+5.5;
+ tmp -= (((double)x)+0.5)*log(tmp);
+ double y=(double)x;
+ double sum = 1.000000000190015;
+ for (size_t j=0; j<6; ++j) {
+ sum += coefs[j]/++y;
+ }
+ return -tmp+log(2.5066282746310005*sum/(double)x);
+}
diff --git a/contrib/relent-filter/sigtest-filter/WIN32_functions.h b/contrib/relent-filter/sigtest-filter/WIN32_functions.h
index 6a719392e..ad644018b 100755
--- a/contrib/relent-filter/sigtest-filter/WIN32_functions.h
+++ b/contrib/relent-filter/sigtest-filter/WIN32_functions.h
@@ -1,24 +1,24 @@
-// XGetopt.h Version 1.2
-//
-// Author: Hans Dietrich
-// hdietrich2@hotmail.com
-//
-// This software is released into the public domain.
-// You are free to use it in any way you like.
-//
-// This software is provided "as is" with no expressed
-// or implied warranty. I accept no liability for any
-// damage or loss of business that this software may cause.
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef XGETOPT_H
-#define XGETOPT_H
-
-extern int optind, opterr;
-extern char *optarg;
-
-int getopt(int argc, char *argv[], char *optstring);
-double lgamma(int x);
-
-#endif //XGETOPT_H
+// XGetopt.h Version 1.2
+//
+// Author: Hans Dietrich
+// hdietrich2@hotmail.com
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty. I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef XGETOPT_H
+#define XGETOPT_H
+
+extern int optind, opterr;
+extern char *optarg;
+
+int getopt(int argc, char *argv[], char *optstring);
+double lgamma(int x);
+
+#endif //XGETOPT_H
diff --git a/contrib/relent-filter/sigtest-filter/filter-pt.cpp b/contrib/relent-filter/sigtest-filter/filter-pt.cpp
index 4a51953ea..e2408900d 100755
--- a/contrib/relent-filter/sigtest-filter/filter-pt.cpp
+++ b/contrib/relent-filter/sigtest-filter/filter-pt.cpp
@@ -1,5 +1,5 @@
-#include <cstring>
+#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
diff --git a/contrib/relent-filter/src/IOWrapper.cpp b/contrib/relent-filter/src/IOWrapper.cpp
index 053735c96..7ad7697ce 100755
--- a/contrib/relent-filter/src/IOWrapper.cpp
+++ b/contrib/relent-filter/src/IOWrapper.cpp
@@ -234,13 +234,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
{
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
AlignVec alignments = ai.GetSortedAlignments();
-
+
AlignVec::const_iterator it;
for (it = alignments.begin(); it != alignments.end(); ++it) {
const std::pair<size_t,size_t> &alignment = **it;
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
}
-
+
}
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
@@ -251,7 +251,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
const Hypothesis &edge = *edges[currEdge];
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
-
+
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
targetOffset += tp.GetSize();
@@ -263,7 +263,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
{
ostringstream out;
OutputAlignment(out, edges);
-
+
collector->Write(lineNo,out.str());
}
@@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
const int sourceOffset = sourceRange.GetStartPos();
const int targetOffset = targetRange.GetStartPos();
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
-
+
OutputAlignment(out, ai, sourceOffset, targetOffset);
}
diff --git a/contrib/relent-filter/src/Main.cpp b/contrib/relent-filter/src/Main.cpp
index 1f86e2cc7..6a2bf4b01 100755
--- a/contrib/relent-filter/src/Main.cpp
+++ b/contrib/relent-filter/src/Main.cpp
@@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "RelativeEntropyCalc.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
+#include "util/random.hh"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@@ -167,18 +168,18 @@ static void ShowWeights()
int main(int argc, char** argv)
{
try {
-
+
// echo command line, if verbose
IFVERBOSE(1) {
TRACE_ERR("command: ");
for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
TRACE_ERR(endl);
}
-
+
// set number of significant decimals in output
fix(cout,PRECISION);
fix(cerr,PRECISION);
-
+
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
Parameter* params = new Parameter();
@@ -186,34 +187,34 @@ int main(int argc, char** argv)
params->Explain();
exit(1);
}
-
-
+
+
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params, argv[0])) {
exit(1);
}
-
+
// setting "-show-weights" -> just dump out weights and exit
if (params->isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
-
+
// shorthand for accessing information in StaticData
const StaticData& staticData = StaticData::Instance();
-
-
+
+
//initialise random numbers
- srand(time(NULL));
-
+ rand_init();
+
// set up read/writing class
IOWrapper* ioWrapper = GetIOWrapper(staticData);
if (!ioWrapper) {
cerr << "Error; Failed to create IO object" << endl;
exit(1);
}
-
+
// check on weights
vector<float> weights = staticData.GetAllWeights();
IFVERBOSE(2) {
@@ -232,7 +233,7 @@ int main(int argc, char** argv)
// setting lexicalized reordering setup
PhraseBasedReorderingState::m_useFirstBackwardScore = false;
-
+
auto_ptr<OutputCollector> outputCollector;
outputCollector.reset(new OutputCollector());
@@ -240,7 +241,7 @@ int main(int argc, char** argv)
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
#endif
-
+
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = 0;
@@ -258,11 +259,11 @@ int main(int argc, char** argv)
task->Run();
delete task;
#endif
-
+
source = NULL; //make sure it doesn't get deleted
++lineCount;
}
-
+
// we are done, finishing up
#ifdef WITH_THREADS
pool.Stop(true); //flush remaining jobs
diff --git a/contrib/relent-filter/src/RelativeEntropyCalc.cpp b/contrib/relent-filter/src/RelativeEntropyCalc.cpp
index 212eedf87..9ba334fca 100755
--- a/contrib/relent-filter/src/RelativeEntropyCalc.cpp
+++ b/contrib/relent-filter/src/RelativeEntropyCalc.cpp
@@ -70,7 +70,7 @@ namespace MosesCmd
if (neg_log_div > 100){
return 100;
}
- return neg_log_div;
+ return neg_log_div;
}
void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){
diff --git a/contrib/relent-filter/src/TranslationAnalysis.cpp b/contrib/relent-filter/src/TranslationAnalysis.cpp
index 89da48301..eb5f36293 100755
--- a/contrib/relent-filter/src/TranslationAnalysis.cpp
+++ b/contrib/relent-filter/src/TranslationAnalysis.cpp
@@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
}
}
}
-
+
bool epsilon = false;
if (target == "") {
target="<EPSILON>";
diff --git a/contrib/rephraser/paraphrase.cpp b/contrib/rephraser/paraphrase.cpp
index ad9dbc891..0556d6ccd 100644
--- a/contrib/rephraser/paraphrase.cpp
+++ b/contrib/rephraser/paraphrase.cpp
@@ -60,12 +60,12 @@ static void add(const string& e, const vector<float> scores,
static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) {
//cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl;
- for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
+ for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
e1_iter != p_f_given_e.end(); ++e1_iter) {
for (Probs::const_iterator e2_iter = p_e_given_f.begin() ;
e2_iter != p_e_given_f.end(); ++e2_iter) {
- if (e1_iter->second == e2_iter->second) continue;
+ if (e1_iter->second == e2_iter->second) continue;
cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " <<
e1_iter->first * e2_iter->first << " ||| " << endl;
}
diff --git a/contrib/server/Jamfile b/contrib/server/Jamfile
index 06624514b..048e540b0 100644
--- a/contrib/server/Jamfile
+++ b/contrib/server/Jamfile
@@ -11,7 +11,7 @@ else
{
with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
- echo Bulding mosesserver. ;
+ echo While building mosesserver ... ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 30c0d4299..337962aa6 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -3,10 +3,10 @@
// The separate moses server executable is being phased out.
// Since there were problems with the migration into the main
// executable, this separate program is still included in the
-// distribution for legacy reasons. Contributors are encouraged
-// to add their contributions to moses/server rather than
+// distribution for legacy reasons. Contributors are encouraged
+// to add their contributions to moses/server rather than
// contrib/server. This recommendation does not apply to wrapper
-// scripts.
+// scripts.
// The future is this:
/** main function of the command line version of the decoder **/
@@ -37,6 +37,7 @@ int main(int argc, char** argv)
#include "moses/Manager.h"
#include "moses/StaticData.h"
#include "moses/ThreadPool.h"
+#include "moses/TranslationTask.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#if PT_UG
@@ -82,7 +83,7 @@ public:
pdsa->add(source_,target_,alignment_);
#else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
- PhraseDictionaryDynSuffixArray*
+ PhraseDictionaryDynSuffixArray*
pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
@@ -145,7 +146,7 @@ public:
}
}
*/
-
+
void breakOutParams(const params_t& params) {
params_t::const_iterator si = params.find("source");
if(si == params.end())
@@ -232,33 +233,43 @@ public:
/**
* Required so that translations can be sent to a thread pool.
**/
-class TranslationTask : public virtual Moses::Task {
-public:
+class TranslationTask : public virtual Moses::TranslationTask {
+protected:
TranslationTask(xmlrpc_c::paramList const& paramList,
- boost::condition_variable& cond, boost::mutex& mut)
+ boost::condition_variable& cond, boost::mutex& mut)
: m_paramList(paramList),
m_cond(cond),
m_mut(mut),
m_done(false)
{}
+public:
+ static boost::shared_ptr<TranslationTask>
+ create(xmlrpc_c::paramList const& paramList,
+ boost::condition_variable& cond, boost::mutex& mut)
+ {
+ boost::shared_ptr<TranslationTask> ret(new TranslationTask(paramList, cond, mut));
+ ret->m_self = ret;
+ return ret;
+ }
+
virtual bool DeleteAfterExecution() {return false;}
bool IsDone() const {return m_done;}
const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
- virtual void Run() {
-
+ virtual void
+ Run()
+ {
+ using namespace xmlrpc_c;
const params_t params = m_paramList.getStruct(0);
m_paramList.verifyEnd(1);
params_t::const_iterator si = params.find("text");
if (si == params.end()) {
- throw xmlrpc_c::fault(
- "Missing source text",
- xmlrpc_c::fault::CODE_PARSE);
+ throw fault("Missing source text", fault::CODE_PARSE);
}
- const string source((xmlrpc_c::value_string(si->second)));
+ const string source = value_string(si->second);
XVERBOSE(1,"Input: " << source << endl);
si = params.find("align");
@@ -272,7 +283,7 @@ public:
si = params.find("report-all-factors");
bool reportAllFactors = (si != params.end());
si = params.find("nbest");
- int nbest_size = (si == params.end()) ? 0 : int(xmlrpc_c::value_int(si->second));
+ int nbest_size = (si == params.end()) ? 0 : int(value_int(si->second));
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
@@ -281,20 +292,24 @@ public:
vector<float> multiModelWeights;
si = params.find("lambda");
- if (si != params.end()) {
- xmlrpc_c::value_array multiModelArray = xmlrpc_c::value_array(si->second);
- vector<xmlrpc_c::value> multiModelValueVector(multiModelArray.vectorValueValue());
- for (size_t i=0;i < multiModelValueVector.size();i++) {
- multiModelWeights.push_back(xmlrpc_c::value_double(multiModelValueVector[i]));
- }
- }
+ if (si != params.end())
+ {
+ value_array multiModelArray = value_array(si->second);
+ vector<value> multiModelValueVector(multiModelArray.vectorValueValue());
+ for (size_t i=0;i < multiModelValueVector.size();i++)
+ {
+ multiModelWeights.push_back(value_double(multiModelValueVector[i]));
+ }
+ }
si = params.find("model_name");
- if (si != params.end() && multiModelWeights.size() > 0) {
- const string model_name = xmlrpc_c::value_string(si->second);
- PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
+ if (si != params.end() && multiModelWeights.size() > 0)
+ {
+ const string model_name = value_string(si->second);
+ PhraseDictionaryMultiModel* pdmm
+ = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
- }
+ }
const StaticData &staticData = StaticData::Instance();
@@ -306,13 +321,14 @@ public:
stringstream out, graphInfo, transCollOpts;
- if (staticData.IsSyntax()) {
- TreeInput tinput;
- const vector<FactorType>&
- inputFactorOrder = staticData.GetInputFactorOrder();
- stringstream in(source + "\n");
- tinput.Read(in,inputFactorOrder);
- ChartManager manager(tinput);
+ if (staticData.IsSyntax())
+ {
+ boost::shared_ptr<TreeInput> tinput(new TreeInput);
+ const vector<FactorType>& IFO = staticData.GetInputFactorOrder();
+ istringstream in(source + "\n");
+ tinput->Read(in,IFO);
+ ttasksptr task = Moses::TranslationTask::create(tinput);
+ ChartManager manager(task);
manager.Decode();
const ChartHypothesis *hypo = manager.GetBestHypothesis();
outputChartHypo(out,hypo);
@@ -320,57 +336,50 @@ public:
// const size_t translationId = tinput.GetTranslationId();
std::ostringstream sgstream;
manager.OutputSearchGraphMoses(sgstream);
- m_retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
+ m_retData["sg"] = value_string(sgstream.str());
}
- } else {
- size_t lineNumber = 0; // TODO: Include sentence request number here?
- Sentence sentence;
- sentence.SetTranslationId(lineNumber);
-
- const vector<FactorType> &
- inputFactorOrder = staticData.GetInputFactorOrder();
- stringstream in(source + "\n");
- sentence.Read(in,inputFactorOrder);
- Manager manager(sentence);
- manager.Decode();
+ }
+ else
+ {
+ // size_t lineNumber = 0; // TODO: Include sentence request number here?
+ boost::shared_ptr<Sentence> sentence(new Sentence(0,source));
+ ttasksptr task = Moses::TranslationTask::create(sentence);
+ Manager manager(task);
+ manager.Decode();
const Hypothesis* hypo = manager.GetBestHypothesis();
vector<xmlrpc_c::value> alignInfo;
outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
- if (addAlignInfo) {
- m_retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
- }
- if (addWordAlignInfo) {
- stringstream wordAlignment;
- hypo->OutputAlignment(wordAlignment);
- vector<xmlrpc_c::value> alignments;
- string alignmentPair;
- while (wordAlignment >> alignmentPair) {
+ if (addAlignInfo) m_retData["align"] = value_array(alignInfo);
+ if (addWordAlignInfo)
+ {
+ stringstream wordAlignment;
+ hypo->OutputAlignment(wordAlignment);
+ vector<xmlrpc_c::value> alignments;
+ string alignmentPair;
+ while (wordAlignment >> alignmentPair)
+ {
int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo;
- wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
- wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
- alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
- }
- m_retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
- }
-
- if (addGraphInfo) {
- insertGraphInfo(manager,m_retData);
- }
- if (addTopts) {
- insertTranslationOptions(manager,m_retData);
- }
- if (nbest_size>0) {
- outputNBest(manager, m_retData, nbest_size, nbest_distinct,
- reportAllFactors, addAlignInfo, addScoreBreakdown);
- }
+ wordAlignInfo["source-word"]
+ = value_int(atoi(alignmentPair.substr(0, pos).c_str()));
+ wordAlignInfo["target-word"]
+ = value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
+ alignments.push_back(value_struct(wordAlignInfo));
+ }
+ m_retData["word-align"] = value_array(alignments);
+ }
+
+ if (addGraphInfo) insertGraphInfo(manager,m_retData);
+ if (addTopts) insertTranslationOptions(manager,m_retData);
+ if (nbest_size > 0)
+ {
+ outputNBest(manager, m_retData, nbest_size, nbest_distinct,
+ reportAllFactors, addAlignInfo, addScoreBreakdown);
+ }
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
-
- }
- pair<string, xmlrpc_c::value>
- text("text", xmlrpc_c::value_string(out.str()));
- m_retData.insert(text);
+ }
+ m_retData["text"] = value_string(out.str());
XVERBOSE(1,"Output: " << out.str() << endl);
{
boost::lock_guard<boost::mutex> lock(m_mut);
@@ -380,9 +389,12 @@ public:
}
- void outputHypo(ostream& out, const Hypothesis* hypo, bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo, bool reportAllFactors = false) {
+ void outputHypo(ostream& out, const Hypothesis* hypo,
+ bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
+ bool reportAllFactors = false) {
if (hypo->GetPrevHypo() != NULL) {
- outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo, alignInfo, reportAllFactors);
+ outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
+ alignInfo, reportAllFactors);
Phrase p = hypo->GetCurrTargetPhrase();
if(reportAllFactors) {
out << p << " ";
@@ -524,7 +536,7 @@ public:
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
- path.GetScoreBreakdown().OutputAllFeatureScores(buf);
+ path.GetScoreBreakdown()->OutputAllFeatureScores(buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
@@ -535,14 +547,14 @@ public:
retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
}
- void
- insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
+ void
+ insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
{
const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
vector<xmlrpc_c::value> toptsXml;
size_t const stop = toptsColl->GetSource().GetSize();
TranslationOptionList const* tol;
- for (size_t s = 0 ; s < stop ; ++s)
+ for (size_t s = 0 ; s < stop ; ++s)
{
for (size_t e = s; (tol = toptsColl->GetTranslationOptionList(s,e)) != NULL; ++e)
{
@@ -557,11 +569,11 @@ public:
toptXml["start"] = xmlrpc_c::value_int(s);
toptXml["end"] = xmlrpc_c::value_int(e);
vector<xmlrpc_c::value> scoresXml;
- const std::valarray<FValue> &scores
+ const std::valarray<FValue> &scores
= topt->GetScoreBreakdown().getCoreFeatures();
- for (size_t j = 0; j < scores.size(); ++j)
+ for (size_t j = 0; j < scores.size(); ++j)
scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
-
+
toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
}
@@ -569,7 +581,7 @@ public:
}
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
}
-
+
private:
xmlrpc_c::paramList const& m_paramList;
map<string, xmlrpc_c::value> m_retData;
@@ -595,7 +607,7 @@ public:
boost::condition_variable cond;
boost::mutex mut;
typedef ::TranslationTask TTask;
- boost::shared_ptr<TTask> task(new TTask(paramList,cond,mut));
+ boost::shared_ptr<TTask> task = TTask::create(paramList,cond,mut);
m_threadPool.Submit(task);
boost::unique_lock<boost::mutex> lock(mut);
while (!task->IsDone()) {
@@ -607,8 +619,8 @@ private:
Moses::ThreadPool m_threadPool;
};
-static
-void
+static
+void
PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
{
out << ff->GetScoreProducerDescription() << "=";
@@ -620,16 +632,16 @@ PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
out << endl;
}
-static
-void
+static
+void
ShowWeights(ostream& out)
{
// adapted from moses-cmd/Main.cpp
std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
size_t old_precision = out.precision(6);
- const vector<const StatelessFeatureFunction*>&
+ const vector<const StatelessFeatureFunction*>&
slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>&
+ const vector<const StatefulFeatureFunction*>&
sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < sff.size(); ++i) {
@@ -650,7 +662,7 @@ ShowWeights(ostream& out)
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
}
}
- if (! (old_flags & std::ios::fixed))
+ if (! (old_flags & std::ios::fixed))
out.unsetf(std::ios::fixed);
out.precision(old_precision);
}
@@ -742,7 +754,7 @@ int main(int argc, char** argv)
.allowOrigin("*")
);
*/
-
+
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {
while(1) myAbyssServer.runOnce();
diff --git a/contrib/sigtest-filter/WIN32_functions.cpp b/contrib/sigtest-filter/WIN32_functions.cpp
index cfc15d592..989c727a2 100644
--- a/contrib/sigtest-filter/WIN32_functions.cpp
+++ b/contrib/sigtest-filter/WIN32_functions.cpp
@@ -1,231 +1,231 @@
-// XGetopt.cpp Version 1.2
-//
-// Author: Hans Dietrich
-// hdietrich2@hotmail.com
-//
-// Description:
-// XGetopt.cpp implements getopt(), a function to parse command lines.
-//
-// History
-// Version 1.2 - 2003 May 17
-// - Added Unicode support
-//
-// Version 1.1 - 2002 March 10
-// - Added example to XGetopt.cpp module header
-//
-// This software is released into the public domain.
-// You are free to use it in any way you like.
-//
-// This software is provided "as is" with no expressed
-// or implied warranty. I accept no liability for any
-// damage or loss of business that this software may cause.
-//
-///////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////
-// if you are using precompiled headers then include this line:
-///////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////
-// if you are not using precompiled headers then include these lines:
-//#include <windows.h>
-//#include <cstdio>
-//#include <tchar.h>
-///////////////////////////////////////////////////////////////////////////////
-
-
-#include <cstdio>
-#include <cstring>
-#include <cmath>
-#include "WIN32_functions.h"
-
-
-///////////////////////////////////////////////////////////////////////////////
-//
-// X G e t o p t . c p p
-//
-//
-// NAME
-// getopt -- parse command line options
-//
-// SYNOPSIS
-// int getopt(int argc, char *argv[], char *optstring)
-//
-// extern char *optarg;
-// extern int optind;
-//
-// DESCRIPTION
-// The getopt() function parses the command line arguments. Its
-// arguments argc and argv are the argument count and array as
-// passed into the application on program invocation. In the case
-// of Visual C++ programs, argc and argv are available via the
-// variables __argc and __argv (double underscores), respectively.
-// getopt returns the next option letter in argv that matches a
-// letter in optstring. (Note: Unicode programs should use
-// __targv instead of __argv. Also, all character and string
-// literals should be enclosed in ( ) ).
-//
-// optstring is a string of recognized option letters; if a letter
-// is followed by a colon, the option is expected to have an argument
-// that may or may not be separated from it by white space. optarg
-// is set to point to the start of the option argument on return from
-// getopt.
-//
-// Option letters may be combined, e.g., "-ab" is equivalent to
-// "-a -b". Option letters are case sensitive.
-//
-// getopt places in the external variable optind the argv index
-// of the next argument to be processed. optind is initialized
-// to 0 before the first call to getopt.
-//
-// When all options have been processed (i.e., up to the first
-// non-option argument), getopt returns EOF, optarg will point
-// to the argument, and optind will be set to the argv index of
-// the argument. If there are no non-option arguments, optarg
-// will be set to NULL.
-//
-// The special option "--" may be used to delimit the end of the
-// options; EOF will be returned, and "--" (and everything after it)
-// will be skipped.
-//
-// RETURN VALUE
-// For option letters contained in the string optstring, getopt
-// will return the option letter. getopt returns a question mark (?)
-// when it encounters an option letter not included in optstring.
-// EOF is returned when processing is finished.
-//
-// BUGS
-// 1) Long options are not supported.
-// 2) The GNU double-colon extension is not supported.
-// 3) The environment variable POSIXLY_CORRECT is not supported.
-// 4) The + syntax is not supported.
-// 5) The automatic permutation of arguments is not supported.
-// 6) This implementation of getopt() returns EOF if an error is
-// encountered, instead of -1 as the latest standard requires.
-//
-// EXAMPLE
-// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
-// {
-// int c;
-//
-// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
-// {
-// switch (c)
-// {
-// case ('a'):
-// TRACE(("option a\n"));
-// //
-// // set some flag here
-// //
-// break;
-//
-// case ('B'):
-// TRACE( ("option B\n"));
-// //
-// // set some other flag here
-// //
-// break;
-//
-// case ('n'):
-// TRACE(("option n: value=%d\n"), atoi(optarg));
-// //
-// // do something with value here
-// //
-// break;
-//
-// case ('?'):
-// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
-// return FALSE;
-// break;
-//
-// default:
-// TRACE(("WARNING: no handler for option %c\n"), c);
-// return FALSE;
-// break;
-// }
-// }
-// //
-// // check for non-option args here
-// //
-// return TRUE;
-// }
-//
-///////////////////////////////////////////////////////////////////////////////
-
-char *optarg; // global argument pointer
-int optind = 0; // global argv index
-
-int getopt(int argc, char *argv[], char *optstring)
-{
- static char *next = NULL;
- if (optind == 0)
- next = NULL;
-
- optarg = NULL;
-
- if (next == NULL || *next =='\0') {
- if (optind == 0)
- optind++;
-
- if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
- optarg = NULL;
- if (optind < argc)
- optarg = argv[optind];
- return EOF;
- }
-
- if (strcmp(argv[optind], "--") == 0) {
- optind++;
- optarg = NULL;
- if (optind < argc)
- optarg = argv[optind];
- return EOF;
- }
-
- next = argv[optind];
- next++; // skip past -
- optind++;
- }
-
- char c = *next++;
- char *cp = strchr(optstring, c);
-
- if (cp == NULL || c == (':'))
- return ('?');
-
- cp++;
- if (*cp == (':')) {
- if (*next != ('\0')) {
- optarg = next;
- next = NULL;
- } else if (optind < argc) {
- optarg = argv[optind];
- optind++;
- } else {
- return ('?');
- }
- }
-
- return c;
-}
-
-// for an overview, see
-// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
-double lgamma(int x)
-{
- // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
- if (x <= 2) {
- return 0.0;
- }
- static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
- double tmp=(double)x+5.5;
- tmp -= (((double)x)+0.5)*log(tmp);
- double y=(double)x;
- double sum = 1.000000000190015;
- for (size_t j=0; j<6; ++j) {
- sum += coefs[j]/++y;
- }
- return -tmp+log(2.5066282746310005*sum/(double)x);
-}
+// XGetopt.cpp Version 1.2
+//
+// Author: Hans Dietrich
+// hdietrich2@hotmail.com
+//
+// Description:
+// XGetopt.cpp implements getopt(), a function to parse command lines.
+//
+// History
+// Version 1.2 - 2003 May 17
+// - Added Unicode support
+//
+// Version 1.1 - 2002 March 10
+// - Added example to XGetopt.cpp module header
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty. I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are using precompiled headers then include this line:
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are not using precompiled headers then include these lines:
+//#include <windows.h>
+//#include <cstdio>
+//#include <tchar.h>
+///////////////////////////////////////////////////////////////////////////////
+
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "WIN32_functions.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// X G e t o p t . c p p
+//
+//
+// NAME
+// getopt -- parse command line options
+//
+// SYNOPSIS
+// int getopt(int argc, char *argv[], char *optstring)
+//
+// extern char *optarg;
+// extern int optind;
+//
+// DESCRIPTION
+// The getopt() function parses the command line arguments. Its
+// arguments argc and argv are the argument count and array as
+// passed into the application on program invocation. In the case
+// of Visual C++ programs, argc and argv are available via the
+// variables __argc and __argv (double underscores), respectively.
+// getopt returns the next option letter in argv that matches a
+// letter in optstring. (Note: Unicode programs should use
+// __targv instead of __argv. Also, all character and string
+// literals should be enclosed in ( ) ).
+//
+// optstring is a string of recognized option letters; if a letter
+// is followed by a colon, the option is expected to have an argument
+// that may or may not be separated from it by white space. optarg
+// is set to point to the start of the option argument on return from
+// getopt.
+//
+// Option letters may be combined, e.g., "-ab" is equivalent to
+// "-a -b". Option letters are case sensitive.
+//
+// getopt places in the external variable optind the argv index
+// of the next argument to be processed. optind is initialized
+// to 0 before the first call to getopt.
+//
+// When all options have been processed (i.e., up to the first
+// non-option argument), getopt returns EOF, optarg will point
+// to the argument, and optind will be set to the argv index of
+// the argument. If there are no non-option arguments, optarg
+// will be set to NULL.
+//
+// The special option "--" may be used to delimit the end of the
+// options; EOF will be returned, and "--" (and everything after it)
+// will be skipped.
+//
+// RETURN VALUE
+// For option letters contained in the string optstring, getopt
+// will return the option letter. getopt returns a question mark (?)
+// when it encounters an option letter not included in optstring.
+// EOF is returned when processing is finished.
+//
+// BUGS
+// 1) Long options are not supported.
+// 2) The GNU double-colon extension is not supported.
+// 3) The environment variable POSIXLY_CORRECT is not supported.
+// 4) The + syntax is not supported.
+// 5) The automatic permutation of arguments is not supported.
+// 6) This implementation of getopt() returns EOF if an error is
+// encountered, instead of -1 as the latest standard requires.
+//
+// EXAMPLE
+// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
+// {
+// int c;
+//
+// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
+// {
+// switch (c)
+// {
+// case ('a'):
+// TRACE(("option a\n"));
+// //
+// // set some flag here
+// //
+// break;
+//
+// case ('B'):
+// TRACE( ("option B\n"));
+// //
+// // set some other flag here
+// //
+// break;
+//
+// case ('n'):
+// TRACE(("option n: value=%d\n"), atoi(optarg));
+// //
+// // do something with value here
+// //
+// break;
+//
+// case ('?'):
+// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
+// return FALSE;
+// break;
+//
+// default:
+// TRACE(("WARNING: no handler for option %c\n"), c);
+// return FALSE;
+// break;
+// }
+// }
+// //
+// // check for non-option args here
+// //
+// return TRUE;
+// }
+//
+///////////////////////////////////////////////////////////////////////////////
+
+char *optarg; // global argument pointer
+int optind = 0; // global argv index
+
+int getopt(int argc, char *argv[], char *optstring)
+{
+ static char *next = NULL;
+ if (optind == 0)
+ next = NULL;
+
+ optarg = NULL;
+
+ if (next == NULL || *next =='\0') {
+ if (optind == 0)
+ optind++;
+
+ if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
+ optarg = NULL;
+ if (optind < argc)
+ optarg = argv[optind];
+ return EOF;
+ }
+
+ if (strcmp(argv[optind], "--") == 0) {
+ optind++;
+ optarg = NULL;
+ if (optind < argc)
+ optarg = argv[optind];
+ return EOF;
+ }
+
+ next = argv[optind];
+ next++; // skip past -
+ optind++;
+ }
+
+ char c = *next++;
+ char *cp = strchr(optstring, c);
+
+ if (cp == NULL || c == (':'))
+ return ('?');
+
+ cp++;
+ if (*cp == (':')) {
+ if (*next != ('\0')) {
+ optarg = next;
+ next = NULL;
+ } else if (optind < argc) {
+ optarg = argv[optind];
+ optind++;
+ } else {
+ return ('?');
+ }
+ }
+
+ return c;
+}
+
+// for an overview, see
+// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
+double lgamma(int x)
+{
+ // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
+ if (x <= 2) {
+ return 0.0;
+ }
+ static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
+ double tmp=(double)x+5.5;
+ tmp -= (((double)x)+0.5)*log(tmp);
+ double y=(double)x;
+ double sum = 1.000000000190015;
+ for (size_t j=0; j<6; ++j) {
+ sum += coefs[j]/++y;
+ }
+ return -tmp+log(2.5066282746310005*sum/(double)x);
+}
diff --git a/contrib/sigtest-filter/WIN32_functions.h b/contrib/sigtest-filter/WIN32_functions.h
index 6a719392e..ad644018b 100644
--- a/contrib/sigtest-filter/WIN32_functions.h
+++ b/contrib/sigtest-filter/WIN32_functions.h
@@ -1,24 +1,24 @@
-// XGetopt.h Version 1.2
-//
-// Author: Hans Dietrich
-// hdietrich2@hotmail.com
-//
-// This software is released into the public domain.
-// You are free to use it in any way you like.
-//
-// This software is provided "as is" with no expressed
-// or implied warranty. I accept no liability for any
-// damage or loss of business that this software may cause.
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef XGETOPT_H
-#define XGETOPT_H
-
-extern int optind, opterr;
-extern char *optarg;
-
-int getopt(int argc, char *argv[], char *optstring);
-double lgamma(int x);
-
-#endif //XGETOPT_H
+// XGetopt.h Version 1.2
+//
+// Author: Hans Dietrich
+// hdietrich2@hotmail.com
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty. I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef XGETOPT_H
+#define XGETOPT_H
+
+extern int optind, opterr;
+extern char *optarg;
+
+int getopt(int argc, char *argv[], char *optstring);
+double lgamma(int x);
+
+#endif //XGETOPT_H
diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp
index bd0b9ae36..50418d502 100644
--- a/contrib/sigtest-filter/filter-pt.cpp
+++ b/contrib/sigtest-filter/filter-pt.cpp
@@ -1,5 +1,5 @@
-#include <cstring>
+#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
@@ -14,7 +14,7 @@
#include <set>
#include <boost/thread/tss.hpp>
-#include <boost/thread.hpp>
+#include <boost/thread.hpp>
#include <boost/unordered_map.hpp>
#ifdef WIN32
@@ -58,9 +58,9 @@ typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
class Cache {
typedef std::pair<SentIdSet, clock_t> ClockedSet;
typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
-
+
public:
-
+
SentIdSet get(const std::string& phrase) {
boost::shared_lock<boost::shared_mutex> lock(m_mutex);
if(m_cont.count(phrase)) {
@@ -70,27 +70,27 @@ class Cache {
}
return SentIdSet( new SentIdSet::element_type() );
}
-
+
void put(const std::string& phrase, const SentIdSet set) {
boost::unique_lock<boost::shared_mutex> lock(m_mutex);
m_cont[phrase] = std::make_pair(set, clock());
}
-
+
static void set_max_cache(size_t max_cache) {
s_max_cache = max_cache;
}
-
+
void prune() {
if(s_max_cache > 0) {
boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
if(m_cont.size() > s_max_cache) {
std::vector<clock_t> clocks;
- for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
+ for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
clocks.push_back(it->second.second);
-
+
std::sort(clocks.begin(), clocks.end());
clock_t out = clocks[m_cont.size() - s_max_cache];
-
+
boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
if(it->second.second < out)
@@ -98,7 +98,7 @@ class Cache {
}
}
}
-
+
private:
ClockedMap m_cont;
boost::shared_mutex m_mutex;
@@ -282,12 +282,12 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
i != locations.end(); ++i) {
ids->push_back(i->sentIdInCorpus);
}
-
+
std::sort(ids->begin(), ids->end());
SentIdSet::element_type::iterator it =
std::unique(ids->begin(), ids->end());
ids->resize(it - ids->begin());
-
+
if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
cache.put(phrase, ids);
}
@@ -295,8 +295,8 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases,
C_SuffixArraySearchApplicationBase & my_sa,
- const std::string & rule, Cache& cache)
-{
+ const std::string & rule, Cache& cache)
+{
if (phrases.size() == 1) {
lookup_phrase(ids, phrases.front(), my_sa, cache);
@@ -372,32 +372,32 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
delete *i;
options.erase(options.begin() + pfe_filter_limit,options.end());
}
-
+
if (pef_filter_only)
return;
-
+
if (options.empty())
return;
-
+
SentIdSet fset( new SentIdSet::element_type() );
find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache);
size_t cf = fset->size();
-
+
for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) {
const std::string& e_phrase = (*i)->e_phrase;
SentIdSet eset( new SentIdSet::element_type() );
find_occurrences(eset, e_phrase, e_sa, e_cache);
size_t ce = eset->size();
-
+
SentIdSet efset( new SentIdSet::element_type() );
ordered_set_intersect(efset, fset, eset);
size_t cef = efset->size();
-
+
double nlp = -log(fisher_exact(cef, cf, ce));
(*i)->set_cooc_stats(cef, cf, ce, nlp);
}
-
+
std::vector<PTEntry*>::iterator new_end =
std::remove_if(options.begin(), options.end(),
NlogSigThresholder(sig_filter_limit));
@@ -406,7 +406,7 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
}
void filter(std::istream* in, std::ostream* out, int pfe_index) {
-
+
std::vector<std::string> lines;
std::string prev = "";
std::vector<PTEntry*> options;
@@ -415,23 +415,23 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
boost::mutex::scoped_lock lock(in_mutex);
if(in->eof())
break;
-
+
lines.clear();
std::string line;
while(getline(*in, line) && lines.size() < 500000)
lines.push_back(line);
}
-
+
std::stringstream out_temp;
for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
size_t tmp_lines = ++pt_lines;
if(tmp_lines % 10000 == 0) {
boost::mutex::scoped_lock lock(err_mutex);
std::cerr << ".";
-
+
if(tmp_lines % 500000 == 0)
std::cerr << "[n:" << tmp_lines << "]\n";
-
+
if(tmp_lines % 10000000 == 0) {
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
@@ -446,30 +446,30 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
<< "------------------------------------------------------\n";
}
}
-
+
if(pt_lines % 10000 == 0) {
f_cache.prune();
e_cache.prune();
}
-
+
if(it->length() > 0) {
PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
if (prev != pp->f_phrase) {
prev = pp->f_phrase;
-
+
if (!options.empty()) { // always true after first line
compute_cooc_stats_and_filter(options, f_cache, e_cache);
}
-
+
for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) {
out_temp << **i << '\n';
delete *i;
}
-
+
options.clear();
options.push_back(pp);
-
+
} else {
options.push_back(pp);
}
@@ -479,7 +479,7 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
*out << out_temp.str() << std::flush;
}
compute_cooc_stats_and_filter(options, f_cache, e_cache);
-
+
boost::mutex::scoped_lock lock(out_mutex);
for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) {
@@ -512,11 +512,11 @@ int main(int argc, char * argv[])
pfe_filter_limit = atoi(optarg);
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
break;
- case 't':
+ case 't':
threads = atoi(optarg);
std::cerr << "Using threads: " << threads << std::endl;
break;
- case 'm':
+ case 'm':
max_cache = atoi(optarg);
std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
break;
@@ -548,13 +548,13 @@ int main(int argc, char * argv[])
usage();
}
}
-
+
if (sig_filter_limit == 0.0) pef_filter_only = true;
//-----------------------------------------------------------------------------
if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
usage();
}
-
+
//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
if (!pef_filter_only) {
e_sa.loadData_forSearch(efile, false, false);
@@ -582,15 +582,15 @@ int main(int argc, char * argv[])
Cache::set_max_cache(max_cache);
std::ios_base::sync_with_stdio(false);
-
+
boost::thread_group threadGroup;
- for(int i = 0; i < threads; i++)
+ for(int i = 0; i < threads; i++)
threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index));
threadGroup.join_all();
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
-
+
std::cerr << "\n\n------------------------------------------------------\n"
<< " unfiltered phrases pairs: " << pt_lines << "\n"
<< "\n"
@@ -599,5 +599,5 @@ int main(int argc, char * argv[])
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
<< "\n"
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
- << "------------------------------------------------------\n";
+ << "------------------------------------------------------\n";
}
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-archetypeset.h b/contrib/synlm/hhmm/rvtl/include/nl-archetypeset.h
index 914e85e92..342f10777 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-archetypeset.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-archetypeset.h
@@ -65,7 +65,7 @@ class Numbered : public T {
friend String& operator<< ( String& str, const Numbered<SD1,I,SD2,T,SD3>& rv ) { return str<<SD1<<rv.i<<SD2<<rv.getT()<<SD3; }
friend pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> operator>> ( StringInput ps, Numbered<SD1,I,SD2,T,SD3>& rv ) { return pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> delimbuff, const char* psPostDelim ) {
- return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
+ return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
: delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>SD3>>psPostDelim );
}
};
@@ -106,7 +106,7 @@ template<class V>
pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const V& v ) const {
//const Scored<typename V::ElementType,pair<int,SafePtr<const V> > > sipvDummy ( DBL_MAX );
//MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const V> > > > hsiv ( MapType::size()+1, sipvDummy );
- MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
+ MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
const_cast<MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >&> ( hsivCalc );
hsiv.clear();
@@ -120,7 +120,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d;
//hsiv.set(iNext).setScore() = v.getMarginalDistance ( hsiv.getMin().first, iUpper->second.second );
- ////int j =
+ ////int j =
hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++;
@@ -140,7 +140,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
typename V::ElementType d = v.getMarginalDistance ( ++hsiv.setMin().first, hsiv.getMin().second.getRef() );
hsiv.setMin().setScore() += d;
////cerr<<" matching ln"<<&hsiv.getMin().second.getRef()<<" i="<<hsiv.setMin().first<<" marg-dist="<<d<<" new-score="<<hsiv.getMin().getScore();
- ////int j =
+ ////int j =
hsiv.fixIncr(0);
////cerr<<" new-pos="<<j<<"\n";
////if(j!=0) for(int i=0;i<iNext;i++) cerr<<" "<<i<<": ln"<<hsiv.get(i).second.getRef().lineNum.toInt()<<" new-score="<<double(hsiv.get(i).getScore())<<"\n";
@@ -151,7 +151,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
hsiv.set(iNext).second = SafePtr<const NV> ( iUpper->second );
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d;
- ////int j =
+ ////int j =
hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++;
@@ -164,7 +164,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
hsiv.set(iNext).second = SafePtr<const NV> ( iLower->second );
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d;
- ////int j =
+ ////int j =
hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++;
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-array.h b/contrib/synlm/hhmm/rvtl/include/nl-array.h
index 0dfb74b44..6c6e1bb5f 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-array.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-array.h
@@ -27,7 +27,7 @@
#include <cassert>
#include <iostream>
-using namespace std;
+using namespace std;
////////////////////////////////////////////////////////////////////////////////
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-beam.h b/contrib/synlm/hhmm/rvtl/include/nl-beam.h
index 398babe21..817e96206 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-beam.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-beam.h
@@ -101,8 +101,8 @@ class Beam {
void write(FILE *pf){
/* for (typename BeamMap::const_iterator i = mkid.begin(); i != mkid.end(); i++){
i->first.write(pf);
- fprintf(pf, " %d ", i->second.first);
-// i->second.second.write(pf);
+ fprintf(pf, " %d ", i->second.first);
+// i->second.second.write(pf);
fprintf(pf, "\n");
}
*/
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-cpt.h b/contrib/synlm/hhmm/rvtl/include/nl-cpt.h
index a7c1a916c..dbfb947e3 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-cpt.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-cpt.h
@@ -394,7 +394,7 @@ class SimpleMap : public map<X,Y> {
private:
typedef map<X,Y> OrigMap;
static const Y yDummy;
-
+
public:
// Constructor / destructor methods...
SimpleMap ( ) : OrigMap() { }
@@ -899,7 +899,7 @@ class GenericHidVarCPTModel : public SimpleHash<K,typename Y::template ArrayDist
const typename Y::template ArrayDistrib<P>& getDistrib ( const K& k ) const {
return HKYP::get(k);
}
-
+
P& setProb ( const Y& y, const K& k ) {
pair<typename Y::BaseType,P>& yp = HKYP::set(k).add();
yp.first = y;
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-crf.h b/contrib/synlm/hhmm/rvtl/include/nl-crf.h
index 44744ad03..a9b233b23 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-crf.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-crf.h
@@ -36,7 +36,7 @@
//
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2>
+template <class Y,class X1,class X2>
class CRF3DModeledRV : public Y {
private:
@@ -90,7 +90,7 @@ template <class Y,class X1,class X2> SafeArray5D<Id<int>,int,int,int,int,float>
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2>
+template <class Y,class X1,class X2>
Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
@@ -131,7 +131,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
- // For each possible preceding trellis node...
+ // For each possible preceding trellis node...
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
// Add product of result and previous trellis cell to current trellis cell...
@@ -158,7 +158,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2>
+template <class Y,class X1,class X2>
bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
if ( 7==numFields )
setPotential ( X1(string(aps[1])), // globals
@@ -172,7 +172,7 @@ bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2>
+template <class Y,class X1,class X2>
void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
const X1& x1, const X2& x2, bool bObsVal ) const {
fprintf ( pf, "%04d> %s ", frame, psMdl );
@@ -199,7 +199,7 @@ void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, co
//
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2,class X3>
+template <class Y,class X1,class X2,class X3>
class CRF4DModeledRV : public Y {
private:
@@ -247,13 +247,13 @@ template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::c
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::cardCnd = 0;
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsVal = 0;
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsValSite = 0;
-template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
+template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
CRF4DModeledRV<Y,X1,X2,X3>::aaaaaPotentials;
/* template <class Y,class X1,class X2> SafeArray3D<int> CRF4DModeledRV<Y,X1,X2>::aaaCnds; */
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2,class X3>
+template <class Y,class X1,class X2,class X3>
Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3& x3 ) const {
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
@@ -294,7 +294,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
- // For each possible preceding trellis node...
+ // For each possible preceding trellis node...
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
// Add product of result and previous trellis cell to current trellis cell...
@@ -321,7 +321,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2,class X3>
+template <class Y,class X1,class X2,class X3>
bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields ) {
if ( 7==numFields )
setPotential ( X1(string(aps[1])), // globals
@@ -335,9 +335,9 @@ bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields )
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2, class X3>
+template <class Y,class X1,class X2, class X3>
void CRF4DModeledRV<Y,X1,X2,X3>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
- const X1& x1, const X2& x2,
+ const X1& x1, const X2& x2,
const X3& x3, bool bObsVal ) const {
fprintf ( pf, "%04d> %s ", frame, psMdl );
// For each shape (feature slope)...
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-denot.h b/contrib/synlm/hhmm/rvtl/include/nl-denot.h
index 0b50663a1..be92168b8 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-denot.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-denot.h
@@ -80,7 +80,7 @@ void VecE<N,I,RC>::read ( char* ps, const ReaderContext& rc ) {
*/
char* psT; int i=0;
for ( char* psU=strtok_r(ps,",",&psT);
- psU && i<NUM_ENTS;
+ psU && i<NUM_ENTS;
psU=strtok_r(NULL,",",&psT),i++ )
StaticSafeArray<N,I>::set(i) = psU;
}
@@ -166,7 +166,7 @@ void VecV<N,I,RC,ND1,ND2>::read ( char* ps, VecVReaderContext& rc ) {
// Chop into individual coinds strings...
char* psT; int i=0;
for ( char* psU=strtok_r(ps,",",&psT);
- psU && i<NUM_ENTS;
+ psU && i<NUM_ENTS;
psU=strtok_r(NULL,",",&psT), i++ )
asV.set(i) = psU;
@@ -230,7 +230,7 @@ class JointVecV { //// : public StaticSafeArray<V1::NUM_ENTS+V2::NUM_ENTS,I> {
static const int NUM_ENTS;
// Constructor / destructor methods...
JointVecV ( ) { }
- JointVecV ( const V1& a1, const V2& a2 ) {
+ JointVecV ( const V1& a1, const V2& a2 ) {
////fprintf(stderr,"iJoin "); a1.V1::write(stderr); fprintf(stderr," "); a2.V2::write(stderr); fprintf(stderr,"\n");
for (int i=0; i<NUM_ENTS; i++) {
if ( i<V1::NUM_ENTS ) set(i) = (a1.get(i)==-1) ? IntType(-1) : (a1.get(i)<V1::NUM_ENTS) ? IntType(a1.get(i)) : a1.get(i)+V2::NUM_ENTS;
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-dtree-cont.h b/contrib/synlm/hhmm/rvtl/include/nl-dtree-cont.h
index cf6b00d28..1deb757a5 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-dtree-cont.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-dtree-cont.h
@@ -75,7 +75,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
// Extraction methods...
const P getProb ( const Y y, const X& x ) const {
const Tree<ContDecisNode<Y,P> >* ptr = this;
- while ( !ptr->isTerm() ) {
+ while ( !ptr->isTerm() ) {
double sumsqr=0.0;
for(A a;a<X::getSize();a.setNext()) sumsqr += pow(x.get(a.toInt()),2.0) / X::getSize();
Wt wtdavg = -Tree<ContDecisNode<Y,P> >::getWt();
@@ -112,7 +112,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
};
////////////////////
-template <class Y,class X, class P>
+template <class Y,class X, class P>
bool ContDTree2DModel<Y,X,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (3==numFields || 4==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@@ -171,7 +171,7 @@ class ContDTree3DModel : public Generic3DModel<Y,X1,X2,P> {
};
////////////////////
-template <class Y,class X1,class X2, class P>
+template <class Y,class X1,class X2, class P>
bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@@ -212,7 +212,7 @@ bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X, class P>
-class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
+class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
private:
List<Joint2DRV<X,Y> > lxy;
public:
@@ -225,7 +225,7 @@ class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
void train ( List<Joint2DRV<X,Y> >&, const double ) ;
void train ( const double d ) { train(lxy,d); }
////// Input / output methods...
- bool readData ( char* vs[], int numFields ) {
+ bool readData ( char* vs[], int numFields ) {
if ( 3==numFields ) lxy.add() = Joint2DRV<X,Y> ( X(vs[1]), Y(vs[2]) );
else return false;
return true;
@@ -312,7 +312,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
// if ( double(rand())/double(RAND_MAX) < prRarest/modelY.getProb(pxy->getSub2()) ) {
dCtr++;
- double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
+ double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
// Weight deltas for next epoch...
Wt wDelta = 0.0;
@@ -333,7 +333,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
P prY = 1.0 / ( 1.0 + exp(-wtdavg) );
// Calc deltas for each feature/attribute/dimension...
- double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
+ double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
wDelta += dEachWt * -1 * ( prY - P(double(pxy->getSub2().toInt())) );
for ( A a; a<X::getSize(); a.setNext() )
awDeltas.set(a) += dEachWt * pxy->getSub1().get(a.toInt()) * ( prY - P(double(pxy->getSub2().toInt())) );
@@ -439,7 +439,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X1, class X2, class P>
-class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
+class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
private:
@@ -455,7 +455,7 @@ class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
TrainableContDTree2DModel<Y,X2,P>& setTree(const X1& x1) { return static_cast<TrainableContDTree2DModel<Y,X2,P>&>(ContDTree3DModel<Y,X1,X2,P>::setTree(x1)); }
////// Add training data to per-subphone lists...
- bool readData ( char* vs[], int numFields ) {
+ bool readData ( char* vs[], int numFields ) {
if ( 4==numFields ) {
mqlxy[X1(vs[1])].add() = Joint2DRV<X2,Y> ( X2(vs[2]), Y(vs[3]) );
////mqlxy[X1(vs[1])].getLast()->write(stderr); fprintf(stderr,"\n");
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-dtree.h b/contrib/synlm/hhmm/rvtl/include/nl-dtree.h
index 2396f395c..93a0e4d42 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-dtree.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-dtree.h
@@ -129,8 +129,8 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
friend StringInput operator>> ( pair<StringInput,DTree2DModel<Y,X,P>*> si_m, const char* psD ) {
if (StringInput(NULL)==si_m.first) return si_m.first;
Y y; String xs; StringInput si,si2; si=si_m.first; DTree2DModel<Y,X,P>* pm=si_m.second;
- while((si2=si>>" ")!=NULL)si=si2;
- si=si>>xs>>" ";
+ while((si2=si>>" ")!=NULL)si=si2;
+ si=si>>xs>>" ";
while((si2=si>>" ")!=NULL)si=si2;
// Find appropriate node, creating nodes as necessary...
for(int i=1; i<int(strlen(xs.c_array()))-1; i++) {
@@ -140,22 +140,22 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
if ( si!=NULL && si[0]==':' ) {
si=si>>": ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>y>>" ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
return (si!=NULL) ? si>>pm->setProb(y)>>psD : si;
}
else if ( si!=NULL && si[0]=='=' ) {
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
//m.setA() = atoi(si.c_str());
int aVar = 0;
- si=si>>aVar>>psD;
- pm->setA()=aVar;
+ si=si>>aVar>>psD;
+ pm->setA()=aVar;
////cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
////cerr<<" m.getA() is "<< m.getA().toInt() << endl;
return si;
@@ -169,15 +169,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
si=si_m.first;
sRt = si.c_str();
if (sRt.find(':')!=string::npos) {
- while((si2=si>>" [")!=NULL)si=si2;
- si=si>>xs>>"] ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" [")!=NULL)si=si2;
+ si=si>>xs>>"] ";
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>": ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>y>>" ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
-
+
// For DTree, must find the node labeled by X
//Tree<B,DecisNode<X,Y,P> >* ptr = m;
//assert(ptr);
@@ -189,15 +189,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
return (si!=NULL) ? si>>m.setProb(y)>>psD : si;
} else {
- while((si2=si>>" [")!=NULL)si=si2;
+ while((si2=si>>" [")!=NULL)si=si2;
si=si>>xs>>"] "; //cerr<<" in bracket "<<((si==NULL) ? "yes" : "no") << endl;
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
//m.setA() = atoi(si.c_str());
int aVar = 0;
- si=si>>aVar>>psD;
- m.setA()=aVar;
+ si=si>>aVar>>psD;
+ m.setA()=aVar;
//cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
//cerr<<" m.getA() is "<< m.getA().toInt() << endl;
return si;
@@ -209,7 +209,7 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
};
////////////////////
-template <class Y,class X, class P>
+template <class Y,class X, class P>
bool DTree2DModel<Y,X,P>::readFields ( Array<char*>& aps ) {
if ( /*aps[0]==sId &&*/ (3==aps.size() || 4==aps.size()) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@@ -269,7 +269,7 @@ class DTree3DModel {
};
////////////////////
-template <class Y,class X1,class X2, class P>
+template <class Y,class X1,class X2, class P>
bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@@ -307,7 +307,7 @@ bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X, class P>
-class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
+class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
private:
// Type members...
typedef typename X::ElementType B;
@@ -485,7 +485,7 @@ void TrainableDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, const De
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X1, class X2, class P>
-class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
+class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
private:
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-fixedmatrix.h b/contrib/synlm/hhmm/rvtl/include/nl-fixedmatrix.h
index dbb9d9d9d..5e8b4d6d0 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-fixedmatrix.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-fixedmatrix.h
@@ -34,7 +34,7 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
Matrix ( ) : SafeArray2D<Id<int>,Id<int>,T>( ) { }//{ xSize=0; ySize=0; }
Matrix (int x, int y) : SafeArray2D<Id<int>,Id<int>,T>(x,y) { }//{ xSize=x; ySize=y; }
Matrix (int x, int y, const T& t) : SafeArray2D<Id<int>,Id<int>,T>(x,y,t) { }//{ xSize=x; ySize=y; }
- Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
+ Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
for(int i=0;i<xSize();i++) for(int j=0;j<ySize();j++) this->set(i,j)=a.get(i,j); }
// Specification methods...
//Matrix& operator= ( const Matrix<T>& sat )
@@ -195,34 +195,34 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
}
return false;
}
- bool operator== ( const Matrix<T>& a ) const {
+ bool operator== ( const Matrix<T>& a ) const {
if (xSize()!=a.xSize() || ySize()!=a.ySize()) return false;
- for (int i=0;i<a.xSize();i++)
+ for (int i=0;i<a.xSize();i++)
for (int j=0;j<a.ySize();j++)
if (this->get(Id<int>(i),Id<int>(j))!=a.get(Id<int>(i),Id<int>(j))) return false;
return true;
}
// Input/output methods...
- friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
+ friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
os<<"\n ";
for (int i=0;i<a.xSize();i++) {
for (int j=0;j<a.ySize();j++) {
os<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
- }
+ }
os<<(i==a.xSize()-1?"\n":"\n ");
}
- return os;
+ return os;
}
- friend String& operator<< ( String& str, const Matrix<T>& a ) {
+ friend String& operator<< ( String& str, const Matrix<T>& a ) {
str<<"\n ";
for (int i=0;i<a.xSize();i++) {
for (int j=0;j<a.ySize();j++) {
str<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
- }
+ }
str<<";";
}
- return str;
+ return str;
}
string getString( ) const;
@@ -234,7 +234,7 @@ string Matrix<T>::getString() const {
for (int j=0;j<ySize();j++) {
str += ((j==0)?"":",");
str += this->get(Id<int>(i),Id<int>(j));
- }
+ }
str += ";";
}
return str;
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-gauss.h b/contrib/synlm/hhmm/rvtl/include/nl-gauss.h
index a2213086f..f5cc45159 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-gauss.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-gauss.h
@@ -43,7 +43,7 @@ static const PDFVal VARIANCE_THRESHOLD = 0.01; //0.0001; //0
//
////////////////////////////////////////////////////////////////////////////////
-template <class Y>
+template <class Y>
class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
private:
// Member variables...
@@ -53,7 +53,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
SimpleHash<Id<int>,PDFVal> aMeans;
SimpleHash<Id<int>,PDFVal> aVariances;
PDFVal prInvRootNormVariances;
- PDFVal prProduct;
+ PDFVal prProduct;
SimpleHash<Id<int>,PDFVal> algprNegHalfInvVariances;
public:
// Constructor / destructor methods...
@@ -78,7 +78,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
};
////////////////////////////////////////
-template <class Y>
+template <class Y>
inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
// Inverse square root of norm of variances...
setInvRootNormVar() = 1.0;
@@ -92,7 +92,7 @@ inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
}
////////////////////////////////////////
-template <class Y>
+template <class Y>
inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
// fprintf(stderr,"--------------------\n");
// y.write(stderr);
@@ -109,7 +109,7 @@ inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
}
////////////////////////////////////////
-template <class Y>
+template <class Y>
bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
if ( 0==strcmp(as[1],"m") && numFields>2 ) {
char* psT;
@@ -126,12 +126,12 @@ bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
}
////////////////////////////////////////
-template <class Y>
+template <class Y>
void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
fprintf(pf,"%s m = ",sPref.c_str());
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getMean(i));
fprintf ( pf, "\n" ) ;
-
+
fprintf(pf,"%s v = ",sPref.c_str());
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getVariance(i));
fprintf ( pf, "\n" ) ;
@@ -141,7 +141,7 @@ void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
////////////////////////////////////////////////////////////////////////////////
/*
-template <class Y,class X>
+template <class Y,class X>
class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
private:
// Member variables...
@@ -177,7 +177,7 @@ class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
////////////////////////////////////////////////////////////////////////////////
-template <class Y,class X1,class X2>
+template <class Y,class X1,class X2>
class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
private:
// Member variables...
@@ -220,7 +220,7 @@ class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
//
////////////////////////////////////////////////////////////////////////////////
-template <class Y>
+template <class Y>
class TrainableDiagGauss1DModel : public DiagGauss1DModel<Y> {
public:
TrainableDiagGauss1DModel ( ) : DiagGauss1DModel<Y>() { }
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-hash.h b/contrib/synlm/hhmm/rvtl/include/nl-hash.h
index 809284db9..b4d228b9c 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-hash.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-hash.h
@@ -54,7 +54,7 @@ class SimpleHash : public hash_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > /*pu
// tr1::unordered_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > mxy;
static const Y yDummy;
//static Y yNonconstDummy;
-
+
public:
// typedef typename OrigHash::const_iterator const_iterator;
// typedef typename OrigHash::iterator iterator;
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-hmm.h b/contrib/synlm/hhmm/rvtl/include/nl-hmm.h
index 2f6cd0104..c4414c4b7 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-hmm.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-hmm.h
@@ -209,7 +209,7 @@ template <class MY, class MX, class S, class B>
void HMM<MY,MX,S,B>::debugPrint() const{
for (int frame=0, numFrames=aatnTrellis.getxSize(); frame<numFrames; frame++) {
-
+
for (int beamIndex=0, beamSize=aatnTrellis.getySize(); beamIndex<beamSize; beamIndex++) {
if (aatnTrellis.get(frame,beamIndex).getLogProb().toDouble() > 0) {
@@ -306,7 +306,7 @@ void HMM<MY,MX,S,B>::updateRanked ( const typename MX::RandVarType& x, bool b1 )
// Add best transition (top of queue)...
//mx.getProb(o,my.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) {
- S s; my.setTrellDat(s,ashpiQueue.getTop().second);
+ S s; my.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,my.setBackDat(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";
@@ -379,7 +379,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
// Incorporate into trellis...
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
//if(OUTPUT_VERYNOISY)
- // fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
+ // fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprY.toInt())/100.0,
// float(lgprX.toInt())/100.0,
@@ -389,7 +389,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
}
// for(int i=0;i<BEAM_WIDTH;i++) {
-// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
+// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// }
btn.sort(atnSorted);
@@ -429,8 +429,8 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
const TrellNode<S,B>& tnsbPrev = aatnTrellis.get(frameLast-1,i);
// If prob still not below beam minimum...
if ( tnsbPrev.getLogProb() > btn.getMin().getScore() ) {
- //if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
-
+ //if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
+
// For each possible transition...
const S& sPrev = tnsbPrev.getId();
typename MY::IterVal y;
@@ -447,7 +447,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
lgprX = mx.getProb(x,my.setTrellDat(s,y)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprX ) continue;
#endif /////////////////////////////////////////////////////////////////
lgprFull = tnsbPrev.getLogProb() * lgprY * lgprX;
- if (OUTPUT_VERYNOISY) {
+ if (OUTPUT_VERYNOISY) {
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
//fprintf(stderr," TO: "); y.write(stderr); fprintf(stderr,"\n");
cout<<" "<<tnsbPrev.getId()<<" ==("<<tnsbPrev.getLogProb().toInt()<<"*"<<lgprY.toInt()<<"*"<<lgprX.toInt()<<"="<<lgprFull.toInt()<<")==> "<<y<<"\n";
@@ -459,7 +459,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
// Incorporate into trellis...
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
// if(OUTPUT_VERYNOISY)
-// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
+// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprY.toInt())/100.0,
// float(lgprO.toInt())/100.0,
@@ -695,7 +695,7 @@ std::list<string> HMM<MY,MX,S,B>::getMLS(const S& sLast) const {
//// sprintf(tmp,"HYPOTH %04d> ", fr-1);
//// string tString(tmp);
//// tString +=
- string tString =
+ string tString =
//// aatnTrellis.get(fr,iBest).getId().getString() + " " +
aatnTrellis.get(fr,iBest).getBackData().getString()
//// + "\n"
@@ -737,7 +737,7 @@ template <class MY, class MX, class S, class B>
void HMM<MY,MX,S,B>::writeCurr ( ostream& os, int f=-1 ) const {
if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast )
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
//fprintf(pf,"at f=%04d b=%04d: ",f,i);
os<<"at "<<std::setfill('0')<<std::setw(4)<<f<<" "<<std::setw(4)<<i<<": ";
@@ -765,7 +765,7 @@ void HMM<MY,MX,S,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb sum = 0.0;
LogProb logtop = 0.0;
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = sum - logtop;
@@ -818,7 +818,7 @@ void HMM<MY,MX,S,B>::gatherElementsInBeam( SafeArray1D<Id<int>,pair<S,LogProb> >
result->init(BEAM_WIDTH);
if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast ) {
- for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
+ for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
result->set(i).first = aatnTrellis.get(f,i).getId();
result->set(i).second = aatnTrellis.get(f,i).getLogProb();
}
@@ -836,7 +836,7 @@ void HMM<MY,MX,S,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb logh = 0.0;
LogProb logtop = 0.0;
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = logh - logtop;
@@ -862,12 +862,12 @@ void HMM<MY,MX,S,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
Array<int> depths = Array<int>();
Array<LogProb> logprobs = Array<LogProb>();
double avgdepth = 0.0;
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
-
+
// loop over values in S node to find lowest meaningful depth
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
// store the depth, if it's equal to G_BOT/G_BOT
@@ -996,7 +996,7 @@ int HMM<MY,MX,S,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast;
int ctr=0;
if ( 0<=f && f<=frameLast )
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++;
}
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-hmm2.h b/contrib/synlm/hhmm/rvtl/include/nl-hmm2.h
index 711d589be..04941088d 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-hmm2.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-hmm2.h
@@ -269,7 +269,7 @@ void HMM<MH,MO,X,B>::updateRanked ( const typename MO::RandVarType& o ) {
// Add best transition (top of queue)...
//mo.getProb(o,mh.setTrellDat(axhpiQueue.getTop().first,axhpiQueue.getTop().second));
if ( axhpiQueue.getSize() > 0 ) {
- X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
+ X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
bFull |= btn.tryAdd ( x, IB(axhpiQueue.getTop().first,mh.setBackDat(axhpiQueue.getTop().second)), axhpiQueue.getTop().third );
//cerr<<axhpiQueue.getSize()<<" queue elems A "<<axhpiQueue.getTop()<<"\n";
//cerr<<"/-----A-----\\\n + bFull: "<<bFull<<"\naxhpiQueue: \n"<<axhpiQueue<<"\\-----A-----/\n";
@@ -341,7 +341,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
// Incorporate into trellis...
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
//if(OUTPUT_VERYNOISY)
- // fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
+ // fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprH.toInt())/100.0,
// float(lgprO.toInt())/100.0,
@@ -351,7 +351,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
}
// for(int i=0;i<BEAM_WIDTH;i++) {
-// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
+// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// }
btn.sort(atnSorted);
@@ -390,8 +390,8 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
const TrellNode<X,B>& tnxbPrev = aatnTrellis.get(frameLast-1,i);
// If prob still not below beam minimum...
if ( tnxbPrev.getLogProb() > btn.getMin().getScore() ) {
- //if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
-
+ //if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
+
// For each possible transition...
const X& xPrev = tnxbPrev.getId();
typename MH::IterVal h;
@@ -408,7 +408,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
lgprO = mo.getProb(o,mh.setTrellDat(x,h)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprO ) continue;
#endif /////////////////////////////////////////////////////////////////
lgprFull = tnxbPrev.getLogProb() * lgprH * lgprO;
- if (OUTPUT_VERYNOISY) {
+ if (OUTPUT_VERYNOISY) {
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
//fprintf(stderr," TO: "); h.write(stderr); fprintf(stderr,"\n");
cout<<" "<<tnxbPrev.getId()<<" ==("<<tnxbPrev.getLogProb().toInt()<<"*"<<lgprH.toInt()<<"*"<<lgprO.toInt()<<"="<<lgprFull.toInt()<<")==> "<<h<<"\n";
@@ -420,7 +420,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
// Incorporate into trellis...
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
// if(OUTPUT_VERYNOISY)
-// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
+// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprH.toInt())/100.0,
// float(lgprO.toInt())/100.0,
@@ -656,7 +656,7 @@ std::list<string> HMM<MH,MO,X,B>::getMLS(const X& xLast) const {
//// sprintf(tmp,"HYPOTH %04d> ", fr-1);
//// string tString(tmp);
//// tString +=
- string tString =
+ string tString =
//// aatnTrellis.get(fr,iBest).getId().getString() + " " +
aatnTrellis.get(fr,iBest).getBackData().getString()
//// + "\n"
@@ -697,7 +697,7 @@ template <class MH, class MO, class X, class B>
void HMM<MH,MO,X,B>::writeCurr ( FILE* pf, int f=-1 ) const {
if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast )
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
fprintf(pf,"at f=%04d b=%04d: ",f,i);
String str; str<<aatnTrellis.get(f,i).getId(); //.write(pf);
@@ -721,7 +721,7 @@ void HMM<MH,MO,X,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb sum = 0.0;
LogProb logtop = 0.0;
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = sum - logtop;
@@ -741,7 +741,7 @@ void HMM<MH,MO,X,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb logh = 0.0;
LogProb logtop = 0.0;
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = logh - logtop;
@@ -768,12 +768,12 @@ void HMM<MH,MO,X,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
Array<int> depths = Array<int>();
Array<LogProb> logprobs = Array<LogProb>();
double avgdepth = 0.0;
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
-
+
// loop over values in S node to find lowest meaningful depth
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
// store the depth, if it's equal to G_BOT/G_BOT
@@ -900,7 +900,7 @@ int HMM<MH,MO,X,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast;
int ctr=0;
if ( 0<=f && f<=frameLast )
- for ( int i=0; i<BEAM_WIDTH; i++ )
+ for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++;
}
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-hmmloop.h b/contrib/synlm/hhmm/rvtl/include/nl-hmmloop.h
index c476b4271..a8b8d5f27 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-hmmloop.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-hmmloop.h
@@ -348,7 +348,7 @@ const TrellNode<S,B>& HMMLoop<MY,MX,S,B>::update ( const typename MX::RandVarTyp
//modX.getProb(o,modY.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) {
S s ( ashpiQueue.getTop().second );
- ////S s; modY.setTrellDat(s,ashpiQueue.getTop().second);
+ ////S s; modY.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,B(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-linsep.h b/contrib/synlm/hhmm/rvtl/include/nl-linsep.h
index ac3ef3312..5c644a0fb 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-linsep.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-linsep.h
@@ -90,8 +90,8 @@ class Vector : public X {
Vector<X> operator- ( ElementType d ) const { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = X::get(i)-d; return vO; }
friend Vector<X> operator* ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d*v[i]; return vO; }
friend Vector<X> operator/ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d/v[i]; return vO; }
- friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; }
- friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; }
+ friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; }
+ friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; }
Vector<X>& operator*= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)*=d; return *this; }
Vector<X>& operator/= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)/=d; return *this; }
Vector<X>& operator+= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)+=d; return *this; }
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-mixture.h b/contrib/synlm/hhmm/rvtl/include/nl-mixture.h
index 2da5aacb2..3a88bea81 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-mixture.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-mixture.h
@@ -97,7 +97,7 @@ class Mixture3DModel : public Generic2DModel<Y,X,Prob> {
//
////////////////////////////////////////////////////////////////////////////////
-template <template <class MY> class M,class Y,class C>
+template <template <class MY> class M,class Y,class C>
class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
// private:
// LogPDFVal logpdfPrevDataAvg;
@@ -110,7 +110,7 @@ class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
};
////////////////////////////////////////
-template <template <class MY> class M,class Y,class C>
+template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob> >& lyp, const PDFVal WEIGHT_LIMIT, bool& bShouldStop ) {
LogPDFVal logpdfData = 0.0;
CPT1DModel<C,Prob> mprPseudoEmpC; // pseudo-empirical prob marginal
@@ -178,7 +178,7 @@ void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob>
}
////////////////////////////////////////
-template <template <class MY> class M,class Y,class C>
+template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Normalize model...
@@ -204,7 +204,7 @@ void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, cons
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
-template <template <class MY> class M,class Y,class X,class C>
+template <template <class MY> class M,class Y,class X,class C>
class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
private:
string sId;
@@ -225,7 +225,7 @@ class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
};
////////////////////////////////////////
-template <template <class MY> class M,class Y,class X,class C>
+template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Update each subphone from list...
int ctr = 0;
@@ -237,7 +237,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFV
}
////////////////////////////////////////
-template <template <class MY> class M,class Y,class X,class C>
+template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >& lxyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Chop list into phone-specific sub-lists...
ListedObject<Joint3DRV<X,Y,Prob> >* pxyp;
@@ -248,7 +248,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >&
}
////////////////////////////////////////
-template <template <class MY> class M,class Y,class X,class C>
+template <template <class MY> class M,class Y,class X,class C>
bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
if ( /*as[0]!=sId+"dat" ||*/ numFields!=3 ) return false;
alyp.set(X(as[1])).add() = Joint2DRV<Y,Prob>(Y(as[2]),Prob(1.0));
@@ -256,7 +256,7 @@ bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
}
////////////////////////////////////////
-template <template <class MY> class M,class Y,class X,class C>
+template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::writeFields ( FILE* pf, string sPref ) {
X x; for ( bool b=x.setFirst(); b; b=x.setNext() ) {
am.get(x).writeFields(pf,sPref+" "+x.getString());
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-modelfile.h b/contrib/synlm/hhmm/rvtl/include/nl-modelfile.h
index dc6bec487..8b9730659 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-modelfile.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-modelfile.h
@@ -37,7 +37,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int c=' '; int line=1;
CONSUME_ALL(pf,c,WHITESPACE(c),line); // Get to first record
while ( c!=EOF ) { // For each record
- if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume
+ if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume
else { // If no comment,
Array<char*> aps(100);
String psBuff(1000);
@@ -49,7 +49,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
if (!z) break;
aps[i]=z;
}
-
+
if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d %d-arg %s in line %d\n\n", numFields, aps.size(), aps[0], line);
}
@@ -75,7 +75,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int line=1;
CONSUME_ALL_SOCKET(tSockfd,c,WHITESPACE(c),line); // Get to first record
while ( c!='\0' && c!='\5' ) { // For each record
- if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume
+ if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume
else { // If no comment,
Array<char*> aps(100);
String psBuff(1000);
@@ -88,7 +88,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
if (!z) break;
aps[i]=z;
}
-
+
if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d-arg %s in line %d\n\n", numFields, aps[0], line);
}
@@ -97,7 +97,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
}
void processModelSocket ( const int tSockfd, bool rF(Array<char*>&) ) {
- int c=' ';
+ int c=' ';
processModelSocket ( tSockfd, c, rF );
}
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-oblidtree.h b/contrib/synlm/hhmm/rvtl/include/nl-oblidtree.h
index 24c82e313..d5bfd5c8e 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-oblidtree.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-oblidtree.h
@@ -80,12 +80,12 @@ class binuint {
// Input / output methods...
friend StringInput operator>> ( StringInput si, binuint& i ) {
if(si==NULL) return si;
- i.b=0;
+ i.b=0;
for ( char c=si[0]; '0'<=c && c<='1'; ++si,c=si[0])
{ i.b=i.b*2+c-'0'; }
return si; }
- friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; }
- friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; }
+ friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; }
+ friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; }
};
////////////////////////////////////////////////////////////////////////////////
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-prob.h b/contrib/synlm/hhmm/rvtl/include/nl-prob.h
index 76cf2fb57..03211404b 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-prob.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-prob.h
@@ -43,7 +43,7 @@ class Prob {
Prob ( ) { gVal = 0.0; }
Prob (double d) { gVal = d; }
Prob (const char* ps) { gVal = atof(ps); }
-
+
operator double() const { return gVal; }
double toDouble() const { return gVal; }
Prob& operator+= ( const Prob p ) { gVal += p.gVal; return *this; }
@@ -54,7 +54,7 @@ class Prob {
friend ostream& operator<< ( ostream& os, const Prob& pr ) { return os<<pr.toDouble(); }
friend String& operator<< ( String& str, const Prob& pr ) { return str<<pr.toDouble(); }
friend pair<StringInput,Prob*> operator>> ( StringInput si, Prob& n ) { return pair<StringInput,Prob*>(si,&n); }
- friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) {
+ friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=Prob(d); return si; }
};
@@ -129,7 +129,7 @@ class LogProb : public Id<int> {
friend ostream& operator<< ( ostream& os, const LogProb& lp ) { return os<<lp.toInt(); }
friend String& operator<< ( String& str, const LogProb& lp ) { return str<<lp.toInt(); }
friend pair<StringInput,LogProb*> operator>> ( StringInput si, LogProb& n ) { return pair<StringInput,LogProb*>(si,&n); }
- friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) {
+ friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=LogProb(d); return si; }
};
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-probmodel.h b/contrib/synlm/hhmm/rvtl/include/nl-probmodel.h
index 2dcff7b30..2b0a0281c 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-probmodel.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-probmodel.h
@@ -33,7 +33,7 @@
//
////////////////////////////////////////////////////////////////////////////////
-template<class Y,class P>
+template<class Y,class P>
class Generic1DModel {
public:
typedef Y RVType;
@@ -45,7 +45,7 @@ class Generic1DModel {
////////////////////////////////////////////////////////////
-template<class Y,class X1,class P>
+template<class Y,class X1,class P>
class Generic2DModel {
public:
typedef Y RVType;
@@ -60,7 +60,7 @@ class Generic2DModel {
////////////////////////////////////////////////////////////
-template<class Y,class X1,class X2,class P>
+template<class Y,class X1,class X2,class P>
class Generic3DModel {
public:
typedef Y RVType;
@@ -76,7 +76,7 @@ class Generic3DModel {
////////////////////////////////////////////////////////////
-template<class Y,class X1,class X2,class X3,class P>
+template<class Y,class X1,class X2,class X3,class P>
class Generic4DModel {
public:
typedef Y RVType;
@@ -93,7 +93,7 @@ class Generic4DModel {
////////////////////////////////////////////////////////////
-template<class Y,class X1,class X2,class X3,class X4,class P>
+template<class Y,class X1,class X2,class X3,class X4,class P>
class Generic5DModel {
public:
typedef Y RVType;
@@ -111,7 +111,7 @@ class Generic5DModel {
////////////////////////////////////////////////////////////
-template<class Y,class X1,class X2,class X3,class X4,class X5,class P>
+template<class Y,class X1,class X2,class X3,class X4,class X5,class P>
class Generic6DModel {
public:
typedef Y RVType;
@@ -130,7 +130,7 @@ class Generic6DModel {
////////////////////////////////////////////////////////////
-template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P>
+template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P>
class Generic7DModel {
public:
typedef Y RVType;
@@ -302,7 +302,7 @@ class Modeled5DRV : public M::RVType {
const typename M::Dep2Type& x2,
const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4 ) const { return m.getProb(*this,x1,x2,x3,x4); }
-
+
};
///////////////////////////////////////////////////////////////////////////////
@@ -346,7 +346,7 @@ class Modeled6DRV : public M::RVType {
const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5 ) const { return m.getProb(*this,x1,x2,x3,x4,x5); }
-
+
};
///////////////////////////////////////////////////////////////////////////////
@@ -395,7 +395,7 @@ class Modeled7DRV : public M::RVType {
const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5,
const typename M::Dep6Type& x6 ) const { return m.getProb(*this,x1,x2,x3,x4,x5,x6); }
-
+
};
///////////////////////////////////////////////////////////////////////////////
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-racpt.h b/contrib/synlm/hhmm/rvtl/include/nl-racpt.h
index 5d1502f1f..342e86de2 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-racpt.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-racpt.h
@@ -42,7 +42,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
return ( SimpleHash<K,P>::contains(k) );
}
-/*
+/*
P getProb ( const IterVal& ikyp, const K& k ) const {
if ( ikyp.iter.first == ikyp.iter.second ) { cerr<<"ERROR: no iterator to fix probability: "<<k<<endl; return P(); }
return ( ikyp.iter.first->second );
@@ -91,7 +91,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
for ( typename HKP::const_iterator ik=HKP::begin(); ik!=HKP::end(); ik++ ) {
K k=ik->first;
os << psId<<" "<<k<<" = "<<getProb(k).toDouble()<<endl;
-
+
// IterVal y;
// for ( bool b=setFirst(y,k); b; b=setNext(y,k) )
// os<<psId<<" "<<k<<" : "<<y<<" = "<<getProb(y,k).toDouble()<<"\n";
@@ -110,14 +110,14 @@ class GenericRACPTModel : public SimpleHash<K,P> {
friend pair<StringInput,GenericRACPTModel<K,P>*> operator>> ( StringInput si, GenericRACPTModel<K,P>& m ) {
return pair<StringInput,GenericRACPTModel<K,P>*>(si,&m); }
-
+
friend StringInput operator>> ( pair<StringInput,GenericRACPTModel<K,P>*> delimbuff, const char* psD ) {
- K k;
- StringInput si,si2,si3;
+ K k;
+ StringInput si,si2,si3;
GenericRACPTModel<K,P>& m = *delimbuff.second;
si=delimbuff.first;
if ( si==NULL ) return si;
-
+
// Kill the colon since we're treating the whole thing as the condition
char * str = si.c_str();
char * p = strchr(str, ':');
@@ -125,17 +125,17 @@ class GenericRACPTModel : public SimpleHash<K,P> {
p[0] = ' ';
}
si=str;
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>k>>" ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
- while((si2=si>>" ")!=NULL)si=si2;
+ while((si2=si>>" ")!=NULL)si=si2;
return (si!=NULL) ? si>>m.setProb(k)>>psD : si;
}
};
-template<class Y, class P>
+template<class Y, class P>
class RandAccCPT1DModel : public GenericRACPTModel<MapKey1D<Y>,P> {
public:
// typedef typename GenericCPTModel<Y,MapKey1D<Unit>,P>::IterVal IterVal;
@@ -170,7 +170,7 @@ P& setProb ( const Y& y ) {
////////////////////
-template<class Y, class X1, class P>
+template<class Y, class X1, class P>
class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
public:
@@ -187,7 +187,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
P getProb ( const Y& y, const X1& x1 ) const {
return GenericRACPTModel<MapKey2D<X1,Y>,P>::getProb ( MapKey2D<X1,Y>(x1,y) );
}
-
+
/*
P& setProb ( const Y& y, const X1& x1 ) {
cerr << "setProb called on racpt2d" << endl;
@@ -199,7 +199,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
////////////////////
-template<class Y, class X1, class X2, class P>
+template<class Y, class X1, class X2, class P>
class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
public:
@@ -219,7 +219,7 @@ class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
/*
////////////////////
-template<class Y, class X1, class X2, class X3, class P>
+template<class Y, class X1, class X2, class X3, class P>
class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P>::IterVal IterVal;
@@ -256,7 +256,7 @@ class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
////////////////////
-template<class Y, class X1, class X2, class X3, class X4, class P>
+template<class Y, class X1, class X2, class X3, class X4, class P>
class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P>::IterVal IterVal;
@@ -293,7 +293,7 @@ class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
////////////////////
-template<class Y, class X1, class X2, class X3, class X4, class X5, class P>
+template<class Y, class X1, class X2, class X3, class X4, class X5, class P>
class RACPT6DModel : public GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P>::IterVal IterVal;
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-randvar.h b/contrib/synlm/hhmm/rvtl/include/nl-randvar.h
index 66cc0b8f2..b4caa2bde 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-randvar.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-randvar.h
@@ -129,7 +129,7 @@ class DiscreteDomainRV : public Id<T> {
friend pair<StringInput,DiscreteDomainRV<T,domain>*> operator>> ( const StringInput ps, DiscreteDomainRV<T,domain>& rv ) { return pair<StringInput,DiscreteDomainRV<T,domain>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DiscreteDomainRV<T,domain>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
- ////assert(*delimbuff.second<domain.getSize());
+ ////assert(*delimbuff.second<domain.getSize());
int j=0;
StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=psIn.c_str(); return psIn+strlen(psIn.c_str()); }
@@ -203,7 +203,7 @@ template <class T> const T RefRV<T>::DUMMY;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
-template<class V1,class V2>
+template<class V1,class V2>
class Joint2DRV {
public:
@@ -216,7 +216,7 @@ class Joint2DRV {
Joint2DRV ( const V1& v1, const V2& v2 ) { first=v1; second=v2; }
// Extraction methods...
- size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey();
+ size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey();
/*fprintf(stderr," (%d) %d ^& %d = %d\n",sizeof(*this),x1.getHashKey(),x2.getHashKey(),k);*/ return k; }
bool operator< ( const Joint2DRV<V1,V2>& j ) const { return ( (first<j.first) ||
(first==j.first && second<j.second) ); }
@@ -276,7 +276,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
friend pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> operator>> ( StringInput ps, DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>& rv ) { return pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
- return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm
+ return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>psDlm );
}
};
@@ -290,7 +290,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
//
////////////////////////////////////////////////////////////////////////////////
-template<class V1,class V2,class V3>
+template<class V1,class V2,class V3>
class Joint3DRV {
public:
@@ -361,7 +361,7 @@ class DelimitedJoint3DRV : public Joint3DRV<V1,V2,V3> {
return pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
- return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm
+ return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>SD4>>psDlm );
}
};
@@ -453,7 +453,7 @@ class DelimitedJoint4DRV : public Joint4DRV<V1,V2,V3,V4> {
//
////////////////////////////////////////////////////////////////////////////////
-template <int I, class T>
+template <int I, class T>
class JointArrayRV {
private:
// Data members...
@@ -491,7 +491,7 @@ class JointArrayRV {
////////////////////////////////////////////////////////////////////////////////
-template <int I, char* SD, class T>
+template <int I, char* SD, class T>
class DelimitedJointArrayRV : public JointArrayRV<I,T> {
public:
@@ -569,7 +569,7 @@ class History {
/*
void read ( char* ps, const ReaderContext& rc=ReaderContext() ) { char* psT; for(int i=0;i<N;i++){char* z=strtok_r((0==i)?ps:NULL,";",&psT); assert(z); at.set(i).read(z);} }
//at.set(i).read(strtok_r((0==i)?ps:NULL,";",&psT)); }
- */
+ */
friend ostream& operator<< ( ostream& os, const History<N,T>& a ) { for(int i=0;i<N;i++)os<<((i==0)?"":";")<<a.getBack(i); return os; }
friend pair<StringInput,History<N,T>*> operator>> ( StringInput ps, History<N,T>& a ) { return pair<StringInput,History<N,T>*>(ps,&a); }
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-safeids.h b/contrib/synlm/hhmm/rvtl/include/nl-safeids.h
index 50837c366..c5f9dcb67 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-safeids.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-safeids.h
@@ -30,7 +30,7 @@
#include "nl-stream.h"
#include <iostream>
-using namespace std;
+using namespace std;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
@@ -39,7 +39,7 @@ using namespace std;
//
////////////////////////////////////////////////////////////////////////////////
-template <int I, class T>
+template <int I, class T>
class StaticSafeArray {
private:
// Data members...
@@ -84,7 +84,7 @@ class StaticSafeArray {
////////////////////////////////////////////////////////////////////////////////
-template <int I, char* SD, class T>
+template <int I, char* SD, class T>
class DelimitedStaticSafeArray : public StaticSafeArray<I,T> {
public:
DelimitedStaticSafeArray ( ) : StaticSafeArray<I,T>() { }
@@ -349,7 +349,7 @@ class SafeArray2D {
// Extraction methods...
const T& get (const X1& x,const X2& y) const { assert(at!=NULL);
assert(x.toInt()>=0); assert(x.toInt()<xSize);
- assert(y.toInt()>=0);
+ assert(y.toInt()>=0);
//this assert failed when compile without -DNDEBUG (needed for debugging). Have to figure out why before adding this assert back in
//assert(y.toInt()<ySize);
return at[x.toInt()*ySize + y.toInt()];}
@@ -423,7 +423,7 @@ class SafeArray4D {
{ delete[] at; wSize=sat.wSize; xSize=sat.xSize; ySize=sat.ySize;
zSize=sat.zSize; at=new T[wSize*xSize*ySize*zSize];
for(int i=0;i<wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
- void init (int w,int x,int y,int z)
+ void init (int w,int x,int y,int z)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z]; }
void init (int w,int x,int y,int z,const T& t)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z];
@@ -472,7 +472,7 @@ class SafeArray5D {
{ delete[] at; vSize=sat.vSize; wSize=sat.wSize; xSize=sat.xSize;
ySize=sat.ySize; zSize=sat.zSize; at=new T[vSize*wSize*xSize*ySize*zSize];
for(int i=0;i<vSize*wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
- void init(int v,int w,int x,int y,int z)
+ void init(int v,int w,int x,int y,int z)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z]; }
void init(int v,int w,int x,int y,int z,const T& t)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z];
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-stream.h b/contrib/synlm/hhmm/rvtl/include/nl-stream.h
index 8f743e12b..ee3b641fb 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-stream.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-stream.h
@@ -86,7 +86,7 @@ class IStream {
friend ostream& operator<< ( ostream& os, const IStream& is ) { return os<<is.iIndex<<","<<is.psrc<<","<<*is.psrc; }
// Match single char...
- friend IStream operator>> ( IStream is, char& c ) {
+ friend IStream operator>> ( IStream is, char& c ) {
// Propagate fail...
if (IStream()==is) return IStream();
c=is.get(is.iIndex);
@@ -106,7 +106,7 @@ class IStream {
// Match anything else followed by zero-terminated string delimiter...
template<class X> friend pair<IStream,X*> operator>> ( IStream is, X& x ) { return pair<IStream,X*>(is,&x); }
- template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) {
+ template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
X& x = *is_x.second;
// Propagate fail...
@@ -129,7 +129,7 @@ class IStream {
}
// Match integer followed by zero-terminated string delimiter...
- friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) {
+ friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
int& x = *is_x.second;
// Propagate fail...
@@ -151,7 +151,7 @@ class IStream {
}
// Match unsigned int followed by zero-terminated string delimiter...
- friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) {
+ friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
unsigned int& x = *is_x.second;
// Propagate fail...
@@ -173,7 +173,7 @@ class IStream {
}
// Match float followed by zero-terminated string delimiter...
- friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) {
+ friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
float& x = *is_x.second;
// Propagate fail...
@@ -195,7 +195,7 @@ class IStream {
}
// Match double followed by zero-terminated string delimiter...
- friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) {
+ friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
double& x = *is_x.second;
// Propagate fail...
@@ -217,7 +217,7 @@ class IStream {
}
// Match void pointer followed by zero-terminated string delimiter...
- friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) {
+ friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) {
IStream& is = is_x.first;
// Propagate fail...
if (IStream()==is) return IStream();
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-string.h b/contrib/synlm/hhmm/rvtl/include/nl-string.h
index 1a7fc34ae..73e831539 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-string.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-string.h
@@ -68,13 +68,13 @@ class StringInput {
friend StringInput operator>> ( StringInput psIn, const char* psDlm ) {
if (StringInput(NULL)==psIn) return psIn;
int i;
- for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++)
+ for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++)
if(psIn[i]!=psDlm[i]) return StringInput(NULL); //psIn;
return (psDlm[i]!='\0') ? StringInput(NULL) : (psIn[i]!='\0') ? psIn+i : SI_EOS;
}
friend pair<StringInput,int*> operator>> ( StringInput ps, int& n ) { return pair<StringInput,int*>(ps,&n); }
- friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) {
+ friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@@ -90,7 +90,7 @@ class StringInput {
}
friend pair<StringInput,unsigned int*> operator>> ( StringInput ps, unsigned int& n ) { return pair<StringInput,unsigned int*>(ps,&n); }
- friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) {
+ friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@@ -106,7 +106,7 @@ class StringInput {
}
friend pair<StringInput,double*> operator>> ( StringInput ps, double& d ) { return pair<StringInput,double*>(ps,&d); }
- friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) {
+ friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@@ -191,7 +191,7 @@ class String : public Array<char> {
friend pair<StringInput,String*> operator>> ( const StringInput ps, String& s ) { return pair<StringInput,String*>(ps,&s); }
friend StringInput operator>> ( pair<StringInput,String*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
- ////assert(*delimbuff.second<domain.getSize());
+ ////assert(*delimbuff.second<domain.getSize());
int j=0;
StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=String(psIn.c_str()); return psIn+strlen(psIn.c_str()); }
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-stringindex.h b/contrib/synlm/hhmm/rvtl/include/nl-stringindex.h
index 22931f081..933aba23d 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-stringindex.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-stringindex.h
@@ -38,7 +38,7 @@ class StringIndex{
map <string, int> msi;
map <int, string> mis;
int maxIndex;
-
+
public:
// Constructor / destructor methods...
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-tetrahex.h b/contrib/synlm/hhmm/rvtl/include/nl-tetrahex.h
index d77e4f471..60746bd53 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-tetrahex.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-tetrahex.h
@@ -22,7 +22,7 @@
///////////////////////////////////////////////////////////////////////////////
/***********************************************
- * nl-tetrahex.h
+ * nl-tetrahex.h
* a little header with some base conversion stuff
* so that we can represent base 16, 32 or 64 with
* one character.
diff --git a/contrib/synlm/hhmm/rvtl/include/nl-timer.h b/contrib/synlm/hhmm/rvtl/include/nl-timer.h
index 3fa7c5387..f6d08c992 100644
--- a/contrib/synlm/hhmm/rvtl/include/nl-timer.h
+++ b/contrib/synlm/hhmm/rvtl/include/nl-timer.h
@@ -41,7 +41,7 @@ class Timer {
}
double elapsed ( ) { // in milliseconds.
return (double(kept.tv_sec)*1000.0 + double(kept.tv_usec)/1000.0);
- //struct timeval end; gettimeofday(&end,NULL);
+ //struct timeval end; gettimeofday(&end,NULL);
//double beg_time_s = (double) beg.tv_sec + (double) ((double)beg.tv_usec / 1000000.0);
//double end_time_s = (double) end.tv_sec + (double) ((double)end.tv_usec / 1000000.0);
//return ( (end_time_s - beg_time_s) * 1000.0 );
diff --git a/contrib/synlm/hhmm/wsjparse/include/HHMMLangModel-gf.h b/contrib/synlm/hhmm/wsjparse/include/HHMMLangModel-gf.h
index 3987d6969..a8405ea13 100644
--- a/contrib/synlm/hhmm/wsjparse/include/HHMMLangModel-gf.h
+++ b/contrib/synlm/hhmm/wsjparse/include/HHMMLangModel-gf.h
@@ -136,7 +136,7 @@ class Rd : public DiscreteDomainRV<int,domRd> {
}
if (!hToG.contains(*this)) {
size_t i=s.find(',');
- assert(i!=string::npos);
+ assert(i!=string::npos);
hToG.set(*this) = G(s.substr(i+1).c_str());
if ( '1'==s[0] )
hFromG.set(G(s.substr(i+1).c_str())) = *this;
diff --git a/contrib/synlm/hhmm/wsjparse/include/TextObsModel.h b/contrib/synlm/hhmm/wsjparse/include/TextObsModel.h
index 5e277e857..57abe5ec6 100644
--- a/contrib/synlm/hhmm/wsjparse/include/TextObsModel.h
+++ b/contrib/synlm/hhmm/wsjparse/include/TextObsModel.h
@@ -42,11 +42,11 @@ typedef HidVarCPT2DModel<P,C,LogProb> PgivCModel;
class WModel {
private:
TrainableDTree2DModel<P,W,LogProb> modPgivWdt;
-
+
RandAccCPT2DModel<P,W,LogProb> modPgivWs;
RandAccCPT1DModel<P,LogProb> modP;
RandAccCPT1DModel<W,LogProb> modW;
-
+
public:
//LogProb getProb ( const W& w, const HidVarCPT1DModel<P,LogProb>::IterVal& p ) const {
LogProb getProb ( const W& w, const P::ArrayIterator<LogProb>& p ) const {
@@ -93,8 +93,8 @@ class OModel {
};
typedef DistribModeledWgivC RandVarType;
-
-
+
+
void calcProb ( OModel::RandVarType& o, const W& w ) const {
o.clear();
@@ -106,7 +106,7 @@ class OModel {
for (LogProb pr=modPgivC.setIterProb(p,c,aCtr); pr!=LogProb(); pr = modPgivC.setIterProb(p,c,aCtr=0) ){
o.setProb(c) += modPgivC.getProb(p,c).toProb() * modWgivP.getProb(w,p).toProb();
}
-
+
}
}
@@ -134,7 +134,7 @@ class XModel {
RandAccCPT2DModel<P,W,Prob> modPgivW;
RandAccCPT1DModel<P,Prob> modP;
RandAccCPT1DModel<W,Prob> modW;
-
+
public:
typedef X RandVarType;
diff --git a/jam-files/check-environment.jam b/jam-files/check-environment.jam
new file mode 100644
index 000000000..2381bb857
--- /dev/null
+++ b/jam-files/check-environment.jam
@@ -0,0 +1,42 @@
+# get stuff from environment variables if not set on the command line
+# unless blocked explicitly
+for local what in cmph irstlm
+{
+ if ! [ option.get "with-$(what)" ] && ! [ option.get "no-$(what)" : : no ]
+ {
+ local where = [ os.environ "$(what:U)_ROOT" ] ;
+ if $(where)
+ {
+ echo "setting option with-$(what) from environment variable "
+ "$(what:U)_ROOT ." ;
+ option.set "with-$(what)" : $(where) ;
+ }
+ }
+ local where = [ option.get "with-$(what)" ] ;
+}
+
+# if --with-moses-regtest is specified without a directory
+local regtest = [ option.get "with-moses-regtest" : no : yes ] ;
+if $(regtest) = yes
+{ # regtests requested but no path given
+ echo "Regression tests requested but no path given." ;
+ local $(where) = [ os.environ "MOSES_REGTEST_ROOT" ] ;
+ if ! $(where)
+ {
+ local HOME = [ os.environ "HOME" ] ;
+ if [ path.exists $(HOME)/moses-regression-tests ]
+ {
+ echo "Using ~/moses-regression-tests as the default." ;
+ option.set "with-moses-regtest" : "~/moses-regression-tests" ;
+ }
+ }
+ else
+ {
+ if [ path.exists $(where) ]
+ {
+ echo "Using $(where) from environment variable MOSES_REGTEST_ROOT."
+ option.set "with-regtest" : $(where) ;
+ }
+ }
+}
+
diff --git a/jam-files/curlpp.jam b/jam-files/curlpp.jam
new file mode 100644
index 000000000..77d5be9f3
--- /dev/null
+++ b/jam-files/curlpp.jam
@@ -0,0 +1,123 @@
+# -*- jam -*-
+# configuration for curlpp
+# I haven't been able to wrap my mind around bjam yet, so chances are
+# there's a much better way to do things.
+
+module curlppvars { } # this stores the variables we want to keep
+
+if [ option.get "no-curlpp" : : yes ]
+{
+ rule curlpp ( what ? ) { } # never return anything
+}
+else
+{
+ local version ;
+ local prefix ;
+ # check if a non-standard location for curl is given
+ local curlpp = [ option.get "with-curlpp" ] ;
+ if ! $(curlpp) # maybe via environment variable CURLPP_ROOT ?
+ {
+ local where = [ os.environ "CURLPP_ROOT" ] ;
+ if $(where)
+ {
+ option.set "with-curlpp" : $(where) ;
+ local msg = "CURLPP: setting --with-curlpp=$(where) via environment" ;
+ echo "$(msg) variable CURLPP_ROOT" ;
+ }
+ curlpp = [ option.get "with-curlpp" ] ;
+ }
+
+ local config ;
+ if $(curlpp)
+ {
+ config = $(curlpp)/bin/curlpp-config ;
+ }
+ else # is curlpp-config in the path ?
+ {
+ local curlpp-check = [ _shell "curlpp-config 2>/dev/null" : exit-status ] ;
+ if $(curlpp-check[2]) = 0 { config = curlpp-config ; }
+ }
+
+ if $(config)
+ {
+ prefix = [ shell_or_die "$(config) --prefix" ] ;
+ version = [ shell_or_die "$(config) --version" ] ;
+ version = [ SPLIT_BY_CHARACTERS $(version) : " " ] ;
+ version = [ trim-nl $(version[2]) ] ;
+ modules.poke curlppvars : prefix : $(prefix) ;
+ modules.poke curlppvars : version : $(version) ;
+
+ requirements += <define>HAVE_CURLPP ;
+ local cpp-cflags = [ shell_or_die "$(config) --cflags" ] ;
+ for local i in [ SPLIT_BY_CHARACTERS $(cpp-cflags) : " " ]
+ {
+ local incpath = [ MATCH "-I(.*)" : $(i) ] ;
+ if $(incpath)
+ {
+ # echo "CURLPP: $(i)" ;
+ requirements += <cxxflags>"-isystem $(incpath)" ;
+ # requirements += <include>$(incpath) ;
+ }
+ }
+ local cpp-libs = [ shell_or_die "$(config) --libs" ] ;
+ local cpp-prefix = [ shell_or_die "$(config) --prefix" ] ;
+ for local i in [ SPLIT_BY_CHARACTERS $(cpp-libs) : " " ]
+ {
+ local libpath = [ MATCH "^-L(.*)" : $(i) ] ;
+ if $(libpath) { requirements += <library-path>$(libpath) ; }
+ local libname = [ MATCH "^-l(.*)" : $(i) ] ;
+ if $(libname)
+ {
+ # local curl = [ MATCH "^-l(.*pp)" : $(i) ] ;
+ # if [ path.exists $(cpp-prefix)/lib/lib$(libname).a ]
+ # {
+ # echo "CURLPP: STATIC LINKING FOR LIBRARY: $(libname)" ;
+ # lib $(libname) : : <link>static ;
+ # }
+ # else
+ # {
+ external-lib $(libname) : $(cpp-prefix)/lib ;
+ # }
+ requirements += <library>$(libname)/<link>shared ;
+ # requirements += <library>$(libname) ;
+ }
+ else
+ {
+ requirements += <linkflags>$(i) ;
+ }
+
+ # requirements += <library-path>/usr/lib/x86_64-linux-gnu ;
+ # for local xtra in idn rtmp ssl crypto ssl crypto ldap rt
+ # {
+ # external-lib $(xtra) : /usr/lib/x86_64-linux-gnu ;
+ # requirements += <library>$(xtra) ;
+ # }
+ }
+ # for local e in idn rtmp ssl crypto ldap rt
+ # {
+ # external-lib $(e) ; # : /usr/lib/x86_64-linux-gnu /usr/lib32 ;
+ # requirements += <library>$(e) ;
+ # }
+
+ # the rule curlpp provides access to all the variables defined in this file
+ # if no argument is given, it returns $(version), which should only be
+ # defined if curl is available
+ rule curlpp ( what ? )
+ {
+ if $(what)
+ {
+ retval = [ modules.peek curlppvars : $(what) ] ;
+ if $(retval) { return $(retval) ; }
+ }
+ else { return "yes" ; }
+ }
+ }
+ else { rule curlpp { } }
+}
+
+if [ curlpp ]
+{
+ local prefix = [ curlpp prefix ] ;
+ local version = [ curlpp version ] ;
+ echo "CURLPP: USING VERSION $(version) FROM $(prefix)" ;
+}
diff --git a/jam-files/sanity.jam b/jam-files/sanity.jam
index e991b7eb2..b16343c7b 100644
--- a/jam-files/sanity.jam
+++ b/jam-files/sanity.jam
@@ -22,6 +22,14 @@ rule shell_or_fail ( cmd ) {
}
}
+rule shell_or_die ( cmd ) {
+ local ret = [ SHELL $(cmd) : exit-status ] ;
+ if $(ret[2]) != 0 {
+ exit $(cmd) failed : 1 ;
+ }
+ return [ trim-nl $(ret[1]) ] ;
+}
+
cxxflags = [ os.environ "CXXFLAGS" ] ;
cflags = [ os.environ "CFLAGS" ] ;
ldflags = [ os.environ "LDFLAGS" ] ;
diff --git a/jam-files/server.jam b/jam-files/server.jam
deleted file mode 100644
index 1cb2e0d38..000000000
--- a/jam-files/server.jam
+++ /dev/null
@@ -1,86 +0,0 @@
-# import path ;
-
-import option ;
-# Is the XMLRPC-C server available?
-
-rule shell_or_die ( cmd ) {
- local ret = [ _shell $(cmd) : exit-status ] ;
- if $(ret[2]) != 0 {
- exit "Failed to run $(cmd)" : 1 ;
- }
- return $(ret[1]) ;
-}
-
-build-server = [ option.get "no-xmlrpc-c" : "yes" : "no" ] ;
-
-if $(build-server) = yes
-{
- # by default, we try to build server capabilities into the server
- xmlrpc-c-path = [ option.get "with-xmlrpc-c" ] ;
- if $(xmlrpc-c-path) = ""
- {
- xmlrpc-c-config-cmd = "xmlrpc-c-config" ;
- }
- else
- {
- xmlrpc-c-config-cmd = "$(xmlrpc-c-path)/bin/xmlrpc-c-config" ;
- }
-
- # check if xmlrpc-config is available
- xmlrpc-check = [ _shell "$(xmlrpc-c-config-cmd) --features 2>/dev/null" : exit-status ] ;
- if $(xmlrpc-check[2]) = 0
- {
- # xmlrpc-c-config was found. Now check if abyss server is available
- if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ]
- {
- # Yes, abyss server is available. Is it the right xmlrpc-c version
- # Version 1.25.29 does not work.
- xmlrpc-check = [ _shell "$(xmlrpc-c-config-cmd) --version 2>/dev/null" : exit-status ] ;
- xmlrpc-c-version = $(xmlrpc-check[1]) ;
- if [ MATCH "(1.25.29)" : $(xmlrpc-c-version) ]
- {
- echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
- echo "XMLRPC-C: Moses is not compatible with xmlrpc-c version $(xmlrpc-c-version). " ;
- echo "XMLRPC-C: Use another one or compile without server functionality (--no-xmlrpc-c)." ;
- echo "XMLRPC-C: Build aborted." ;
- exit : 1 ;
- }
- else
- {
- # echo "XMLRPC-C: Found abyss server." ;
- }
- }
- else
- {
- echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
- echo "XMLRPC-C: Found xmlrpc-c but it does not provide the abyss server." ;
- echo "XMLRPC-C: Use another xmlrpc-c installation that provides one " ;
- echo "XMLRPC-C: or compile without server functionality (--no-xmlrpc-c)." ;
- exit : 1 ;
- }
- }
- else if [ option.get "with-xmlrpc-c" ]
- {
- echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
- echo "XMLRPC-C: Could not find $(xmlrpc-c-config-cmd). Build aborted. " ;
- exit : 1 ;
- }
- else
- {
- build-server = no ;
- rule build_server { return no ; }
- }
-}
-
-if $(build-server) = yes
-{
- xmlrpc-path = [ _shell "$(xmlrpc-c-config-cmd) --prefix 2>/dev/null" : exit-status ] ;
- rule build_server { return $(xmlrpc-c-config-cmd) ; }
- rule xmlrpc_path { return $(xmlrpc-path[1]) ; }
-}
-else
-{
- rule build_server { return no ; }
-}
-
-
diff --git a/jam-files/xmlrpc-c.jam b/jam-files/xmlrpc-c.jam
new file mode 100644
index 000000000..d14810e6a
--- /dev/null
+++ b/jam-files/xmlrpc-c.jam
@@ -0,0 +1,100 @@
+# This module handles the use (or non-use) of the external
+# xmlrpc-c library (including the abyss server) that is needed for
+# moses server functionality
+
+if [ option.get "no-xmlrpc-c" ]
+{
+ rule xmlrpc ( what ? ) { } # never return anything
+}
+else
+{
+ local xmlrpc = [ option.get "with-xmlrpc-c" ] ;
+ if ! $(xmlrpc) # check for environment variable
+ {
+ local where = [ os.environ "XMLRPC_C_ROOT" ] ;
+ if $(where)
+ {
+ option.set "with-xmlrpc-c" : $(where) ;
+ local msg = "setting --with-xmlrpc-c=$(where) via environment " ;
+ echo "$(msg) variable XMLRPC_C_ROOT" ;
+ }
+ xmlrpc = [ option.get "with-xmlrpc-c" ] ;
+ }
+ local config ;
+ if ! $(xmlrpc) { config = "xmlrpc-c-config" ; }
+ else { config = "$(xmlrpc)/bin/xmlrpc-c-config" ; }
+
+ # check if xmlrpc-config can be executed
+ xmlrpc-check = [ _shell "$(config) --features 2>/dev/null" : exit-status ] ;
+
+ if $(xmlrpc-check[2]) = 0 # yes it can
+ {
+ # is the abyss server available ?
+ if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ]
+ {
+ # Yes, abyss server is available. Is it the right xmlrpc-c version ?
+ # Note: Version 1.25.29 does not work.
+ xmlrpc-check = [ _shell "$(config) --version 2>/dev/null" : exit-status ] ;
+ xmlrpc-c-version = $(xmlrpc-check[1]) ;
+ if [ MATCH "(1.25.29)" : $(xmlrpc-c-version) ]
+ {
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+ echo "XMLRPC-C: Moses is not compatible with xmlrpc-c version $(xmlrpc-c-version). " ;
+ echo "XMLRPC-C: Use another one or compile without server functionality (--no-xmlrpc-c)." ;
+ echo "XMLRPC-C: Build aborted." ;
+ exit : 1 ;
+ }
+ }
+ else
+ {
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+ echo "XMLRPC-C: Found xmlrpc-c but it does not provide the abyss server." ;
+ echo "XMLRPC-C: Use another xmlrpc-c installation that provides one " ;
+ echo "XMLRPC-C: or compile without server functionality (--no-xmlrpc-c)." ;
+ exit : 1 ;
+ }
+ local prefix = [ shell_or_die "$(config) --prefix" ] ;
+ local version = [ shell_or_die "$(config) --version" ] ;
+ echo "XMLRPC-C: USING VERSION $(version) FROM $(prefix)" ;
+
+ # now add stuff to the requirements
+ local xmlrpc-cxxflags = [ shell_or_die "$(config) c++2 abyss-server --cflags" ] ;
+ requirements += <define>HAVE_XMLRPC_C ;
+ requirements += <cxxflags>$(xmlrpc-cxxflags) ;
+
+ local libs = [ shell_or_die "$(config) c++2 abyss-server --libs" ] ;
+ for local i in [ SPLIT_BY_CHARACTERS $(libs) : " " ]
+ {
+ local libname = [ MATCH "-l(xmlrpc.*)" : $(i) ] ;
+ if $(libname)
+ {
+ external-lib $(libname) : $(prefix)/lib ;
+ # : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
+ requirements += <library>$(libname) ;
+ }
+ local pathname = [ MATCH "-L(.*)" : $(i) ] ;
+ if $(pathname)
+ {
+ requirements += <library-path>$(pathname) ;
+ }
+ }
+
+ rule xmlrpc { return yes ; }
+ }
+ else if [ option.get "with-xmlrpc-c" ]
+ {
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+ echo "XMLRPC-C: Could not find $(config). " ;
+ echo "Build aborted. " ;
+ echo "Use --no-xmlrpc-c to compile moses without server functionality. " ;
+ exit : 1 ;
+ }
+ else
+ {
+ rule xmlrpc ( what ? ) { } # never return anything
+ }
+
+}
+
+
+
diff --git a/lm/bhiksha.cc b/lm/bhiksha.cc
index c8a18dfda..4262b615e 100644
--- a/lm/bhiksha.cc
+++ b/lm/bhiksha.cc
@@ -11,12 +11,12 @@ namespace lm {
namespace ngram {
namespace trie {
-DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
+DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
next_(util::BitsMask::ByMax(max_next)) {}
const uint8_t kArrayBhikshaVersion = 0;
-// TODO: put this in binary file header instead when I change the binary file format again.
+// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
@@ -33,7 +33,7 @@ uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
uint8_t required = util::RequiredBits(max_next);
uint8_t best_chop = 0;
int64_t lowest_change = std::numeric_limits<int64_t>::max();
- // There are probably faster ways but I don't care because this is only done once per order at construction time.
+ // There are probably faster ways but I don't care because this is only done once per order at construction time.
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
- max_offset * static_cast<int64_t>(chop); /* savings in bits*/
diff --git a/lm/bhiksha.hh b/lm/bhiksha.hh
index 8ec8989c7..36438f1d2 100644
--- a/lm/bhiksha.hh
+++ b/lm/bhiksha.hh
@@ -7,7 +7,7 @@
* pages={388--391},
* }
*
- * Currently only used for next pointers.
+ * Currently only used for next pointers.
*/
#ifndef LM_BHIKSHA_H
@@ -86,9 +86,9 @@ class ArrayBhiksha {
// assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
--end_it;
// assert(end_it >= begin_it);
- out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
+ out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
- out.end = ((end_it - offset_begin_) << next_inline_.bits) |
+ out.end = ((end_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
// If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
assert(out.end >= out.begin);
diff --git a/lm/binary_format.cc b/lm/binary_format.cc
index 481174047..4ad893d44 100644
--- a/lm/binary_format.cc
+++ b/lm/binary_format.cc
@@ -135,7 +135,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
-BinaryFormat::BinaryFormat(const Config &config)
+BinaryFormat::BinaryFormat(const Config &config)
: write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
diff --git a/lm/binary_format.hh b/lm/binary_format.hh
index 136d6b1aa..ff99b9574 100644
--- a/lm/binary_format.hh
+++ b/lm/binary_format.hh
@@ -19,18 +19,18 @@ namespace ngram {
extern const char *kModelNames[6];
-/*Inspect a file to determine if it is a binary lm. If not, return false.
+/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
- * this header designed for use by decoder authors.
+ * this header designed for use by decoder authors.
*/
bool RecognizeBinary(const char *file, ModelType &recognized);
struct FixedWidthParameters {
unsigned char order;
float probing_multiplier;
- // What type of model is this?
+ // What type of model is this?
ModelType model_type;
- // Does the end of the file have the actual strings in the vocabulary?
+ // Does the end of the file have the actual strings in the vocabulary?
bool has_vocabulary;
unsigned int search_version;
};
@@ -38,7 +38,7 @@ struct FixedWidthParameters {
// This is a macro instead of an inline function so constants can be assigned using it.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
-// Parameters stored in the header of a binary file.
+// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
std::vector<uint64_t> counts;
@@ -79,7 +79,7 @@ class BinaryFormat {
const char *write_mmap_;
util::LoadMethod load_method_;
- // File behind memory, if any.
+ // File behind memory, if any.
util::scoped_fd file_;
// If there is a file involved, a single mapping.
diff --git a/lm/blank.hh b/lm/blank.hh
index 2107e1cb6..e09054c9b 100644
--- a/lm/blank.hh
+++ b/lm/blank.hh
@@ -15,9 +15,9 @@ namespace ngram {
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so
- * backoff can be properly charged.
+ * backoff can be properly charged.
* These differ only in sign bit because the backoff is in fact zero in either
- * case.
+ * case.
*/
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;
@@ -28,7 +28,7 @@ inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}
-// This compiles down nicely.
+// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret;
diff --git a/lm/build_binary_main.cc b/lm/build_binary_main.cc
index 6d88a398d..35206e60b 100644
--- a/lm/build_binary_main.cc
+++ b/lm/build_binary_main.cc
@@ -56,7 +56,7 @@ void Usage(const char *name, const char *default_mem) {
exit(1);
}
-// I could really use boost::lexical_cast right about now.
+// I could really use boost::lexical_cast right about now.
float ParseFloat(const char *from) {
char *end;
float ret = strtod(from, &end);
diff --git a/lm/builder/adjust_counts.cc b/lm/builder/adjust_counts.cc
index 2dd3cef1b..bcaa71998 100644
--- a/lm/builder/adjust_counts.cc
+++ b/lm/builder/adjust_counts.cc
@@ -114,7 +114,7 @@ class CollapseStream {
current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
prune_threshold_(prune_threshold),
prune_words_(prune_words),
- block_(position) {
+ block_(position) {
StartBlock();
}
@@ -125,27 +125,27 @@ class CollapseStream {
CollapseStream &operator++() {
assert(block_);
-
+
if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
memcpy(current_.Base(), copy_from_, current_.TotalSize());
UpdateCopyFrom();
-
+
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
- current_.Mark();
+ current_.Mark();
}
-
+
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
- current_.Mark();
+ current_.Mark();
break;
}
}
}
-
+
}
-
+
current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
@@ -153,21 +153,21 @@ class CollapseStream {
++block_;
StartBlock();
}
-
+
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
- current_.Mark();
+ current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
- current_.Mark();
+ current_.Mark();
break;
}
}
}
-
+
return *this;
}
@@ -180,21 +180,21 @@ class CollapseStream {
current_.ReBase(block_->Get());
copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
UpdateCopyFrom();
-
+
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
- current_.Mark();
+ current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
- current_.Mark();
+ current_.Mark();
break;
}
}
}
-
+
}
// Find last without bos.
@@ -222,18 +222,18 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
StatCollector stats(order, counts_, counts_pruned_, discounts_);
if (order == 1) {
- // Only unigrams. Just collect stats.
+ // Only unigrams. Just collect stats.
for (NGramStream full(positions[0]); full; ++full) {
-
+
// Do not prune <s> </s> <unk>
if(*full->begin() > 2) {
if(full->Count() <= prune_thresholds_[0])
full->Mark();
-
+
if(!prune_words_.empty() && prune_words_[*full->begin()])
full->Mark();
}
-
+
stats.AddFull(full->UnmarkedCount(), full->IsMarked());
}
@@ -243,7 +243,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
NGramStreams streams;
streams.Init(positions, positions.size() - 1);
-
+
CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);
// Initialization: <unk> has count 0 and so does <s>.
@@ -261,7 +261,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
std::vector<uint64_t> actual_counts(positions.size(), 0);
// Something of a hack: don't prune <s>.
actual_counts[0] = std::numeric_limits<uint64_t>::max();
-
+
// Iterate over full (the stream of the highest order ngrams)
for (; full; ++full) {
const WordIndex *different = FindDifference(*full, **lower_valid);
@@ -272,16 +272,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Mark();
-
+
if(!prune_words_.empty()) {
for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
if(prune_words_[*i]) {
- (*lower_valid)->Mark();
+ (*lower_valid)->Mark();
break;
}
}
}
-
+
stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
++*lower_valid;
}
@@ -327,16 +327,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t lower_count = actual_counts[(*s)->Order() - 1];
if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
(*s)->Mark();
-
+
if(!prune_words_.empty()) {
for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
if(prune_words_[*i]) {
- (*s)->Mark();
+ (*s)->Mark();
break;
}
}
}
-
+
stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked());
++*s;
}
diff --git a/lm/builder/adjust_counts.hh b/lm/builder/adjust_counts.hh
index b169950e9..29319ba06 100644
--- a/lm/builder/adjust_counts.hh
+++ b/lm/builder/adjust_counts.hh
@@ -30,9 +30,9 @@ struct DiscountConfig {
WarningAction bad_action;
};
-/* Compute adjusted counts.
+/* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
- * Output: [1,N]-grams with adjusted counts.
+ * Output: [1,N]-grams with adjusted counts.
* [1,N)-grams are in suffix order
* N-grams are in undefined order (they're going to be sorted anyway).
*/
@@ -50,13 +50,13 @@ class AdjustCounts {
const DiscountConfig &discount_config,
std::vector<Discount> &discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
- prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
+ prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
{}
void Run(const util::stream::ChainPositions &positions);
private:
- const std::vector<uint64_t> &prune_thresholds_;
+ const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
const std::vector<bool> &prune_words_;
diff --git a/lm/builder/adjust_counts_test.cc b/lm/builder/adjust_counts_test.cc
index 353e3dd35..2a9d78ae0 100644
--- a/lm/builder/adjust_counts_test.cc
+++ b/lm/builder/adjust_counts_test.cc
@@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);
- // These are no longer set because the discounts are bad.
+ // These are no longer set because the discounts are bad.
/* BOOST_CHECK_EQUAL(4UL, counts[1]);
BOOST_CHECK_EQUAL(3UL, counts[2]);
BOOST_CHECK_EQUAL(3UL, counts[3]);*/
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 7f3dafa27..889eeb7a9 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -45,7 +45,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
-
+
private:
const std::size_t size_;
};
@@ -53,11 +53,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-
+
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
- }
-
+ }
+
private:
const std::size_t size_;
};
@@ -82,7 +82,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
- Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
+ Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@@ -91,7 +91,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
- // Add special words. AdjustCounts is responsible if order != 1.
+ // Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@@ -121,16 +121,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
- // Complete the write.
+ // Complete the write.
gram_.Count() = 1;
- // Prepare the next n-gram.
+ // Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
- // Block end. Need to store the context in a temporary buffer.
+ // Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@@ -158,7 +158,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
- // Small buffer to hold existing ngrams when shifting across a block boundary.
+ // Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@@ -224,12 +224,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
} catch (const util::EndOfFileException &e) {}
token_count_ = count;
type_count_ = vocab.Size();
-
+
// Create list of unigrams that are supposed to be pruned
if (!prune_vocab_filename_.empty()) {
try {
util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());
-
+
prune_words_.resize(vocab.Size(), true);
try {
while (true) {
@@ -238,12 +238,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
prune_words_[vocab.Index(*w)] = false;
}
} catch (const util::EndOfFileException &e) {}
-
+
// Never prune <unk>, <s>, </s>
prune_words_[kUNK] = false;
prune_words_[kBOS] = false;
prune_words_[kEOS] = false;
-
+
} catch (const util::Exception &e) {
std::cerr << e.what() << std::endl;
abort();
diff --git a/lm/builder/corpus_count.hh b/lm/builder/corpus_count.hh
index d3121ca45..165505c4a 100644
--- a/lm/builder/corpus_count.hh
+++ b/lm/builder/corpus_count.hh
@@ -40,7 +40,7 @@ class CorpusCount {
uint64_t &token_count_;
WordIndex &type_count_;
std::vector<bool>& prune_words_;
- const std::string& prune_vocab_filename_;
+ const std::string& prune_vocab_filename_;
std::size_t dedupe_mem_size_;
util::scoped_malloc dedupe_mem_;
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc
index b1dd96f31..80063eb2e 100644
--- a/lm/builder/initial_probabilities.cc
+++ b/lm/builder/initial_probabilities.cc
@@ -27,9 +27,9 @@ struct HashBufferEntry : public BufferEntry {
uint64_t hash_value;
};
-// Reads all entries in order like NGramStream does.
+// Reads all entries in order like NGramStream does.
// But deletes any entries that have CutoffCount below or equal to pruning
-// threshold.
+// threshold.
class PruneNGramStream {
public:
PruneNGramStream(const util::stream::ChainPosition &position) :
@@ -37,7 +37,7 @@ class PruneNGramStream {
dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
currentCount_(0),
block_(position)
- {
+ {
StartBlock();
}
@@ -50,7 +50,7 @@ class PruneNGramStream {
PruneNGramStream &operator++() {
assert(block_);
-
+
if(current_.Order() == 1 && *current_.begin() <= 2)
dest_.NextInMemory();
else if(currentCount_ > 0) {
@@ -59,9 +59,9 @@ class PruneNGramStream {
}
dest_.NextInMemory();
}
-
+
current_.NextInMemory();
-
+
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
block_->SetValidSize(dest_.Base() - block_base);
@@ -70,13 +70,13 @@ class PruneNGramStream {
if (block_) {
currentCount_ = current_.CutoffCount();
}
- } else {
+ } else {
currentCount_ = current_.CutoffCount();
}
-
+
return *this;
}
-
+
private:
void StartBlock() {
for (; ; ++block_) {
@@ -85,13 +85,13 @@ class PruneNGramStream {
}
current_.ReBase(block_->Get());
currentCount_ = current_.CutoffCount();
-
+
dest_.ReBase(block_->Get());
}
NGram current_; // input iterator
NGram dest_; // output iterator
-
+
uint64_t currentCount_;
util::stream::Link block_;
@@ -155,24 +155,24 @@ class AddRight {
memcpy(previous_raw, in->begin(), size);
uint64_t denominator = 0;
uint64_t normalizer = 0;
-
+
uint64_t counts[4];
memset(counts, 0, sizeof(counts));
do {
denominator += in->UnmarkedCount();
-
+
// Collect unused probability mass from pruning.
// Becomes 0 for unpruned ngrams.
normalizer += in->UnmarkedCount() - in->CutoffCount();
-
+
// Chen&Goodman do not mention counting based on cutoffs, but
// backoff becomes larger than 1 otherwise, so probably needs
// to count cutoffs. Counts normally without pruning.
if(in->CutoffCount() > 0)
++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];
-
+
} while (++in && !memcmp(previous_raw, in->begin(), size));
-
+
BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
entry.denominator = static_cast<float>(denominator);
entry.gamma = 0.0;
@@ -182,9 +182,9 @@ class AddRight {
// Makes model sum to 1 with pruning (I hope).
entry.gamma += normalizer;
-
+
entry.gamma /= entry.denominator;
-
+
if(pruning_) {
// If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
// so add a hash value that identifies the current ngram.
@@ -244,13 +244,13 @@ class MergeRight {
++summed;
return;
}
-
+
std::vector<WordIndex> previous(grams->Order() - 1);
const std::size_t size = sizeof(WordIndex) * previous.size();
for (; grams; ++summed) {
memcpy(&previous[0], grams->begin(), size);
const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());
-
+
do {
Payload &pay = grams->Value();
pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator;
@@ -288,7 +288,7 @@ void InitialProbabilities(
gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);
primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);
-
+
// Don't bother with the OnlyGamma thread for something to discard.
if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
}
diff --git a/lm/builder/initial_probabilities.hh b/lm/builder/initial_probabilities.hh
index 57e09cd51..a8ecf4dc2 100644
--- a/lm/builder/initial_probabilities.hh
+++ b/lm/builder/initial_probabilities.hh
@@ -15,17 +15,17 @@ struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead
util::stream::ChainConfig adder_in;
util::stream::ChainConfig adder_out;
- // SRILM doesn't normally interpolate unigrams.
+ // SRILM doesn't normally interpolate unigrams.
bool interpolate_unigrams;
};
/* Compute initial (uninterpolated) probabilities
* primary: the normal chain of n-grams. Incoming is context sorted adjusted
* counts. Outgoing has uninterpolated probabilities for use by Interpolate.
- * second_in: a second copy of the primary input. Discard the output.
+ * second_in: a second copy of the primary input. Discard the output.
* gamma_out: Computed gamma values are output on these chains in suffix order.
* The values are bare floats and should be buffered for interpolation to
- * use.
+ * use.
*/
void InitialProbabilities(
const InitialProbabilitiesConfig &config,
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 0f9b98162..5b04cb3ff 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -47,7 +47,7 @@ class OutputQ {
private:
// Product of backoffs in the numerator divided by backoffs in the
- // denominator. Does not include
+ // denominator. Does not include
std::vector<float> q_delta_;
};
@@ -81,7 +81,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
while(backoffs_[i])
++backoffs_[i];
-
+
if (backoffs_[i]) {
std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
abort();
@@ -99,7 +99,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
-
+
const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index adfd9198f..207a16dfd 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -8,8 +8,8 @@
#include <stdint.h>
namespace lm { namespace builder {
-
-/* Interpolate step.
+
+/* Interpolate step.
* Input: suffix sorted n-grams with (p_uninterpolated, gamma) from
* InitialProbabilities.
* Output: suffix sorted n-grams with complete probability
diff --git a/lm/builder/joint_order.hh b/lm/builder/joint_order.hh
index 1728706dd..b05ef67fd 100644
--- a/lm/builder/joint_order.hh
+++ b/lm/builder/joint_order.hh
@@ -35,7 +35,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
// Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
callback.Enter(current, *streams[current]);
- // Transition to looking for extensions.
+ // Transition to looking for extensions.
if (++current < order) continue;
}
#ifdef DEBUG
@@ -46,16 +46,16 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
abort();
}
#endif // DEBUG
- // No extension left.
+ // No extension left.
while(true) {
assert(current > 0);
--current;
callback.Exit(current, *streams[current]);
-
+
if (++streams[current]) break;
-
+
UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
-
+
order = current;
if (!order) return;
}
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index 65ec55729..5c9d86deb 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -53,7 +53,7 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// throw if each n-gram order has not threshold specified
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning)
-
+
// check if threshold are not in decreasing order
uint64_t lower_threshold = 0;
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
@@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
po::store(po::parse_command_line(argc, argv, options), vm);
if (argc == 1 || vm["help"].as<bool>()) {
- std::cerr <<
+ std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n"
"@inproceedings{Heafield-estimate,\n"
@@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
} else {
std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
- }
+ }
std::cerr << options << std::endl;
return 1;
}
@@ -191,11 +191,11 @@ int main(int argc, char *argv[]) {
else {
pipeline.prune_vocab = false;
}
-
+
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
- // TODO: evaluate options for these.
+ // TODO: evaluate options for these.
initial.adder_in.total_memory = 32768;
initial.adder_in.block_count = 2;
initial.adder_out.total_memory = 32768;
diff --git a/lm/builder/ngram.hh b/lm/builder/ngram.hh
index 4525b3421..d0033206c 100644
--- a/lm/builder/ngram.hh
+++ b/lm/builder/ngram.hh
@@ -68,26 +68,26 @@ class NGram {
assert(size == TotalSize(ret));
return ret;
}
-
+
// manipulate msb to signal that ngram can be pruned
/*mjd**********************************************************************/
bool IsMarked() const {
return Value().count >> (sizeof(Value().count) * 8 - 1);
}
-
+
void Mark() {
Value().count |= (1ul << (sizeof(Value().count) * 8 - 1));
}
-
+
void Unmark() {
Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1));
}
-
+
uint64_t UnmarkedCount() const {
return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1));
}
-
+
uint64_t CutoffCount() const {
return IsMarked() ? 0 : UnmarkedCount();
}
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index fced0e3bd..1ca2e26f5 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -37,7 +37,7 @@ void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint
class Master {
public:
- explicit Master(PipelineConfig &config)
+ explicit Master(PipelineConfig &config)
: config_(config), chains_(config.order), files_(config.order) {
config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block);
}
@@ -64,7 +64,7 @@ class Master {
CreateChains(config_.TotalMemory() - merge_using, count_bounds);
ngrams.Output(chains_.back(), merge_using);
- // Setup unigram file.
+ // Setup unigram file.
files_.push_back(util::MakeTemp(config_.TempPrefix()));
}
@@ -204,7 +204,7 @@ class Master {
PipelineConfig &config_;
util::stream::Chains chains_;
- // Often only unigrams, but sometimes all orders.
+ // Often only unigrams, but sometimes all orders.
util::FixedArray<util::stream::FileBuffer> files_;
};
@@ -214,7 +214,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate);
UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory());
- std::size_t memory_for_chain =
+ std::size_t memory_for_chain =
// This much memory to work with after vocab hash table.
static_cast<float>(config.TotalMemory() - vocab_usage) /
// Solve for block size including the dedupe multiplier for one block.
@@ -252,7 +252,7 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector
util::stream::Chains gamma_chains(config.order);
InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab);
- // Don't care about gamma for 0.
+ // Don't care about gamma for 0.
gamma_chains[0] >> util::stream::kRecycle;
gammas.Init(config.order - 1);
for (std::size_t i = 1; i < config.order; ++i) {
@@ -307,16 +307,16 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.
try {
- util::scoped_fd vocab_file(config.vocab_file.empty() ?
- util::MakeTemp(config.TempPrefix()) :
+ util::scoped_fd vocab_file(config.vocab_file.empty() ?
+ util::MakeTemp(config.TempPrefix()) :
util::CreateOrThrow(config.vocab_file.c_str()));
output.SetVocabFD(vocab_file.get());
uint64_t token_count;
std::string text_file_name;
-
+
std::vector<bool> prune_words;
CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words);
-
+
std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts;
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 8f4d82103..1987daff1 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -44,7 +44,7 @@ struct PipelineConfig {
// Compute collapsed q values instead of probability and backoff
bool output_q;
-
+
/* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that
* predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index 2c8c7276c..56a3134d8 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -55,7 +55,7 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
if (order != positions.size())
out << '\t' << stream->Value().complete.backoff;
out << '\n';
-
+
}
out << '\n';
}
diff --git a/lm/builder/print.hh b/lm/builder/print.hh
index ad282ea85..093a35697 100644
--- a/lm/builder/print.hh
+++ b/lm/builder/print.hh
@@ -14,7 +14,7 @@
// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to
-// buffer.
+// buffer.
namespace lm { namespace builder {
@@ -42,7 +42,7 @@ class VocabReconstitute {
std::vector<const char*> map_;
};
-// Not defined, only specialized.
+// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) {
// TODO slow
@@ -55,7 +55,7 @@ template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const
to << payload.complete.prob << ' ' << payload.complete.backoff;
}
-// template parameter is the type stored.
+// template parameter is the type stored.
template <class V> class Print {
public:
static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {
diff --git a/lm/builder/sort.hh b/lm/builder/sort.hh
index 712bb8e35..ed20b4b79 100644
--- a/lm/builder/sort.hh
+++ b/lm/builder/sort.hh
@@ -19,7 +19,7 @@ namespace builder {
*/
template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
public:
-
+
/**
* Constructs a comparator capable of comparing two n-grams.
*
@@ -51,8 +51,8 @@ template <class Child> class Comparator : public std::binary_function<const void
/**
* N-gram comparator that compares n-grams according to their reverse (suffix) order.
*
- * This comparator compares n-grams lexicographically, one word at a time,
- * beginning with the last word of each n-gram and ending with the first word of each n-gram.
+ * This comparator compares n-grams lexicographically, one word at a time,
+ * beginning with the last word of each n-gram and ending with the first word of each n-gram.
*
* Some examples of n-gram comparisons as defined by this comparator:
* - a b c == a b c
@@ -64,8 +64,8 @@ template <class Child> class Comparator : public std::binary_function<const void
*/
class SuffixOrder : public Comparator<SuffixOrder> {
public:
-
- /**
+
+ /**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@@ -73,7 +73,7 @@ class SuffixOrder : public Comparator<SuffixOrder> {
explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}
/**
- * Compares two n-grams lexicographically, one word at a time,
+ * Compares two n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram.
*
* @param lhs A pointer to the n-gram on the left-hand side of the comparison
@@ -90,11 +90,11 @@ class SuffixOrder : public Comparator<SuffixOrder> {
static const unsigned kMatchOffset = 1;
};
-
+
/**
* N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
*
- * This comparator compares n-grams lexicographically, one word at a time,
+ * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram.
*
@@ -108,8 +108,8 @@ class SuffixOrder : public Comparator<SuffixOrder> {
*/
class ContextOrder : public Comparator<ContextOrder> {
public:
-
- /**
+
+ /**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@@ -117,7 +117,7 @@ class ContextOrder : public Comparator<ContextOrder> {
explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}
/**
- * Compares two n-grams lexicographically, one word at a time,
+ * Compares two n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram.
*
@@ -136,7 +136,7 @@ class ContextOrder : public Comparator<ContextOrder> {
/**
* N-gram comparator that compares n-grams according to their natural (prefix) order.
*
- * This comparator compares n-grams lexicographically, one word at a time,
+ * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram.
*
* Some examples of n-gram comparisons as defined by this comparator:
@@ -149,8 +149,8 @@ class ContextOrder : public Comparator<ContextOrder> {
*/
class PrefixOrder : public Comparator<PrefixOrder> {
public:
-
- /**
+
+ /**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@@ -158,7 +158,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}
/**
- * Compares two n-grams lexicographically, one word at a time,
+ * Compares two n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram.
*
* @param lhs A pointer to the n-gram on the left-hand side of the comparison
@@ -171,7 +171,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
}
return false;
}
-
+
static const unsigned kMatchOffset = 0;
};
@@ -179,7 +179,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
struct AddCombiner {
bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const {
NGram first(first_void, compare.Order());
- // There isn't a const version of NGram.
+ // There isn't a const version of NGram.
NGram second(const_cast<void*>(second_void), compare.Order());
if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false;
first.Count() += second.Count();
@@ -204,10 +204,10 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
typedef util::FixedArray<S> P;
public:
-
+
/**
* Constructs, but does not initialize.
- *
+ *
* @ref util::FixedArray::Init() "Init" must be called before use.
*
* @see util::FixedArray::Init()
@@ -222,7 +222,7 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
*/
explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {}
- /**
+ /**
* Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array".
*
* The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator";
diff --git a/lm/enumerate_vocab.hh b/lm/enumerate_vocab.hh
index f5ce78985..f4c94cd26 100644
--- a/lm/enumerate_vocab.hh
+++ b/lm/enumerate_vocab.hh
@@ -10,7 +10,7 @@ namespace lm {
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does
* not take ownership. Add is called once per vocab word. index starts at 0
* and increases by 1 each time. This is only used by the Model constructor;
- * the pointer is not retained by the class.
+ * the pointer is not retained by the class.
*/
class EnumerateVocab {
public:
diff --git a/lm/facade.hh b/lm/facade.hh
index 8e12b62ee..325ef159a 100644
--- a/lm/facade.hh
+++ b/lm/facade.hh
@@ -9,8 +9,8 @@
namespace lm {
namespace base {
-// Common model interface that depends on knowing the specific classes.
-// Curiously recurring template pattern.
+// Common model interface that depends on knowing the specific classes.
+// Curiously recurring template pattern.
template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
public:
typedef StateT State;
@@ -32,7 +32,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
*reinterpret_cast<State*>(out_state));
}
- // Default Score function calls FullScore. Model can override this.
+ // Default Score function calls FullScore. Model can override this.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
@@ -53,7 +53,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
virtual ~ModelFacade() {}
- // begin_sentence and null_context can disappear after. vocab should stay.
+ // begin_sentence and null_context can disappear after. vocab should stay.
void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
begin_sentence_ = begin_sentence;
null_context_ = null_context;
diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh
index de894baf8..02eb78baa 100644
--- a/lm/filter/count_io.hh
+++ b/lm/filter/count_io.hh
@@ -33,7 +33,7 @@ class CountOutput : boost::noncopyable {
class CountBatch {
public:
- explicit CountBatch(std::streamsize initial_read)
+ explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@@ -66,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
- // This could have been a std::string but that's less happy with raw writes.
+ // This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 82fdc1ef7..6e89d1fa3 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -58,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
- Config() :
+ Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),
diff --git a/lm/filter/format.hh b/lm/filter/format.hh
index 5a2e2db3c..d453f05b8 100644
--- a/lm/filter/format.hh
+++ b/lm/filter/format.hh
@@ -134,12 +134,12 @@ struct CountFormat {
/* For multithreading, the buffer classes hold batches of filter inputs and
* outputs in memory. The strings get reused a lot, so keep them around
- * instead of clearing each time.
+ * instead of clearing each time.
*/
class InputBuffer {
public:
InputBuffer() : actual_(0) {}
-
+
void Reserve(size_t size) { lines_.reserve(size); }
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
@@ -179,18 +179,18 @@ class BinaryOutputBuffer {
void Reserve(size_t size) {
lines_.reserve(size);
}
-
+
void AddNGram(const StringPiece &line) {
lines_.push_back(line);
}
-
+
template <class Output> void Flush(Output &output) {
for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) {
output.AddNGram(*i);
}
lines_.clear();
}
-
+
private:
std::vector<StringPiece> lines_;
};
@@ -234,7 +234,7 @@ class MultipleOutputBuffer {
private:
struct Annotated {
- // If this is empty, send to all systems.
+ // If this is empty, send to all systems.
// A filter should never send to all systems and send to a single one.
std::vector<size_t> systems;
StringPiece line;
diff --git a/lm/filter/phrase.cc b/lm/filter/phrase.cc
index 345900ffa..d8260d54e 100644
--- a/lm/filter/phrase.cc
+++ b/lm/filter/phrase.cc
@@ -31,14 +31,14 @@ unsigned int ReadMultiple(std::istream &in, Substrings &out) {
word.clear();
}
if (c == ' ') continue;
- // It's more than just a space. Close out the phrase.
+ // It's more than just a space. Close out the phrase.
if (!phrase.empty()) {
sentence_content = true;
out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
phrase.clear();
}
if (c == '\t' || c == '\v') continue;
- // It's more than a space or tab: a newline.
+ // It's more than a space or tab: a newline.
if (sentence_content) {
++sentence_id;
sentence_content = false;
@@ -53,7 +53,7 @@ typedef unsigned int Sentence;
typedef std::vector<Sentence> Sentences;
} // namespace
-namespace detail {
+namespace detail {
const StringPiece kEndSentence("</s>");
@@ -61,7 +61,7 @@ class Arc {
public:
Arc() {}
- // For arcs from one vertex to another.
+ // For arcs from one vertex to another.
void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
Set(to, intersect);
from_ = &from;
@@ -69,7 +69,7 @@ class Arc {
/* For arcs from before the n-gram begins to somewhere in the n-gram (right
* aligned). These have no from_ vertex; it implictly matches every
- * sentence. This also handles when the n-gram is a substring of a phrase.
+ * sentence. This also handles when the n-gram is a substring of a phrase.
*/
void SetRight(detail::Vertex &to, const Sentences &complete) {
Set(to, complete);
@@ -87,12 +87,12 @@ class Arc {
/* When this function returns:
* If Empty() then there's nothing left from this intersection.
*
- * If Current() == to then to is part of the intersection.
+ * If Current() == to then to is part of the intersection.
*
* Otherwise, Current() > to. In this case, to is not part of the
* intersection and neither is anything < Current(). To determine if
* any value >= Current() is in the intersection, call LowerBound again
- * with the value.
+ * with the value.
*/
void LowerBound(const Sentence to);
@@ -160,15 +160,15 @@ void Arc::Set(Vertex &to, const Sentences &sentences) {
void Vertex::LowerBound(const Sentence to) {
if (Empty()) return;
- // Union lower bound.
+ // Union lower bound.
while (true) {
Arc *top = incoming_.top();
if (top->Current() > to) {
current_ = top->Current();
return;
}
- // If top->Current() == to, we still need to verify that's an actual
- // element and not just a bound.
+ // If top->Current() == to, we still need to verify that's an actual
+ // element and not just a bound.
incoming_.pop();
top->LowerBound(to);
if (!top->Empty()) {
@@ -213,13 +213,13 @@ void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detai
}
}
- // Phrases starting at the second or later word in the n-gram.
+ // Phrases starting at the second or later word in the n-gram.
Vertex *vertex_from = vertices;
for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
hash = 0;
Vertex *vertex_to = vertex_from + 1;
for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
- // Notice that word_to and vertex_to have the same index.
+ // Notice that word_to and vertex_to have the same index.
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
// Now hash covers [word_from, word_to].
if (word_to == last_word) {
@@ -250,7 +250,7 @@ detail::Vertex &ConditionCommon::MakeGraph() {
vertices_.clear();
vertices_.resize(hashes_.size());
arcs_.clear();
- // One for every substring.
+ // One for every substring.
arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin());
return vertices_[hashes_.size() - 1];
diff --git a/lm/filter/phrase.hh b/lm/filter/phrase.hh
index e5898c9ae..5227ab246 100644
--- a/lm/filter/phrase.hh
+++ b/lm/filter/phrase.hh
@@ -27,7 +27,7 @@ class Substrings {
private:
/* This is the value in a hash table where the key is a string. It indicates
* four sets of sentences:
- * substring is sentences with a phrase containing the key as a substring.
+ * substring is sentences with a phrase containing the key as a substring.
* left is sentencess with a phrase that begins with the key (left aligned).
* right is sentences with a phrase that ends with the key (right aligned).
* phrase is sentences where the key is a phrase.
@@ -39,8 +39,8 @@ class Substrings {
/* Most of the CPU is hash table lookups, so let's not complicate it with
* vector equality comparisons. If a collision happens, the SentenceRelation
* structure will contain the union of sentence ids over the colliding strings.
- * In that case, the filter will be slightly more permissive.
- * The key here is the same as boost's hash of std::vector<std::string>.
+ * In that case, the filter will be slightly more permissive.
+ * The key here is the same as boost's hash of std::vector<std::string>.
*/
typedef boost::unordered_map<Hash, SentenceRelation> Table;
@@ -58,9 +58,9 @@ class Substrings {
LM_FILTER_PHRASE_METHOD(Phrase, phrase)
#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
- // sentence_id must be non-decreasing. Iterators are over words in the phrase.
+ // sentence_id must be non-decreasing. Iterators are over words in the phrase.
template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
- // Iterate over all substrings.
+ // Iterate over all substrings.
for (Iterator start = begin; start != end; ++start) {
Hash hash = 0;
SentenceRelation *relation;
@@ -85,7 +85,7 @@ class Substrings {
};
// Read a file with one sentence per line containing tab-delimited phrases of
-// space-separated words.
+// space-separated words.
unsigned int ReadMultiple(std::istream &in, Substrings &out);
namespace detail {
@@ -94,7 +94,7 @@ extern const StringPiece kEndSentence;
template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
hashes.clear();
if (i == end) return;
- // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
+ // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
++i;
}
diff --git a/lm/filter/phrase_table_vocab_main.cc b/lm/filter/phrase_table_vocab_main.cc
index e0f47d894..e8a8d0265 100644
--- a/lm/filter/phrase_table_vocab_main.cc
+++ b/lm/filter/phrase_table_vocab_main.cc
@@ -88,7 +88,7 @@ class TargetWords {
class Input {
public:
- explicit Input(std::size_t max_length)
+ explicit Input(std::size_t max_length)
: max_length_(max_length), sentence_id_(0), empty_() {}
void AddSentence(StringPiece sentence, TargetWords &targets) {
@@ -125,7 +125,7 @@ class Input {
Map map_;
std::size_t sentence_id_;
-
+
// Temporaries in AddSentence.
std::string canonical_;
std::vector<std::size_t> starts_;
diff --git a/lm/filter/thread.hh b/lm/filter/thread.hh
index 6a6523f90..88e069cb1 100644
--- a/lm/filter/thread.hh
+++ b/lm/filter/thread.hh
@@ -13,29 +13,29 @@ namespace lm {
template <class OutputBuffer> class ThreadBatch {
public:
ThreadBatch() {}
-
+
void Reserve(size_t size) {
input_.Reserve(size);
output_.Reserve(size);
}
- // File reading thread.
+ // File reading thread.
InputBuffer &Fill(uint64_t sequence) {
sequence_ = sequence;
// Why wait until now to clear instead of after output? free in the same
- // thread as allocated.
+ // thread as allocated.
input_.Clear();
return input_;
}
- // Filter worker thread.
+ // Filter worker thread.
template <class Filter> void CallFilter(Filter &filter) {
input_.CallFilter(filter, output_);
}
uint64_t Sequence() const { return sequence_; }
- // File writing thread.
+ // File writing thread.
template <class RealOutput> void Flush(RealOutput &output) {
output_.Flush(output);
}
@@ -73,7 +73,7 @@ template <class Batch, class Output> class OutputWorker {
void operator()(Request request) {
assert(request->Sequence() >= base_sequence_);
- // Assemble the output in order.
+ // Assemble the output in order.
uint64_t pos = request->Sequence() - base_sequence_;
if (pos >= ordering_.size()) {
ordering_.resize(pos + 1, NULL);
@@ -102,7 +102,7 @@ template <class Filter, class OutputBuffer, class RealOutput> class Controller :
typedef ThreadBatch<OutputBuffer> Batch;
public:
- Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
+ Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
: batch_size_(batch_size), queue_size_(queue),
batches_(queue),
to_read_(queue),
diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc
index 0a5585580..2aca4fc60 100644
--- a/lm/filter/vocab.cc
+++ b/lm/filter/vocab.cc
@@ -30,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
-// very long, so don't read an entire line at a time.
+// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;
diff --git a/lm/filter/vocab.hh b/lm/filter/vocab.hh
index 2ee6e1f8a..397a93237 100644
--- a/lm/filter/vocab.hh
+++ b/lm/filter/vocab.hh
@@ -26,7 +26,7 @@ unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, st
/* Is this a special tag like <s> or <UNK>? This actually includes anything
* surrounded with < and >, which most tokenizers separate for real words, so
- * this should not catch real words as it looks at a single token.
+ * this should not catch real words as it looks at a single token.
*/
inline bool IsTag(const StringPiece &value) {
// The parser should never give an empty string.
diff --git a/lm/filter/wrapper.hh b/lm/filter/wrapper.hh
index 822c5c27d..227ec8e45 100644
--- a/lm/filter/wrapper.hh
+++ b/lm/filter/wrapper.hh
@@ -13,7 +13,7 @@ namespace lm {
// multiple-output filter so clients code against one interface.
template <class Binary> class BinaryFilter {
public:
- // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
+ // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
explicit BinaryFilter(Binary binary) : binary_(binary) {}
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
diff --git a/lm/left.hh b/lm/left.hh
index 36d613697..4d496863c 100644
--- a/lm/left.hh
+++ b/lm/left.hh
@@ -1,22 +1,22 @@
/* Efficient left and right language model state for sentence fragments.
* Intended usage:
- * Store ChartState with every chart entry.
+ * Store ChartState with every chart entry.
* To do a rule application:
- * 1. Make a ChartState object for your new entry.
- * 2. Construct RuleScore.
- * 3. Going from left to right, call Terminal or NonTerminal.
- * For terminals, just pass the vocab id.
+ * 1. Make a ChartState object for your new entry.
+ * 2. Construct RuleScore.
+ * 3. Going from left to right, call Terminal or NonTerminal.
+ * For terminals, just pass the vocab id.
* For non-terminals, pass that non-terminal's ChartState.
* If your decoder expects scores inclusive of subtree scores (i.e. you
* label entries with the highest-scoring path), pass the non-terminal's
- * score as prob.
+ * score as prob.
* If your decoder expects relative scores and will walk the chart later,
- * pass prob = 0.0.
+ * pass prob = 0.0.
* In other words, the only effect of prob is that it gets added to the
- * returned log probability.
- * 4. Call Finish. It returns the log probability.
+ * returned log probability.
+ * 4. Call Finish. It returns the log probability.
*
- * There's a couple more details:
+ * There's a couple more details:
* Do not pass <s> to Terminal as it is formally not a word in the sentence,
* only context. Instead, call BeginSentence. If called, it should be the
* first call after RuleScore is constructed (since <s> is always the
@@ -27,12 +27,12 @@
* Hashing and sorting comparison operators are provided. All state objects
* are POD. If you intend to use memcmp on raw state objects, you must call
* ZeroRemaining first, as the value of array entries beyond length is
- * otherwise undefined.
+ * otherwise undefined.
*
* Usage is of course not limited to chart decoding. Anything that generates
* sentence fragments missing left context could benefit. For example, a
* phrase-based decoder could pre-score phrases, storing ChartState with each
- * phrase, even if hypotheses are generated left-to-right.
+ * phrase, even if hypotheses are generated left-to-right.
*/
#ifndef LM_LEFT_H
@@ -77,7 +77,7 @@ template <class M> class RuleScore {
left_done_ = true;
}
- // Faster version of NonTerminal for the case where the rule begins with a non-terminal.
+ // Faster version of NonTerminal for the case where the rule begins with a non-terminal.
void BeginNonTerminal(const ChartState &in, float prob = 0.0) {
prob_ = prob;
*out_ = in;
@@ -86,7 +86,7 @@ template <class M> class RuleScore {
void NonTerminal(const ChartState &in, float prob = 0.0) {
prob_ += prob;
-
+
if (!in.left.length) {
if (in.left.full) {
for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i;
@@ -131,26 +131,26 @@ template <class M> class RuleScore {
return;
}
- // Right state was minimized, so it's already independent of the new words to the left.
+ // Right state was minimized, so it's already independent of the new words to the left.
if (in.right.length < in.left.length) {
out_->right = in.right;
return;
}
- // Shift exisiting words down.
+ // Shift exisiting words down.
for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) {
*(i + in.right.length) = *i;
}
- // Add words from in.right.
+ // Add words from in.right.
std::copy(in.right.words, in.right.words + in.right.length, out_->right.words);
- // Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
+ // Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff);
std::copy(back, back + next_use, out_->right.backoff + in.right.length);
out_->right.length = in.right.length + next_use;
}
float Finish() {
- // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
+ // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1);
return prob_;
}
@@ -173,17 +173,17 @@ template <class M> class RuleScore {
back_in, // Backoffs to use
in.left.pointers[extend_length - 1], extend_length, // Words to be extended
back_out, // Backoffs for the next score
- next_use)); // Length of n-gram to use in next scoring.
+ next_use)); // Length of n-gram to use in next scoring.
if (next_use != out_->right.length) {
left_done_ = true;
if (!next_use) {
- // Early exit.
+ // Early exit.
out_->right = in.right;
prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1);
return true;
}
}
- // Continue scoring.
+ // Continue scoring.
return false;
}
diff --git a/lm/left_test.cc b/lm/left_test.cc
index b45614613..fdb641627 100644
--- a/lm/left_test.cc
+++ b/lm/left_test.cc
@@ -16,7 +16,7 @@ namespace {
#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);
-// Apparently some Boost versions use templates and are pretty strict about types matching.
+// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
template <class M> void Short(const M &m) {
@@ -175,7 +175,7 @@ template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vec
SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \
-// Build sentences, or parts thereof, from right to left.
+// Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m, bool rest = false) {
std::vector<WordIndex> words;
float expect;
diff --git a/lm/lm_exception.hh b/lm/lm_exception.hh
index 8bb610812..85a5738eb 100644
--- a/lm/lm_exception.hh
+++ b/lm/lm_exception.hh
@@ -1,7 +1,7 @@
#ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H
-// Named to avoid conflict with util/exception.hh.
+// Named to avoid conflict with util/exception.hh.
#include "util/exception.hh"
#include "util/string_piece.hh"
diff --git a/lm/max_order.hh b/lm/max_order.hh
index 5f181f3fc..0ad1379e0 100644
--- a/lm/max_order.hh
+++ b/lm/max_order.hh
@@ -1,7 +1,7 @@
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
- * If not, this is the default maximum order.
+ * If not, this is the default maximum order.
* Having this limit means that State can be
* (kMaxOrder - 1) * sizeof(float) bytes instead of
* sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
diff --git a/lm/model.hh b/lm/model.hh
index c67ae2eed..b2bbe3999 100644
--- a/lm/model.hh
+++ b/lm/model.hh
@@ -25,7 +25,7 @@ namespace lm {
namespace ngram {
namespace detail {
-// Should return the same results as SRI.
+// Should return the same results as SRI.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
@@ -38,7 +38,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Get the size of memory that will be mapped given ngram counts. This
* does not include small non-mapped control structures, such as this class
- * itself.
+ * itself.
*/
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
@@ -46,47 +46,47 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
* files must have the format expected by this class or you'll get an
* exception. So TrieModel can only load ARPA or binary created by
* TrieModel. To classify binary files, call RecognizeBinary in
- * lm/binary_format.hh.
+ * lm/binary_format.hh.
*/
explicit GenericModel(const char *file, const Config &config = Config());
/* Score p(new_word | in_state) and incorporate new_word into out_state.
* Note that in_state and out_state must be different references:
- * &in_state != &out_state.
+ * &in_state != &out_state.
*/
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
/* Slower call without in_state. Try to remember state, but sometimes it
- * would cost too much memory or your decoder isn't setup properly.
+ * would cost too much memory or your decoder isn't setup properly.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). The new_word is not part of the context
- * array unless you intend to repeat words.
+ * array unless you intend to repeat words.
*/
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or NullContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
- * FullScoreForgotState.
+ * FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
- * [context_rbegin, context_rend).
+ * [context_rbegin, context_rend).
*/
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
/* More efficient version of FullScore where a partial n-gram has already
- * been scored.
- * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
+ * been scored.
+ * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
*/
FullScoreReturn ExtendLeft(
- // Additional context in reverse order. This will update add_rend to
+ // Additional context in reverse order. This will update add_rend to
const WordIndex *add_rbegin, const WordIndex *add_rend,
- // Backoff weights to use.
+ // Backoff weights to use.
const float *backoff_in,
// extend_left returned by a previous query.
uint64_t extend_pointer,
- // Length of n-gram that the pointer corresponds to.
+ // Length of n-gram that the pointer corresponds to.
unsigned char extend_length,
// Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
float *backoff_out,
@@ -95,17 +95,17 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Return probabilities minus rest costs for an array of pointers. The
* first length should be the length of the n-gram to which pointers_begin
- * points.
+ * points.
*/
float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
- // Compiler should optimize this if away.
+ // Compiler should optimize this if away.
return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
}
private:
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
- // Score bigrams and above. Do not include backoff.
+ // Score bigrams and above. Do not include backoff.
void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;
// Appears after Size in the cc file.
@@ -116,7 +116,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
BinaryFormat backing_;
-
+
VocabularyT vocab_;
Search search_;
@@ -124,8 +124,8 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
} // namespace detail
-// Instead of typedef, inherit. This allows the Model etc to be forward declared.
-// Oh the joys of C and C++.
+// Instead of typedef, inherit. This allows the Model etc to be forward declared.
+// Oh the joys of C and C++.
#define LM_COMMA() ,
#define LM_NAME_MODEL(name, from)\
class name : public from {\
@@ -140,7 +140,7 @@ LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize
LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
-// Default implementation. No real reason for it to be the default.
+// Default implementation. No real reason for it to be the default.
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;
diff --git a/lm/model_test.cc b/lm/model_test.cc
index 2e4b14fb4..d408d6fe4 100644
--- a/lm/model_test.cc
+++ b/lm/model_test.cc
@@ -7,7 +7,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
-// Apparently some Boost versions use templates and are pretty strict about types matching.
+// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
namespace lm {
@@ -118,7 +118,7 @@ template <class M> void Blanks(const M &model) {
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);
state = model.NullContextState();
- // higher looking is a blank.
+ // higher looking is a blank.
AppendTest("higher", 1, -1.509559, false);
AppendTest("looking", 2, -1.285941 - 0.30103, false);
@@ -150,7 +150,7 @@ template <class M> void Unknowns(const M &model) {
State preserve = state;
AppendTest("not_found2", 2, -15.0, true);
AppendTest("not_found3", 2, -15.0 - 2.0, true);
-
+
state = preserve;
AppendTest("however", 2, -4, true);
AppendTest("not_found3", 3, -6, true);
@@ -167,7 +167,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("foo", 1, -3.141592, true);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 2, -6.0, true);
- // Has to include the backoff weight.
+ // Has to include the backoff weight.
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 1, -2.718281 + 3.0, true);
BOOST_CHECK_EQUAL(1, state.length);
@@ -263,7 +263,7 @@ template <class M> void Stateless(const M &model) {
// the
AppendTest("the", 1, -4.04005, true);
StatelessTest(5, 5, 1, -4.04005);
- // No context of the.
+ // No context of the.
StatelessTest(5, 0, 1, -1.687872);
// biarritz
StatelessTest(6, 1, 1, -1.9889);
diff --git a/lm/model_type.hh b/lm/model_type.hh
index fbe1117a5..dcdc6ac7c 100644
--- a/lm/model_type.hh
+++ b/lm/model_type.hh
@@ -8,7 +8,7 @@ namespace ngram {
* and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;
-// Historical names.
+// Historical names.
const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;
diff --git a/lm/ngram_query.hh b/lm/ngram_query.hh
index 560853749..937fe2421 100644
--- a/lm/ngram_query.hh
+++ b/lm/ngram_query.hh
@@ -22,7 +22,7 @@ struct BasicPrint {
std::cout << "Total: " << total << " OOV: " << oov << '\n';
}
void Summary(double, double, uint64_t, uint64_t) {}
-
+
};
struct FullPrint : public BasicPrint {
@@ -31,7 +31,7 @@ struct FullPrint : public BasicPrint {
}
void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
- std::cout <<
+ std::cout <<
"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n"
diff --git a/lm/partial.hh b/lm/partial.hh
index 3e67d91c5..9e4e3522e 100644
--- a/lm/partial.hh
+++ b/lm/partial.hh
@@ -35,9 +35,9 @@ template <class Model> ExtendReturn ExtendLoop(
unsigned char i = 0;
unsigned char length = pointers_end - pointers;
- // pointers_write is NULL means that the existing left state is full, so we should use completed probabilities.
+ // pointers_write is NULL means that the existing left state is full, so we should use completed probabilities.
if (pointers_write) {
- // Using full context, writing to new left state.
+ // Using full context, writing to new left state.
for (; i < length; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@@ -61,7 +61,7 @@ template <class Model> ExtendReturn ExtendLoop(
}
}
}
- // Using some of the new context.
+ // Using some of the new context.
for (; i < length && value.next_use; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@@ -73,7 +73,7 @@ template <class Model> ExtendReturn ExtendLoop(
value.adjust += ret.prob;
}
float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1);
- // Using none of the new context.
+ // Using none of the new context.
value.adjust += unrest;
std::copy(backoff_in, backoff_in + value.next_use, backoff_write);
@@ -100,7 +100,7 @@ template <class Model> float RevealBefore(const Model &model, const Right &revea
if (left.full) {
for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
} else {
- // If left wasn't full when it came in, put words into right state.
+ // If left wasn't full when it came in, put words into right state.
std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length);
right.length += value.next_use;
left.full = value.make_full || (right.length == model.Order() - 1);
diff --git a/lm/partial_test.cc b/lm/partial_test.cc
index 8d309c85a..adb644fa6 100644
--- a/lm/partial_test.cc
+++ b/lm/partial_test.cc
@@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE(EndSentence) {
before.words[1] = loin;
before.backoff[0] = -0.845098;
before.backoff[1] = 0.0;
-
+
before.length = 1;
BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001);
BOOST_CHECK_EQUAL(0, between.left.length);
@@ -159,7 +159,7 @@ void CheckAdjustment(const RestProbingModel &model, float expect, const Right &b
if (before_full) {
got += RevealBefore(model, before, before.length, true, between.left, between.right);
}
- // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
+ // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
BOOST_CHECK(fabs(expect - got) < 0.001);
}
diff --git a/lm/quantize.cc b/lm/quantize.cc
index 273ea3989..02b5dbc0e 100644
--- a/lm/quantize.cc
+++ b/lm/quantize.cc
@@ -50,12 +50,12 @@ void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
prob_bits_ = config.prob_bits;
backoff_bits_ = config.backoff_bits;
- // We need the reserved values.
+ // We need the reserved values.
if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
- // Reserve 8 byte header for bit counts.
+ // Reserve 8 byte header for bit counts.
actual_base_ = static_cast<uint8_t*>(base);
float *start = reinterpret_cast<float*>(actual_base_ + 8);
for (unsigned char i = 0; i < order - 2; ++i) {
diff --git a/lm/quantize.hh b/lm/quantize.hh
index 84a30872e..8500aceec 100644
--- a/lm/quantize.hh
+++ b/lm/quantize.hh
@@ -85,7 +85,7 @@ class DontQuantize {
void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {}
static const bool kTrain = false;
- // These should never be called because kTrain is false.
+ // These should never be called because kTrain is false.
void Train(uint8_t /*order*/, std::vector<float> &/*prob*/, std::vector<float> &/*backoff*/) {}
void TrainProb(uint8_t, std::vector<float> &/*prob*/) {}
@@ -142,7 +142,7 @@ class SeparatelyQuantize {
static uint64_t Size(uint8_t order, const Config &config) {
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);
uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table;
- // unigrams are currently not quantized so no need for a table.
+ // unigrams are currently not quantized so no need for a table.
return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8;
}
@@ -168,7 +168,7 @@ class SeparatelyQuantize {
float Rest() const { return Prob(); }
void Write(float prob, float backoff) const {
- util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
+ util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
(ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff));
}
@@ -183,7 +183,7 @@ class SeparatelyQuantize {
class LongestPointer {
public:
LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {}
-
+
LongestPointer() : address_(NULL, 0) {}
bool Found() const { return address_.base != NULL; }
@@ -206,7 +206,7 @@ class SeparatelyQuantize {
void SetupMemory(void *start, unsigned char order, const Config &config);
static const bool kTrain = true;
- // Assumes 0.0 is removed from backoff.
+ // Assumes 0.0 is removed from backoff.
void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
// Train just probabilities (for longest order).
void TrainProb(uint8_t order, std::vector<float> &prob);
diff --git a/lm/return.hh b/lm/return.hh
index 982ffd66a..ee1f25e94 100644
--- a/lm/return.hh
+++ b/lm/return.hh
@@ -9,7 +9,7 @@ struct FullScoreReturn {
// log10 probability
float prob;
- /* The length of n-gram matched. Do not use this for recombination.
+ /* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
@@ -18,9 +18,9 @@ struct FullScoreReturn {
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
- * right.
- * If you score ``foo'' then ngram_length is 1 and recombination state is
- * ``foo''.
+ * right.
+ * If you score ``foo'' then ngram_length is 1 and recombination state is
+ * ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
@@ -29,7 +29,7 @@ struct FullScoreReturn {
/* Left extension information. If independent_left is set, then prob is
* independent of words to the left (up to additional backoff). Otherwise,
- * extend_left indicates how to efficiently extend further to the left.
+ * extend_left indicates how to efficiently extend further to the left.
*/
bool independent_left;
uint64_t extend_left; // Defined only if independent_left
diff --git a/lm/search_trie.cc b/lm/search_trie.cc
index 5b0f55fc8..a63985af6 100644
--- a/lm/search_trie.cc
+++ b/lm/search_trie.cc
@@ -517,7 +517,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
{
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
- // Write the last unigram entry, which is the end pointer for the bigrams.
+ // Write the last unigram entry, which is the end pointer for the bigrams.
writer.Unigram(counts[0]);
}
diff --git a/lm/sizes.cc b/lm/sizes.cc
index 55ad586c4..dd831c505 100644
--- a/lm/sizes.cc
+++ b/lm/sizes.cc
@@ -36,7 +36,7 @@ void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &con
long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide))));
std::cerr << "Memory estimate for binary LM:\ntype ";
- // right align bytes.
+ // right align bytes.
for (long int i = 0; i < length - 2; ++i) std::cerr << ' ';
std::cerr << prefix << "B\n"
diff --git a/lm/state.hh b/lm/state.hh
index d9ba596ad..2195dee73 100644
--- a/lm/state.hh
+++ b/lm/state.hh
@@ -11,7 +11,7 @@ namespace lm {
namespace ngram {
// This is a POD but if you want memcmp to return the same as operator==, call
-// ZeroRemaining first.
+// ZeroRemaining first.
class State {
public:
bool operator==(const State &other) const {
@@ -19,7 +19,7 @@ class State {
return !memcmp(words, other.words, length * sizeof(WordIndex));
}
- // Three way comparison function.
+ // Three way comparison function.
int Compare(const State &other) const {
if (length != other.length) return length < other.length ? -1 : 1;
return memcmp(words, other.words, length * sizeof(WordIndex));
@@ -30,7 +30,7 @@ class State {
return memcmp(words, other.words, length * sizeof(WordIndex)) < 0;
}
- // Call this before using raw memcmp.
+ // Call this before using raw memcmp.
void ZeroRemaining() {
for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
words[i] = 0;
@@ -40,8 +40,8 @@ class State {
unsigned char Length() const { return length; }
- // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
- // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
+ // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
+ // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
WordIndex words[KENLM_MAX_ORDER - 1];
float backoff[KENLM_MAX_ORDER - 1];
unsigned char length;
@@ -55,7 +55,7 @@ inline uint64_t hash_value(const State &state, uint64_t seed = 0) {
struct Left {
bool operator==(const Left &other) const {
- return
+ return
length == other.length &&
(!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full));
}
diff --git a/lm/trie.cc b/lm/trie.cc
index 93320a332..72ad54484 100644
--- a/lm/trie.cc
+++ b/lm/trie.cc
@@ -14,7 +14,7 @@ namespace {
class KeyAccessor {
public:
- KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits)
+ KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits)
: base_(reinterpret_cast<const uint8_t*>(base)), key_mask_(key_mask), key_bits_(key_bits), total_bits_(total_bits) {}
typedef uint64_t Key;
@@ -38,9 +38,9 @@ bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_
uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) {
uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits;
- // Extra entry for next pointer at the end.
+ // Extra entry for next pointer at the end.
// +7 then / 8 to round up bits and convert to bytes
- // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault.
+ // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault.
// Note that this waste is O(order), not O(number of ngrams).
return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t);
}
@@ -100,7 +100,7 @@ template <class Bhiksha> util::BitAddress BitPackedMiddle<Bhiksha>::Find(WordInd
template <class Bhiksha> void BitPackedMiddle<Bhiksha>::FinishedLoading(uint64_t next_end, const Config &config) {
// Write at insert_index. . .
- uint64_t last_next_write = insert_index_ * total_bits_ +
+ uint64_t last_next_write = insert_index_ * total_bits_ +
// at the offset where the next pointers are stored.
(total_bits_ - bhiksha_.InlineBits());
bhiksha_.WriteNext(base_, last_next_write, insert_index_, next_end);
diff --git a/lm/trie.hh b/lm/trie.hh
index cd39298b5..b7f0458bf 100644
--- a/lm/trie.hh
+++ b/lm/trie.hh
@@ -18,7 +18,7 @@ struct NodeRange {
uint64_t begin, end;
};
-// TODO: if the number of unigrams is a concern, also bit pack these records.
+// TODO: if the number of unigrams is a concern, also bit pack these records.
struct UnigramValue {
ProbBackoff weights;
uint64_t next;
@@ -44,24 +44,24 @@ class UnigramPointer {
class Unigram {
public:
Unigram() {}
-
+
void Init(void *start) {
unigram_ = static_cast<UnigramValue*>(start);
}
-
+
static uint64_t Size(uint64_t count) {
- // +1 in case unknown doesn't appear. +1 for the final next.
+ // +1 in case unknown doesn't appear. +1 for the final next.
return (count + 2) * sizeof(UnigramValue);
}
-
+
const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index].weights; }
-
+
ProbBackoff &Unknown() { return unigram_[0].weights; }
UnigramValue *Raw() {
return unigram_;
}
-
+
UnigramPointer Find(WordIndex word, NodeRange &next) const {
UnigramValue *val = unigram_ + word;
next.begin = val->next;
@@ -71,7 +71,7 @@ class Unigram {
private:
UnigramValue *unigram_;
-};
+};
class BitPacked {
public:
@@ -99,7 +99,7 @@ template <class Bhiksha> class BitPackedMiddle : public BitPacked {
public:
static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config);
- // next_source need not be initialized.
+ // next_source need not be initialized.
BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config);
util::BitAddress Insert(WordIndex word);
diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc
index c3f468746..33a2f96b5 100644
--- a/lm/trie_sort.cc
+++ b/lm/trie_sort.cc
@@ -27,7 +27,7 @@ namespace {
typedef util::SizedIterator NGramIter;
-// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams.
+// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams.
class PartialViewProxy {
public:
PartialViewProxy() : attention_size_(0), inner_() {}
@@ -64,7 +64,7 @@ class PartialViewProxy {
typedef util::SizedInnerIterator InnerIterator;
InnerIterator &Inner() { return inner_; }
- const InnerIterator &Inner() const { return inner_; }
+ const InnerIterator &Inner() const { return inner_; }
InnerIterator inner_;
};
@@ -78,7 +78,7 @@ FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &t
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) {
const size_t context_size = sizeof(WordIndex) * (order - 1);
- // Sort just the contexts using the same memory.
+ // Sort just the contexts using the same memory.
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));
@@ -91,7 +91,7 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre
util::scoped_FILE out(util::FMakeTemp(temp_prefix));
- // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
+ // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
if (context_begin == context_end) return out.release();
PartialIter i(context_begin);
util::WriteOrThrow(out.get(), i->Data(), context_size);
@@ -118,7 +118,7 @@ struct ThrowCombine {
}
};
-// Useful for context files that just contain records with no value.
+// Useful for context files that just contain records with no value.
struct FirstCombine {
void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
util::WriteOrThrow(out, first, entry_size);
@@ -172,7 +172,7 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) {
util::WriteOrThrow(file_, start, amount);
long forward = entry_size_ - internal - amount;
#if !defined(_WIN32) && !defined(_WIN64)
- if (forward)
+ if (forward)
#endif
UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}
@@ -191,7 +191,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u
PositiveProbWarn warn(config.positive_log_probability);
unigram_.reset(util::MakeTemp(file_prefix));
{
- // In case <unk> appears.
+ // In case <unk> appears.
size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out);
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
@@ -199,7 +199,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u
if (!vocab.SawUnk()) ++counts[0];
}
- // Only use as much buffer as we need.
+ // Only use as much buffer as we need.
size_t buffer_use = 0;
for (unsigned int order = 2; order < counts.size(); ++order) {
buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]));
@@ -240,7 +240,7 @@ class Closer {
void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
ReadNGramHeader(f, order);
const size_t count = counts[order - 1];
- // Size of weights. Does it include backoff?
+ // Size of weights. Does it include backoff?
const size_t words_size = sizeof(WordIndex) * order;
const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
const size_t entry_size = words_size + weights_size;
@@ -264,9 +264,9 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
ReadNGram(f, order, vocab, it, *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
}
}
- // Sort full records by full n-gram.
+ // Sort full records by full n-gram.
util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
- // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
+ // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
#if defined(_WIN32) || defined(_WIN64)
std::stable_sort
#else
@@ -279,7 +279,7 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
done += (out_end - begin) / entry_size;
}
- // All individual files created. Merge them.
+ // All individual files created. Merge them.
while (files.size() > 1) {
files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine()));
diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh
index e5406d9b6..594efee51 100644
--- a/lm/trie_sort.hh
+++ b/lm/trie_sort.hh
@@ -1,4 +1,4 @@
-// Step of trie builder: create sorted files.
+// Step of trie builder: create sorted files.
#ifndef LM_TRIE_SORT_H
#define LM_TRIE_SORT_H
@@ -101,7 +101,7 @@ class SortedFiles {
private:
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
-
+
util::scoped_fd unigram_;
util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
diff --git a/lm/value.hh b/lm/value.hh
index 36e870848..d017d59fc 100644
--- a/lm/value.hh
+++ b/lm/value.hh
@@ -39,7 +39,7 @@ template <class Weights> class GenericProbingProxy {
const Weights *to_;
};
-// Basic proxy for trie unigrams.
+// Basic proxy for trie unigrams.
template <class Weights> class GenericTrieUnigramProxy {
public:
explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {}
@@ -113,7 +113,7 @@ struct RestValue {
float Rest() const { return to_->rest; }
};
-// gcc 4.1 doesn't properly back dependent types :-(.
+// gcc 4.1 doesn't properly back dependent types :-(.
#pragma pack(push)
#pragma pack(4)
struct ProbingEntry {
diff --git a/lm/value_build.cc b/lm/value_build.cc
index 3ec3dce2a..ac623a6d9 100644
--- a/lm/value_build.cc
+++ b/lm/value_build.cc
@@ -3,7 +3,7 @@
#include "lm/model.hh"
#include "lm/read_arpa.hh"
-namespace lm {
+namespace lm {
namespace ngram {
template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) {
@@ -12,8 +12,8 @@ template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &confi
for_lower.write_mmap = NULL;
for_lower.rest_lower_files.clear();
- // Unigram models aren't supported, so this is a custom loader.
- // TODO: optimize the unigram loading?
+ // Unigram models aren't supported, so this is a custom loader.
+ // TODO: optimize the unigram loading?
{
util::FilePiece uni(config.rest_lower_files[0].c_str());
std::vector<uint64_t> number;
@@ -44,7 +44,7 @@ template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &confi
throw;
}
- // TODO: force/check same vocab.
+ // TODO: force/check same vocab.
}
template <class Model> LowerRestBuild<Model>::~LowerRestBuild() {
diff --git a/lm/value_build.hh b/lm/value_build.hh
index 6fd26ef8f..49989ab42 100644
--- a/lm/value_build.hh
+++ b/lm/value_build.hh
@@ -57,7 +57,7 @@ class MaxRestBuild {
return true;
}
- // Probing does need to go back to unigram.
+ // Probing does need to go back to unigram.
const static bool kMarkEvenLower = true;
};
diff --git a/lm/virtual_interface.hh b/lm/virtual_interface.hh
index e138ac14e..ea491fbf7 100644
--- a/lm/virtual_interface.hh
+++ b/lm/virtual_interface.hh
@@ -15,16 +15,16 @@ template <class T, class U, class V> class ModelFacade;
/* Vocabulary interface. Call Index(string) and get a word index for use in
* calling Model. It provides faster convenience functions for <s>, </s>, and
- * <unk> although you can also find these using Index.
+ * <unk> although you can also find these using Index.
*
* Some models do not load the mapping from index to string. If you need this,
* check if the model Vocabulary class implements such a function and access it
- * directly.
+ * directly.
*
* The Vocabulary object is always owned by the Model and can be retrieved from
* the Model using BaseVocabulary() for this abstract interface or
* GetVocabulary() for the actual implementation (in which case you'll need the
- * actual implementation of the Model too).
+ * actual implementation of the Model too).
*/
class Vocabulary {
public:
@@ -36,7 +36,7 @@ class Vocabulary {
/* Most implementations allow StringPiece lookups and need only override
* Index(StringPiece). SRI requires null termination and overrides all
- * three methods.
+ * three methods.
*/
virtual WordIndex Index(const StringPiece &str) const = 0;
virtual WordIndex Index(const std::string &str) const {
@@ -47,7 +47,7 @@ class Vocabulary {
}
protected:
- // Call SetSpecial afterward.
+ // Call SetSpecial afterward.
Vocabulary() {}
Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) {
@@ -59,13 +59,13 @@ class Vocabulary {
WordIndex begin_sentence_, end_sentence_, not_found_;
private:
- // Disable copy constructors. They're private and undefined.
+ // Disable copy constructors. They're private and undefined.
// Ersatz boost::noncopyable.
Vocabulary(const Vocabulary &);
Vocabulary &operator=(const Vocabulary &);
};
-/* There are two ways to access a Model.
+/* There are two ways to access a Model.
*
*
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
@@ -90,29 +90,29 @@ class Vocabulary {
* unsigned int Order() const;
*
* NB: In case you're wondering why the model implementation looks like it's
- * missing these methods, see facade.hh.
+ * missing these methods, see facade.hh.
*
* This is the fastest way to use a model and presents a normal State class to
- * be included in a hypothesis state structure.
+ * be included in a hypothesis state structure.
*
*
- * OPTION 2: Use the virtual interface below.
+ * OPTION 2: Use the virtual interface below.
*
- * The virtual interface allow you to decide which Model to use at runtime
+ * The virtual interface allow you to decide which Model to use at runtime
* without templatizing everything on the Model type. However, each Model has
* its own State class, so a single State cannot be efficiently provided (it
* would require using the maximum memory of any Model's State or memory
* allocation with each lookup). This means you become responsible for
- * allocating memory with size StateSize() and passing it to the Score or
- * FullScore functions provided here.
+ * allocating memory with size StateSize() and passing it to the Score or
+ * FullScore functions provided here.
*
* For example, cdec has a std::string containing the entire state of a
* hypothesis. It can reserve StateSize bytes in this string for the model
- * state.
+ * state.
*
* All the State objects are POD, so it's ok to use raw memory for storing
* State.
- * in_state and out_state must not have the same address.
+ * in_state and out_state must not have the same address.
*/
class Model {
public:
@@ -148,7 +148,7 @@ class Model {
unsigned char order_;
- // Disable copy constructors. They're private and undefined.
+ // Disable copy constructors. They're private and undefined.
// Ersatz boost::noncopyable.
Model(const Model &);
Model &operator=(const Model &);
diff --git a/lm/vocab.cc b/lm/vocab.cc
index 4fad78964..f6d834323 100644
--- a/lm/vocab.cc
+++ b/lm/vocab.cc
@@ -20,15 +20,15 @@ namespace ngram {
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len) {
// This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000
- // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
+ // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
return util::MurmurHash64A(str, len, 0);
}
} // namespace detail
namespace {
-// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
+// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
-// Sadly some LMs have <UNK>.
+// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) {
@@ -38,7 +38,7 @@ void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint
util::ReadOrThrow(fd, check_unk, 6);
UTIL_THROW_IF(
memcmp(check_unk, "<unk>", 6),
- FormatLoadException,
+ FormatLoadException,
"Vocabulary words are in the wrong place. This could be because the binary file was built with stale gcc and old kenlm. Stale gcc, including the gcc distributed with RedHat and OS X, has a bug that ignores pragma pack for template-dependent types. New kenlm works around this, so you'll save memory but have to rebuild any binary files using the probing data structure.");
if (!enumerate) return;
enumerate->Add(0, "<unk>");
@@ -58,7 +58,7 @@ void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint
util::ReadOrThrow(fd, &next_char, 1);
buf.push_back(next_char);
}
- // Ok now we have null terminated strings.
+ // Ok now we have null terminated strings.
for (const char *i = buf.data(); i != buf.data() + buf.size();) {
std::size_t length = strlen(i);
enumerate->Add(index++, StringPiece(i, length));
@@ -83,13 +83,13 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) {
- // Lead with the number of entries.
+ // Lead with the number of entries.
return sizeof(uint64_t) + sizeof(uint64_t) * entries;
}
void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config) {
assert(allocated >= Size(entries, config));
- // Leave space for number of entries.
+ // Leave space for number of entries.
begin_ = reinterpret_cast<uint64_t*>(start) + 1;
end_ = begin_;
saw_unk_ = false;
@@ -122,7 +122,7 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {
strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast<const char*>(copied), str.size());
}
++end_;
- // This is 1 + the offset where it was inserted to make room for unk.
+ // This is 1 + the offset where it was inserted to make room for unk.
return end_ - begin_;
}
@@ -133,7 +133,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
util::JointSort(begin_, end_, values);
}
for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
- // <unk> strikes again: +1 here.
+ // <unk> strikes again: +1 here.
enumerate_->Add(i + 1, strings_to_enumerate_[i]);
}
strings_to_enumerate_.clear();
@@ -142,7 +142,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
util::JointSort(begin_, end_, reorder_vocab + 1);
}
SetSpecial(Index("<s>"), Index("</s>"), 0);
- // Save size. Excludes UNK.
+ // Save size. Excludes UNK.
*(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_;
// Includes UNK.
bound_ = end_ - begin_ + 1;
@@ -161,7 +161,7 @@ const unsigned int kProbingVocabularyVersion = 0;
namespace detail {
struct ProbingVocabularyHeader {
- // Lowest unused vocab id. This is also the number of words, including <unk>.
+ // Lowest unused vocab id. This is also the number of words, including <unk>.
unsigned int version;
WordIndex bound;
};
@@ -198,7 +198,7 @@ void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max
WordIndex ProbingVocabulary::Insert(const StringPiece &str) {
uint64_t hashed = detail::HashForVocab(str);
- // Prevent unknown from going into the table.
+ // Prevent unknown from going into the table.
if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
saw_unk_ = true;
return 0;
diff --git a/lm/vocab.hh b/lm/vocab.hh
index d6ae07b83..2659b9ba8 100644
--- a/lm/vocab.hh
+++ b/lm/vocab.hh
@@ -35,7 +35,7 @@ class WriteWordsWrapper : public EnumerateVocab {
WriteWordsWrapper(EnumerateVocab *inner);
~WriteWordsWrapper();
-
+
void Add(WordIndex index, const StringPiece &str);
const std::string &Buffer() const { return buffer_; }
@@ -46,7 +46,7 @@ class WriteWordsWrapper : public EnumerateVocab {
std::string buffer_;
};
-// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
+// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
class SortedVocabulary : public base::Vocabulary {
public:
SortedVocabulary();
@@ -67,7 +67,7 @@ class SortedVocabulary : public base::Vocabulary {
// Size for purposes of file writing
static uint64_t Size(uint64_t entries, const Config &config);
- // Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary.
+ // Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary.
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
@@ -79,7 +79,7 @@ class SortedVocabulary : public base::Vocabulary {
WordIndex Insert(const StringPiece &str);
- // Reorders reorder_vocab so that the IDs are sorted.
+ // Reorders reorder_vocab so that the IDs are sorted.
void FinishedLoading(ProbBackoff *reorder_vocab);
// Trie stores the correct counts including <unk> in the header. If this was previously sized based on a count exluding <unk>, padding with 8 bytes will make it the correct size based on a count including <unk>.
@@ -98,7 +98,7 @@ class SortedVocabulary : public base::Vocabulary {
EnumerateVocab *enumerate_;
- // Actual strings. Used only when loading from ARPA and enumerate_ != NULL
+ // Actual strings. Used only when loading from ARPA and enumerate_ != NULL
util::Pool string_backing_;
std::vector<StringPiece> strings_to_enumerate_;
@@ -123,7 +123,7 @@ struct ProbingVocabularyEntry {
};
#pragma pack(pop)
-// Vocabulary storing a map from uint64_t to WordIndex.
+// Vocabulary storing a map from uint64_t to WordIndex.
class ProbingVocabulary : public base::Vocabulary {
public:
ProbingVocabulary();
@@ -137,7 +137,7 @@ class ProbingVocabulary : public base::Vocabulary {
// This just unwraps Config to get the probing_multiplier.
static uint64_t Size(uint64_t entries, const Config &config);
- // Vocab words are [0, Bound()).
+ // Vocab words are [0, Bound()).
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
diff --git a/lm/weights.hh b/lm/weights.hh
index da1963d83..f14312753 100644
--- a/lm/weights.hh
+++ b/lm/weights.hh
@@ -1,13 +1,13 @@
#ifndef LM_WEIGHTS_H
#define LM_WEIGHTS_H
-// Weights for n-grams. Probability and possibly a backoff.
+// Weights for n-grams. Probability and possibly a backoff.
namespace lm {
struct Prob {
float prob;
};
-// No inheritance so this will be a POD.
+// No inheritance so this will be a POD.
struct ProbBackoff {
float prob;
float backoff;
diff --git a/lm/wrappers/nplm.cc b/lm/wrappers/nplm.cc
index edc7b5b72..9bd7c1ed8 100644
--- a/lm/wrappers/nplm.cc
+++ b/lm/wrappers/nplm.cc
@@ -10,7 +10,7 @@
namespace lm {
namespace np {
-Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
+Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
: base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
vocab_(vocab), null_word_(vocab.lookup_word("<null>")) {}
@@ -60,7 +60,7 @@ nplm::neuralLM *LoadNPLM(const std::string &file) {
}
} // namespace
-Model::Model(const std::string &file, std::size_t cache)
+Model::Model(const std::string &file, std::size_t cache)
: base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile.");
// log10 compatible with backoff models.
diff --git a/lm/wrappers/nplm.hh b/lm/wrappers/nplm.hh
index 416281de2..82b38fdd7 100644
--- a/lm/wrappers/nplm.hh
+++ b/lm/wrappers/nplm.hh
@@ -9,7 +9,7 @@
#include <boost/scoped_ptr.hpp>
/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
- * and Victoria Fossum."
+ * and Victoria Fossum."
* http://nlg.isi.edu/software/nplm/
*/
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 49c1239e5..98f6c8399 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -17,6 +17,7 @@
#include "util/exception.hh"
#include "util/file_piece.hh"
+#include "util/random.hh"
#include "util/tokenize_piece.hh"
#include "util/string_piece.hh"
#include "FeatureDataIterator.h"
@@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
} else {
//create shards by randomly sampling
for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
- shard_contents.push_back(rand() % data_size);
+ shard_contents.push_back(util::rand_excl(data_size));
}
}
diff --git a/mert/ForestRescoreTest.cpp b/mert/ForestRescoreTest.cpp
index 23668ab20..f1a1c8423 100644
--- a/mert/ForestRescoreTest.cpp
+++ b/mert/ForestRescoreTest.cpp
@@ -248,7 +248,7 @@ BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) {
Vocab vocab;
//References
ReferenceSet references;
- references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab);
+ references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab);
//Load the hypergraph
Graph graph(vocab);
util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz"));
diff --git a/mert/Point.cpp b/mert/Point.cpp
index 55dc6a6b2..681d3ab3e 100644
--- a/mert/Point.cpp
+++ b/mert/Point.cpp
@@ -3,6 +3,7 @@
#include <cmath>
#include <cstdlib>
#include "util/exception.hh"
+#include "util/random.hh"
#include "FeatureStats.h"
#include "Optimizer.h"
@@ -57,10 +58,8 @@ void Point::Randomize()
UTIL_THROW_IF(m_min.size() != Point::m_dim, util::Exception, "Error");
UTIL_THROW_IF(m_max.size() != Point::m_dim, util::Exception, "Error");
- for (unsigned int i = 0; i < size(); i++) {
- operator[](i) = m_min[i] +
- static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
- }
+ for (unsigned int i = 0; i < size(); i++)
+ operator[](i) = util::rand_incl(m_min[i], m_max[i]);
}
double Point::operator*(const FeatureStats& F) const
diff --git a/mert/TER/bestShiftStruct.cpp b/mert/TER/bestShiftStruct.cpp
index 0ba3071a7..c8d44e3e3 100644
--- a/mert/TER/bestShiftStruct.cpp
+++ b/mert/TER/bestShiftStruct.cpp
@@ -53,6 +53,7 @@ string bestShiftStruct::toString()
s << m_best_shift->toString() << endl;
s << m_best_align->toString() << endl;
// s << (*m_empty) << endl;
+ return s.str();
}
bool bestShiftStruct::getEmpty()
{
diff --git a/mert/TODO b/mert/TODO
index 21b4ce04e..4ceb628d3 100644
--- a/mert/TODO
+++ b/mert/TODO
@@ -5,11 +5,8 @@
- check that --pairwise-ranked is compatible with all optimization metrics
-- Replace the standard rand() currently used in MERT and PRO with better
- random generators such as Boost's random generators (e.g., boost::mt19937).
- - create a Random class to hide the details, i.e., how to generate
- random numbers, which allows us to use custom random generators more
- easily.
+- Use better random generators in util/random.cc, e.g. boost::mt19937.
+ - Support plugging of custom random generators.
Pros:
- In MERT, you might want to use the random restarting technique to avoid
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index 026abf397..59ffaf3cd 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -16,6 +16,7 @@
#include "Timer.h"
#include "Util.h"
#include "Data.h"
+#include "util/random.hh"
using namespace std;
using namespace MosesTuning;
@@ -94,7 +95,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
for (int i = 0; i < bootstrap; ++i) {
ScoreData scoredata(g_scorer);
for (int j = 0; j < n; ++j) {
- int randomIndex = random() % n;
+ const int randomIndex = util::rand_excl(n);
scoredata.add(entries[randomIndex], j);
}
g_scorer->setScoreData(&scoredata);
@@ -284,10 +285,10 @@ void InitSeed(const ProgramOption *opt)
{
if (opt->has_seed) {
cerr << "Seeding random numbers with " << opt->seed << endl;
- srandom(opt->seed);
+ util::rand_init(opt->seed);
} else {
cerr << "Seeding random numbers with system clock " << endl;
- srandom(time(NULL));
+ util::rand_init();
}
}
diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp
index 5a119e875..092176984 100644
--- a/mert/kbmira.cpp
+++ b/mert/kbmira.cpp
@@ -40,6 +40,7 @@ de recherches du Canada
#include <boost/scoped_ptr.hpp>
#include "util/exception.hh"
+#include "util/random.hh"
#include "BleuScorer.h"
#include "HopeFearDecoder.h"
@@ -122,10 +123,10 @@ int main(int argc, char** argv)
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
- srand(seed);
+ util::rand_init(seed);
} else {
cerr << "Initialising random seed from system clock" << endl;
- srand(time(NULL));
+ util::rand_init();
}
// Initialize weights
diff --git a/mert/mert.cpp b/mert/mert.cpp
index 275aa7b09..82b4cc34d 100644
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@@ -24,6 +24,7 @@
#include "Types.h"
#include "Timer.h"
#include "Util.h"
+#include "util/random.hh"
#include "moses/ThreadPool.h"
@@ -289,10 +290,10 @@ int main(int argc, char **argv)
if (option.has_seed) {
cerr << "Seeding random numbers with " << option.seed << endl;
- srandom(option.seed);
+ util::rand_init(option.seed);
} else {
cerr << "Seeding random numbers with system clock " << endl;
- srandom(time(NULL));
+ util::rand_init();
}
if (option.sparse_weights_file.size()) ++option.pdim;
diff --git a/mert/pro.cpp b/mert/pro.cpp
index 7660fe7d0..c0f9f7b57 100644
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ScoreDataIterator.h"
#include "BleuScorer.h"
#include "Util.h"
+#include "util/random.hh"
using namespace std;
using namespace MosesTuning;
@@ -141,10 +142,10 @@ int main(int argc, char** argv)
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
- srand(seed);
+ util::rand_init(seed);
} else {
cerr << "Initialising random seed from system clock" << endl;
- srand(time(NULL));
+ util::rand_init();
}
if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
@@ -211,11 +212,11 @@ int main(int argc, char** argv)
vector<float> scores;
size_t n_translations = hypotheses.size();
for(size_t i=0; i<n_candidates; i++) {
- size_t rand1 = rand() % n_translations;
+ size_t rand1 = util::rand_excl(n_translations);
pair<size_t,size_t> translation1 = hypotheses[rand1];
float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);
- size_t rand2 = rand() % n_translations;
+ size_t rand2 = util::rand_excl(n_translations);
pair<size_t,size_t> translation2 = hypotheses[rand2];
float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);
diff --git a/misc/merge-sorted.cc b/misc/merge-sorted.cc
index ae693215b..4550a491d 100644
--- a/misc/merge-sorted.cc
+++ b/misc/merge-sorted.cc
@@ -19,10 +19,10 @@ class Part
string my_lines[2];
size_t ctr;
public:
- string const& line() const
- {
+ string const& line() const
+ {
static string empty_line;
- return f ? my_lines[ctr%2] : empty_line;
+ return f ? my_lines[ctr%2] : empty_line;
}
Part(string _fname) : ctr(0)
@@ -32,7 +32,7 @@ public:
if (!getline(*f, my_lines[0])) f.reset();
}
- bool next()
+ bool next()
{
if (!f) return false;
if (!getline(*f, my_lines[++ctr%2]))
@@ -45,16 +45,16 @@ public:
return true;
}
- bool operator <(Part const& other) const
+ bool operator <(Part const& other) const
{ return line() < other.line(); }
- bool operator <=(Part const& other) const
+ bool operator <=(Part const& other) const
{ return line() <= other.line(); }
- bool operator >(Part const& other) const
+ bool operator >(Part const& other) const
{ return line() > other.line(); }
- bool operator >=(Part const& other) const
+ bool operator >=(Part const& other) const
{ return line() >= other.line(); }
bool go(ostream& out)
@@ -66,20 +66,20 @@ public:
out << fname << "-" << ctr - 1 << "-";
out << my_lines[(ctr - 1)%2] << endl;
}
- do
+ do
{
out << fname << " " << ctr << " ";
out << line() << "\n";
}
while (next() && my_lines[0] == my_lines[1]);
#else
- do { out << line() << "\n"; }
+ do { out << line() << "\n"; }
while (next() && my_lines[0] == my_lines[1]);
out.flush();
#endif
return f != NULL;
}
-
+
};
diff --git a/misc/pmoses/pmoses.cc b/misc/pmoses/pmoses.cc
index 8b8134adc..caf66cee5 100644
--- a/misc/pmoses/pmoses.cc
+++ b/misc/pmoses/pmoses.cc
@@ -33,25 +33,25 @@ using namespace Moses;
//Delete white spaces from the end and the begining of the string
string trim(string str) {
string::iterator it;
-
+
while ((str.length()>0)&&((*(it=str.begin()))==' ')) {
str.erase(it);
}
-
+
while ((str.length()>0)&&((*(it=(str.end()-1)))==' ')) {
str.erase(it);
}
-
+
for(unsigned i=0; i<str.length(); i++) {
if ((str[i]==' ') && ((i+1)<str.length()) && (str[i+1]==' ')) {
str=str.erase(i,1);
i--;
}
}
-
+
return str;
}
-
+
int main (int argc, char *argv[]) {
vector<FactorType> input, output;
@@ -64,12 +64,12 @@ int main (int argc, char *argv[]) {
input.push_back(0);
output.push_back(0);
-
+
+ weight.push_back(0);
weight.push_back(0);
weight.push_back(0);
weight.push_back(0);
weight.push_back(0);
- weight.push_back(0);
if (argc<3) {
cerr<<"Error: Wrong number of parameters."<<endl;
@@ -86,19 +86,19 @@ int main (int argc, char *argv[]) {
}
cerr<<"numScoreComponent: "<<numScoreComponent<<endl;
- cerr<<"numInputScores: "<<numInputScores<<endl;
+ cerr<<"numInputScores: "<<numInputScores<<endl;
PhraseDictionaryTreeAdaptor *pd=new PhraseDictionaryTreeAdaptor(numScoreComponent, numInputScores);
-
+
cerr<<"Table limit: "<<tableLimit<<endl;
cerr<<"WeightWordPenalty: "<<weightWP<<endl;
cerr<<"Source phrase: ___"<<source_str<<"___"<<endl;
-
+
if (!pd->Load(input, output, filePath, weight, tableLimit, lmList, weightWP)) {
delete pd;
return false;
}
-
+
cerr<<"-------------------------------------------------"<<endl;
FactorDirection direction;
Phrase phrase(direction);
@@ -106,15 +106,15 @@ int main (int argc, char *argv[]) {
phrase.CreateFromString(input, source_str, "|");
TargetPhraseCollection *tpc = (TargetPhraseCollection*) pd->GetTargetPhraseCollection(phrase);
- if (tpc == NULL)
+ if (tpc == NULL)
cerr<<"Not found."<<endl;
- else {
+ else {
TargetPhraseCollection::iterator iterTargetPhrase;
for (iterTargetPhrase = tpc->begin(); iterTargetPhrase != tpc->end(); ++iterTargetPhrase) {
//cerr<<(*(*iterTargetPhrase))<<endl;
-
+
stringstream strs;
- strs<<static_cast<const Phrase&>(*(*iterTargetPhrase));
+ strs<<static_cast<const Phrase&>(*(*iterTargetPhrase));
cerr<<source_str<<" => ___"<<trim(strs.str())<<"___ ";
ScoreComponentCollection scc = (*iterTargetPhrase)->GetScoreBreakdown();
cerr<<"Scores: ";
@@ -123,6 +123,6 @@ int main (int argc, char *argv[]) {
}
cerr<<endl;
}
- }
- cerr<<"-------------------------------------------------"<<endl;
+ }
+ cerr<<"-------------------------------------------------"<<endl;
}
diff --git a/moses-cmd/LatticeMBRGrid.cpp b/moses-cmd/LatticeMBRGrid.cpp
index edb4176c7..f842b1136 100644
--- a/moses-cmd/LatticeMBRGrid.cpp
+++ b/moses-cmd/LatticeMBRGrid.cpp
@@ -53,6 +53,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include "util/exception.hh"
#include <boost/foreach.hpp>
+#include "moses/TranslationTask.h"
using namespace std;
using namespace Moses;
@@ -175,10 +176,13 @@ int main(int argc, char* argv[])
const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
- for (boost::shared_ptr<InputType> source = ioWrapper->ReadInput();
- source != NULL; source = ioWrapper->ReadInput())
+ boost::shared_ptr<InputType> source;
+ while((source = ioWrapper->ReadInput()) != NULL)
{
- Manager manager(*source);
+ // set up task of translating one sentence
+ boost::shared_ptr<TranslationTask> ttask;
+ ttask = TranslationTask::create(source, ioWrapper);
+ Manager manager(ttask);
manager.Decode();
TrellisPathList nBestList;
manager.CalcNBest(nBestSize, nBestList,true);
diff --git a/moses-cmd/MainVW.cpp b/moses-cmd/MainVW.cpp
index 00df3df80..302866733 100644
--- a/moses-cmd/MainVW.cpp
+++ b/moses-cmd/MainVW.cpp
@@ -45,6 +45,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
#include "moses/TrainingTask.h"
+#include "util/random.hh"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@@ -117,7 +118,7 @@ int main(int argc, char** argv)
//initialise random numbers
- srand(time(NULL));
+ util::rand_init();
// set up read/writing class
IFVERBOSE(1) {
@@ -153,8 +154,8 @@ int main(int argc, char** argv)
FeatureFunction::CallChangeSource(foo);
// set up task of training one sentence
- boost::shared_ptr<TrainingTask>
- task(new TrainingTask(source.get(), *ioWrapper));
+ boost::shared_ptr<TrainingTask> task;
+ task = TrainingTask::create(source, ioWrapper);
// execute task
#ifdef WITH_THREADS
diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp
index 2c57e8336..a89bb848a 100644
--- a/moses/BaseManager.cpp
+++ b/moses/BaseManager.cpp
@@ -4,11 +4,25 @@
#include "BaseManager.h"
#include "moses/FF/StatelessFeatureFunction.h"
#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/TranslationTask.h"
using namespace std;
namespace Moses
{
+
+BaseManager::BaseManager(ttasksptr const& ttask)
+ : m_ttask(ttask), m_source(*(ttask->GetSource().get()))
+{ }
+
+const InputType&
+BaseManager::GetSource() const
+{ return m_source; }
+
+
+
+
+
/***
* print surface factor only for the given phrase
*/
diff --git a/moses/BaseManager.h b/moses/BaseManager.h
index c0b6d22c1..d7a25e7fd 100644
--- a/moses/BaseManager.h
+++ b/moses/BaseManager.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
#pragma once
#include <iostream>
@@ -14,11 +15,11 @@ class OutputCollector;
class BaseManager
{
protected:
- const InputType &m_source; /**< source sentence to be translated */
+ // const InputType &m_source; /**< source sentence to be translated */
+ ttaskwptr m_ttask;
+ InputType const& m_source;
- BaseManager(const InputType &source)
- :m_source(source) {
- }
+ BaseManager(ttasksptr const& ttask);
// output
typedef std::vector<std::pair<Moses::Word, Moses::WordsRange> > ApplicationContext;
@@ -45,13 +46,10 @@ protected:
}
public:
- virtual ~BaseManager() {
- }
+ virtual ~BaseManager() { }
//! the input sentence being decoded
- const InputType& GetSource() const {
- return m_source;
- }
+ const InputType& GetSource() const;
virtual void Decode() = 0;
// outputs
diff --git a/moses/ChartCellCollection.cpp b/moses/ChartCellCollection.cpp
index 46392261d..55c50a449 100644
--- a/moses/ChartCellCollection.cpp
+++ b/moses/ChartCellCollection.cpp
@@ -22,6 +22,7 @@
#include "ChartCellCollection.h"
#include "InputType.h"
#include "WordsRange.h"
+#include "ChartManager.h"
namespace Moses
{
@@ -51,7 +52,7 @@ private:
\param manager reference back to the manager
*/
ChartCellCollection::ChartCellCollection(const InputType &input, ChartManager &manager)
- :ChartCellCollectionBase(input, CubeCellFactory(manager)) {}
+ :ChartCellCollectionBase(input, CubeCellFactory(manager), manager.GetParser()) {}
} // namespace
diff --git a/moses/ChartCellCollection.h b/moses/ChartCellCollection.h
index 1edeb4450..5945ce12a 100644
--- a/moses/ChartCellCollection.h
+++ b/moses/ChartCellCollection.h
@@ -30,12 +30,15 @@ namespace Moses
{
class InputType;
class ChartManager;
+class ChartParser;
class ChartCellCollectionBase
{
public:
- template <class Factory> ChartCellCollectionBase(const InputType &input, const Factory &factory) :
- m_cells(input.GetSize()) {
+ template <class Factory> ChartCellCollectionBase(const InputType &input,
+ const Factory &factory,
+ const ChartParser &parser)
+ :m_cells(input.GetSize()) {
size_t size = input.GetSize();
for (size_t startPos = 0; startPos < size; ++startPos) {
diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp
index d6846d6df..a0b39167a 100644
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@@ -39,21 +39,21 @@ using namespace std;
namespace Moses
{
+
extern bool g_mosesDebug;
/* constructor. Initialize everything prior to decoding a particular sentence.
* \param source the sentence to be decoded
* \param system which particular set of models to use.
*/
-ChartManager::ChartManager(InputType const& source)
- :BaseManager(source)
- ,m_hypoStackColl(source, *this)
- ,m_start(clock())
- ,m_hypothesisId(0)
- ,m_parser(source, m_hypoStackColl)
- ,m_translationOptionList(StaticData::Instance().GetRuleLimit(), source)
-{
-}
+ChartManager::ChartManager(ttasksptr const& ttask)
+ : BaseManager(ttask)
+ , m_hypoStackColl(m_source, *this)
+ , m_start(clock())
+ , m_hypothesisId(0)
+ , m_parser(ttask, m_hypoStackColl)
+ , m_translationOptionList(StaticData::Instance().GetRuleLimit(), m_source)
+{ }
ChartManager::~ChartManager()
{
@@ -67,6 +67,7 @@ ChartManager::~ChartManager()
//! decode the sentence. This contains the main laps. Basically, the CKY++ algorithm
void ChartManager::Decode()
{
+
VERBOSE(1,"Translating: " << m_source << endl);
ResetSentenceStats(m_source);
diff --git a/moses/ChartManager.h b/moses/ChartManager.h
index 745a792cb..bf5851806 100644
--- a/moses/ChartManager.h
+++ b/moses/ChartManager.h
@@ -33,8 +33,6 @@
#include "BaseManager.h"
#include "moses/Syntax/KBestExtractor.h"
-#include <boost/shared_ptr.hpp>
-
namespace Moses
{
@@ -103,7 +101,7 @@ private:
void Backtrack(const ChartHypothesis *hypo) const;
public:
- ChartManager(InputType const& source);
+ ChartManager(ttasksptr const& ttask);
~ChartManager();
void Decode();
void AddXmlChartOptions();
diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp
index 5281bafdf..66e22a055 100644
--- a/moses/ChartParser.cpp
+++ b/moses/ChartParser.cpp
@@ -28,6 +28,7 @@
#include "DecodeGraph.h"
#include "moses/FF/UnknownWordPenaltyProducer.h"
#include "moses/TranslationModel/PhraseDictionary.h"
+#include "moses/TranslationTask.h"
using namespace std;
using namespace Moses;
@@ -35,7 +36,10 @@ using namespace Moses;
namespace Moses
{
-ChartParserUnknown::ChartParserUnknown() {}
+ChartParserUnknown
+::ChartParserUnknown(ttasksptr const& ttask)
+ : m_ttask(ttask)
+{ }
ChartParserUnknown::~ChartParserUnknown()
{
@@ -136,13 +140,16 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
}
}
-ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells) :
- m_decodeGraphList(StaticData::Instance().GetDecodeGraphs()),
- m_source(source)
+ChartParser
+::ChartParser(ttasksptr const& ttask, ChartCellCollectionBase &cells)
+ : m_ttask(ttask)
+ , m_unknown(ttask)
+ , m_decodeGraphList(StaticData::Instance().GetDecodeGraphs())
+ , m_source(*(ttask->GetSource().get()))
{
const StaticData &staticData = StaticData::Instance();
- staticData.InitializeForInput(source);
+ staticData.InitializeForInput(ttask);
CreateInputPaths(m_source);
const std::vector<PhraseDictionary*> &dictionaries = PhraseDictionary::GetColl();
@@ -161,7 +168,7 @@ ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells
ChartParser::~ChartParser()
{
RemoveAllInColl(m_ruleLookupManagers);
- StaticData::Instance().CleanUpAfterSentenceProcessing(m_source);
+ StaticData::Instance().CleanUpAfterSentenceProcessing(m_ttask.lock());
InputPathMatrix::const_iterator iterOuter;
for (iterOuter = m_inputPathMatrix.begin(); iterOuter != m_inputPathMatrix.end(); ++iterOuter) {
diff --git a/moses/ChartParser.h b/moses/ChartParser.h
index e438cf8ad..372a05f60 100644
--- a/moses/ChartParser.h
+++ b/moses/ChartParser.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
// $Id$
// vim:tabstop=2
/***********************************************************************
@@ -42,8 +43,9 @@ class DecodeGraph;
class ChartParserUnknown
{
+ ttaskwptr m_ttask;
public:
- ChartParserUnknown();
+ ChartParserUnknown(ttasksptr const& ttask);
~ChartParserUnknown();
void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to);
@@ -59,8 +61,9 @@ private:
class ChartParser
{
+ ttaskwptr m_ttask;
public:
- ChartParser(const InputType &source, ChartCellCollectionBase &cells);
+ ChartParser(ttasksptr const& ttask, ChartCellCollectionBase &cells);
~ChartParser();
void Create(const WordsRange &range, ChartParserCallback &to);
diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp
index e8a54a061..e305a4147 100644
--- a/moses/ConfusionNet.cpp
+++ b/moses/ConfusionNet.cpp
@@ -291,7 +291,7 @@ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
TranslationOptionCollection*
ConfusionNet::
-CreateTranslationOptionCollection() const
+CreateTranslationOptionCollection(ttasksptr const& ttask) const
{
size_t maxNoTransOptPerCoverage
= StaticData::Instance().GetMaxNoTransOptPerCoverage();
@@ -299,7 +299,7 @@ CreateTranslationOptionCollection() const
= StaticData::Instance().GetTranslationOptionThreshold();
TranslationOptionCollection *rv
= new TranslationOptionCollectionConfusionNet
- (*this, maxNoTransOptPerCoverage, translationOptionThreshold);
+ (ttask, *this, maxNoTransOptPerCoverage, translationOptionThreshold);
assert(rv);
return rv;
}
diff --git a/moses/ConfusionNet.h b/moses/ConfusionNet.h
index 06a628c07..e274f2491 100644
--- a/moses/ConfusionNet.h
+++ b/moses/ConfusionNet.h
@@ -73,7 +73,8 @@ public:
std::string GetStringRep(const std::vector<FactorType> factorsToPrint) const; //TODO not defined
const Word& GetWord(size_t pos) const;
- TranslationOptionCollection* CreateTranslationOptionCollection() const;
+ TranslationOptionCollection*
+ CreateTranslationOptionCollection(ttasksptr const& ttask) const;
const NonTerminalSet &GetLabelSet(size_t /*startPos*/, size_t /*endPos*/) const {
return m_defaultLabelSet;
diff --git a/moses/ContextScope.h b/moses/ContextScope.h
new file mode 100644
index 000000000..ed9f854ff
--- /dev/null
+++ b/moses/ContextScope.h
@@ -0,0 +1,97 @@
+// -*- c++ -*-
+// A class to store "local" information (such as task-specific caches).
+// The idea is for each translation task to have a scope, which stores
+// shared pointers to task-specific objects such as caches and priors.
+// Since these objects are referenced via shared pointers, scopes can
+// share information.
+#pragma once
+
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#include <boost/thread/locks.hpp>
+#include <boost/foreach.hpp>
+#endif
+
+#include <map>
+#include <boost/shared_ptr.hpp>
+// #include "thread_safe_container.h"
+
+namespace Moses
+{
+ class ContextScope
+ {
+ protected:
+ typedef std::map<void const*, boost::shared_ptr<void> > scratchpad_t;
+ typedef scratchpad_t::iterator iter_t;
+ typedef scratchpad_t::value_type entry_t;
+ typedef scratchpad_t::const_iterator const_iter_t;
+ scratchpad_t m_scratchpad;
+ mutable boost::shared_mutex m_lock;
+ public:
+ // class write_access
+ // {
+ // boost::unique_lock<boost::shared_mutex> m_lock;
+ // public:
+
+ // write_access(boost::shared_mutex& lock)
+ // : m_lock(lock)
+ // { }
+
+ // write_access(write_access& other)
+ // {
+ // swap(m_lock, other.m_lock);
+ // }
+ // };
+
+ // write_access lock() const
+ // {
+ // return write_access(m_lock);
+ // }
+
+ template<typename T>
+ boost::shared_ptr<void> const&
+ set(void const* const key, boost::shared_ptr<T> const& val)
+ {
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
+ return (m_scratchpad[key] = val);
+ }
+
+ template<typename T>
+ boost::shared_ptr<T> const
+ get(void const* key, bool CreateNewIfNecessary=false)
+ {
+ using boost::shared_mutex;
+ using boost::upgrade_lock;
+ // T const* key = reinterpret_cast<T const*>(xkey);
+ upgrade_lock<shared_mutex> lock(m_lock);
+ iter_t m = m_scratchpad.find(key);
+ boost::shared_ptr< T > ret;
+ if (m != m_scratchpad.end())
+ {
+ if (m->second == NULL && CreateNewIfNecessary)
+ {
+ boost::upgrade_to_unique_lock<shared_mutex> xlock(lock);
+ m->second.reset(new T);
+ }
+ ret = boost::static_pointer_cast< T >(m->second);
+ return ret;
+ }
+ if (!CreateNewIfNecessary) return ret;
+ boost::upgrade_to_unique_lock<shared_mutex> xlock(lock);
+ ret.reset(new T);
+ m_scratchpad[key] = ret;
+ return ret;
+ }
+
+ ContextScope() { }
+
+ ContextScope(ContextScope const& other)
+ {
+ boost::unique_lock<boost::shared_mutex> lock1(this->m_lock);
+ boost::unique_lock<boost::shared_mutex> lock2(other.m_lock);
+ m_scratchpad = other.m_scratchpad;
+ }
+
+ };
+
+};
diff --git a/moses/DecodeStepTranslation.cpp b/moses/DecodeStepTranslation.cpp
index 3692a68ea..7ea26f8a5 100644
--- a/moses/DecodeStepTranslation.cpp
+++ b/moses/DecodeStepTranslation.cpp
@@ -215,15 +215,15 @@ const InputPath &DecodeStepTranslation::GetInputPathLEGACY(
UTIL_THROW(util::Exception, "Input path not found");
}
-void
+void
DecodeStepTranslation::
ProcessLEGACY(TranslationOption const& in,
- DecodeStep const& decodeStep,
- PartialTranslOptColl &out,
- TranslationOptionCollection *toc,
+ DecodeStep const& decodeStep,
+ PartialTranslOptColl &out,
+ TranslationOptionCollection *toc,
bool adhereTableLimit) const
{
- if (in.GetTargetPhrase().GetSize() == 0)
+ if (in.GetTargetPhrase().GetSize() == 0)
{
// word deletion
out.Add(new TranslationOption(in));
@@ -240,35 +240,35 @@ ProcessLEGACY(TranslationOption const& in,
TargetPhraseCollectionWithSourcePhrase const* phraseColl;
phraseColl = pdict->GetTargetPhraseCollectionLEGACY(toc->GetSource(),srcRange);
-
- if (phraseColl != NULL)
+
+ if (phraseColl != NULL)
{
TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
iterEnd = ((adhereTableLimit && tableLimit && phraseColl->GetSize() >= tableLimit)
- ? phraseColl->begin() + tableLimit : phraseColl->end());
-
- for (iterTargetPhrase = phraseColl->begin();
- iterTargetPhrase != iterEnd;
- ++iterTargetPhrase)
+ ? phraseColl->begin() + tableLimit : phraseColl->end());
+
+ for (iterTargetPhrase = phraseColl->begin();
+ iterTargetPhrase != iterEnd;
+ ++iterTargetPhrase)
{
TargetPhrase const& targetPhrase = **iterTargetPhrase;
if (targetPhrase.GetSize() != currSize ||
(IsFilteringStep() && !in.IsCompatible(targetPhrase, m_conflictFactors)))
continue;
-
+
TargetPhrase outPhrase(inPhrase);
outPhrase.Merge(targetPhrase, m_newOutputFactors);
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
-
+
TranslationOption *newTransOpt = new TranslationOption(srcRange, outPhrase);
assert(newTransOpt != NULL);
newTransOpt->SetInputPath(inputPath);
-
+
out.Add(newTransOpt);
-
+
}
- }
+ }
}
}
diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp
index 00850329e..27f757b5c 100644
--- a/moses/ExportInterface.cpp
+++ b/moses/ExportInterface.cpp
@@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <sstream>
#include <vector>
+#include "util/random.hh"
#include "util/usage.hh"
#ifdef WIN32
@@ -91,7 +92,7 @@ SimpleTranslationInterface::SimpleTranslationInterface(const string &mosesIni):
exit(1);
}
- srand(time(NULL));
+ util::rand_init();
}
@@ -118,10 +119,10 @@ string SimpleTranslationInterface::translate(const string &inputString)
FeatureFunction::CallChangeSource(&*source);
// set up task of translating one sentence
- boost::shared_ptr<TranslationTask> task
+ boost::shared_ptr<TranslationTask> task
= TranslationTask::create(source, ioWrapper);
task->Run();
-
+
string output = outputStream.str();
//now trim the end whitespace
const string whitespace = " \t\f\v\n\r";
@@ -151,28 +152,28 @@ run_as_server()
string logfile; params.SetParameter(logfile, "server-log", string(""));
size_t num_threads; params.SetParameter(num_threads, "threads", size_t(10));
if (isSerial) VERBOSE(1,"Running server in serial mode." << endl);
-
+
xmlrpc_c::registry myRegistry;
-
+
xmlrpc_c::methodPtr const translator(new MosesServer::Translator(num_threads));
xmlrpc_c::methodPtr const updater(new MosesServer::Updater);
xmlrpc_c::methodPtr const optimizer(new MosesServer::Optimizer);
-
+
myRegistry.addMethod("translate", translator);
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
-
+
xmlrpc_c::serverAbyss myAbyssServer(myRegistry, port, logfile);
-
+
XVERBOSE(1,"Listening on port " << port << endl);
- if (isSerial) { while(1) myAbyssServer.runOnce(); }
+ if (isSerial) { while(1) myAbyssServer.runOnce(); }
else myAbyssServer.run();
std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl;
- // #pragma message("BUILDING MOSES WITH SERVER SUPPORT")
+ // #pragma message("BUILDING MOSES WITH SERVER SUPPORT")
#else
- // #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT")
- std::cerr << "Moses was compiled without server support." << endl;
+ // #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT")
+ std::cerr << "Moses was compiled without server support." << endl;
#endif
return 1;
@@ -185,18 +186,18 @@ batch_run()
const StaticData& staticData = StaticData::Instance();
//initialise random numbers
- srand(time(NULL));
+ util::rand_init();
IFVERBOSE(1) PrintUserTime("Created input-output object");
-
+
// set up read/writing class:
- boost::shared_ptr<IOWrapper> ioWrapper(new IOWrapper);
+ boost::shared_ptr<IOWrapper> ioWrapper(new IOWrapper);
UTIL_THROW_IF2(ioWrapper == NULL, "Error; Failed to create IO object"
<< " [" << HERE << "]");
-
+
// check on weights
const ScoreComponentCollection& weights = staticData.GetAllWeights();
- IFVERBOSE(2)
+ IFVERBOSE(2)
{
TRACE_ERR("The global weight vector looks like this: ");
TRACE_ERR(weights);
@@ -207,20 +208,24 @@ batch_run()
ThreadPool pool(staticData.ThreadCount());
#endif
+ std::string context_string;
+ params.SetParameter(context_string,"context-string",string(""));
+
// main loop over set of input sentences
boost::shared_ptr<InputType> source;
while ((source = ioWrapper->ReadInput()) != NULL)
{
IFVERBOSE(1) ResetUserTime();
-
+
FeatureFunction::CallChangeSource(source.get());
-
+
// set up task of translating one sentence
boost::shared_ptr<TranslationTask>
task = TranslationTask::create(source, ioWrapper);
+ task->SetContextString(context_string);
- // Allow for (sentence-)context-specific processing prior to
+ // Allow for (sentence-)context-specific processing prior to
// decoding. This can be used, for example, for context-sensitive
// phrase lookup.
FeatureFunction::SetupAll(*task);
@@ -231,7 +236,7 @@ batch_run()
// simulated post-editing requires threads (within the dynamic phrase tables)
// but runs all sentences serially, to allow updating of the bitext.
bool spe = params.isParamSpecified("spe-src");
- if (spe)
+ if (spe)
{
// simulated post-editing: always run single-threaded!
task->Run();
@@ -242,7 +247,7 @@ batch_run()
<< "missing update data for simulated post-editing.");
UTIL_THROW_IF2(!getline(*ioWrapper->spe_aln,aln), "[" << HERE << "] "
<< "missing update data for simulated post-editing.");
- BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
+ BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
{
Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
if (sapt) sapt->add(src,trg,aln);
@@ -250,7 +255,7 @@ batch_run()
VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
}
- }
+ }
else pool.Submit(task);
#else
pool.Submit(task);
@@ -260,7 +265,7 @@ batch_run()
task->Run();
#endif
}
-
+
// we are done, finishing up
#ifdef WITH_THREADS
pool.Stop(true); //flush remaining jobs
@@ -269,7 +274,7 @@ batch_run()
FeatureFunction::Destroy();
IFVERBOSE(1) util::PrintUsage(std::cerr);
-
+
#ifndef EXIT_RETURN
//This avoids that destructors are called (it can take a long time)
exit(EXIT_SUCCESS);
@@ -281,15 +286,16 @@ batch_run()
/** Called by main function of the command line version of the decoder **/
int decoder_main(int argc, char** argv)
{
- try
+#ifdef NDEBUG
+ try
+#endif
{
-
#ifdef HAVE_PROTOBUF
GOOGLE_PROTOBUF_VERIFY_VERSION;
#endif
-
+
// echo command line, if verbose
- IFVERBOSE(1)
+ IFVERBOSE(1)
{
TRACE_ERR("command: ");
for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
@@ -302,31 +308,33 @@ int decoder_main(int argc, char** argv)
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
- if (!params.LoadParam(argc,argv))
+ if (!params.LoadParam(argc,argv))
exit(1);
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
- if (!StaticData::LoadDataStatic(&params, argv[0]))
+ if (!StaticData::LoadDataStatic(&params, argv[0]))
exit(1);
-
+
// setting "-show-weights" -> just dump out weights and exit
- if (params.isParamSpecified("show-weights"))
+ if (params.isParamSpecified("show-weights"))
{
ShowWeights();
exit(0);
}
-
+
if (params.GetParam("server"))
return run_as_server();
else
return batch_run();
-
- }
- catch (const std::exception &e)
+
+ }
+#ifdef NDEBUG
+ catch (const std::exception &e)
{
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
+#endif
}
diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp
index 24887c373..a98964386 100644
--- a/moses/FF/BleuScoreFeature.cpp
+++ b/moses/FF/BleuScoreFeature.cpp
@@ -880,6 +880,7 @@ const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) co
bool BleuScoreFeature::IsUseable(const FactorMask &mask) const
{
+ // TODO: Was this meant to return mask[0]!?
bool ret = mask[0];
return 0;
}
diff --git a/moses/FF/CountNonTerms.cpp b/moses/FF/CountNonTerms.cpp
index 17d1c9c20..07549942e 100644
--- a/moses/FF/CountNonTerms.cpp
+++ b/moses/FF/CountNonTerms.cpp
@@ -8,7 +8,7 @@ using namespace std;
namespace Moses
{
CountNonTerms::CountNonTerms(const std::string &line)
- :StatelessFeatureFunction(line)
+ :StatelessFeatureFunction(line,true)
,m_all(true)
,m_sourceSyntax(false)
,m_targetSyntax(false)
diff --git a/moses/FF/DecodeFeature.cpp b/moses/FF/DecodeFeature.cpp
index 11c8653ea..64b120519 100644
--- a/moses/FF/DecodeFeature.cpp
+++ b/moses/FF/DecodeFeature.cpp
@@ -30,8 +30,9 @@ using namespace std;
namespace Moses
{
-DecodeFeature::DecodeFeature(const std::string &line)
- : StatelessFeatureFunction(line)
+
+DecodeFeature::DecodeFeature(const std::string &line, bool registerNow)
+ : StatelessFeatureFunction(line, registerNow)
, m_container(NULL)
{
VERBOSE(2,"DecodeFeature:" << std::endl);
diff --git a/moses/FF/DecodeFeature.h b/moses/FF/DecodeFeature.h
index 19c9b3161..fdc1460bc 100644
--- a/moses/FF/DecodeFeature.h
+++ b/moses/FF/DecodeFeature.h
@@ -40,7 +40,7 @@ class DecodeFeature : public StatelessFeatureFunction
{
public:
- DecodeFeature(const std::string &line);
+ DecodeFeature(const std::string &line, bool registerNow);
DecodeFeature(size_t numScoreComponents
, const std::string &line);
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 80e4a4243..81c6bdeb9 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -1,3 +1,4 @@
+#include "util/exception.hh"
#include "moses/FF/Factory.h"
#include "moses/StaticData.h"
@@ -146,26 +147,50 @@ protected:
FeatureFactory() {}
};
-template <class F> void FeatureFactory::DefaultSetup(F *feature)
+template <class F>
+void
+FeatureFactory
+::DefaultSetup(F *feature)
{
StaticData &static_data = StaticData::InstanceNonConst();
const string &featureName = feature->GetScoreProducerDescription();
std::vector<float> weights = static_data.GetParameter()->GetWeights(featureName);
- if (feature->IsTuneable() || weights.size()) {
- // if it's tuneable, ini file MUST have weights
- // even it it's not tuneable, people can still set the weights in the ini file
+
+ if (feature->GetNumScoreComponents())
+ {
+ if (weights.size() == 0)
+ {
+ weights = feature->DefaultWeights();
+ if (weights.size() == 0)
+ {
+ TRACE_ERR("WARNING: No weights specified in config file for FF "
+ << featureName << ". This FF does not supply default values.\n"
+ << "WARNING: Auto-initializing all weights for this FF to 1.0");
+ weights.assign(feature->GetNumScoreComponents(),1.0);
+ }
+ else
+ {
+ TRACE_ERR("WARNING: No weights specified in config file for FF "
+ << featureName << ". Using default values supplied by FF.");
+ }
+ }
+ UTIL_THROW_IF2(weights.size() != feature->GetNumScoreComponents(),
+ "FATAL ERROR: Mismatch in number of features and number "
+ << "of weights for Feature Function " << featureName
+ << " (features: " << feature->GetNumScoreComponents()
+ << " vs. weights: " << weights.size() << ")");
+ static_data.SetWeights(feature, weights);
+ }
+ else if (feature->IsTuneable())
static_data.SetWeights(feature, weights);
- } else if (feature->GetNumScoreComponents() > 0) {
- std::vector<float> defaultWeights = feature->DefaultWeights();
- static_data.SetWeights(feature, defaultWeights);
- }
}
namespace
{
-template <class F> class DefaultFeatureFactory : public FeatureFactory
+template <class F>
+class DefaultFeatureFactory : public FeatureFactory
{
public:
void Create(const std::string &line) {
diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp
index 5c4b65729..298a9e65c 100644
--- a/moses/FF/FeatureFunction.cpp
+++ b/moses/FF/FeatureFunction.cpp
@@ -6,6 +6,7 @@
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
#include "moses/TranslationOption.h"
+#include "moses/TranslationTask.h"
#include "moses/Util.h"
#include "moses/FF/DistortionScoreProducer.h"
@@ -37,10 +38,10 @@ void FeatureFunction::Destroy()
RemoveAllInColl(s_staticColl);
}
-// The original declaration as
+// The original declaration as
// void FeatureFunction::CallChangeSource(InputType *&input)
-// had me a bit perplexed. Would you really want to allow
-// any feature function to replace the InputType behind the
+// had me a bit perplexed. Would you really want to allow
+// any feature function to replace the InputType behind the
// back of the others? And change what the vector is pointing to?
void FeatureFunction::CallChangeSource(InputType * const&input)
@@ -58,14 +59,16 @@ void FeatureFunction::SetupAll(TranslationTask const& ttask)
}
FeatureFunction::
-FeatureFunction(const std::string& line)
+FeatureFunction(const std::string& line, bool registerNow)
: m_tuneable(true)
, m_requireSortingAfterSourceContext(false)
, m_verbosity(std::numeric_limits<std::size_t>::max())
, m_numScoreComponents(1)
+ , m_index(0)
{
m_numTuneableComponents = m_numScoreComponents;
- Initialize(line);
+ ParseLine(line);
+ if (registerNow) Register();
}
FeatureFunction::
@@ -75,17 +78,17 @@ FeatureFunction(size_t numScoreComponents,
, m_requireSortingAfterSourceContext(false)
, m_verbosity(std::numeric_limits<std::size_t>::max())
, m_numScoreComponents(numScoreComponents)
+ , m_index(0)
{
m_numTuneableComponents = m_numScoreComponents;
- Initialize(line);
+ ParseLine(line);
+ Register();
}
void
FeatureFunction::
-Initialize(const std::string &line)
+Register()
{
- ParseLine(line);
-
ScoreComponentCollection::RegisterScoreProducer(this);
s_staticColl.push_back(this);
}
@@ -163,7 +166,8 @@ void FeatureFunction::ReadParameters()
std::vector<float> FeatureFunction::DefaultWeights() const
{
- UTIL_THROW2(GetScoreProducerDescription() << ": No default weights");
+ return std::vector<float>(this->m_numScoreComponents,1.0);
+ // UTIL_THROW2(GetScoreProducerDescription() << ": No default weights");
}
void FeatureFunction::SetTuneableComponents(const std::string& value)
@@ -186,5 +190,31 @@ void FeatureFunction::SetTuneableComponents(const std::string& value)
}
}
+void
+FeatureFunction
+::InitializeForInput(ttasksptr const& ttask)
+{ InitializeForInput(*(ttask->GetSource().get())); }
+
+void
+FeatureFunction
+::CleanUpAfterSentenceProcessing(ttasksptr const& ttask)
+{ CleanUpAfterSentenceProcessing(*(ttask->GetSource().get())); }
+
+size_t
+FeatureFunction
+::GetIndex() const
+{ return m_index; }
+
+
+/// set index
+// @return index of the next FF
+size_t
+FeatureFunction
+::SetIndex(size_t const idx)
+{
+ m_index = idx;
+ return this->GetNumScoreComponents() + idx;
+}
+
}
diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h
index c8c78fb0f..a8f189f0b 100644
--- a/moses/FF/FeatureFunction.h
+++ b/moses/FF/FeatureFunction.h
@@ -27,7 +27,7 @@ class FactorMask;
class InputPath;
class StackVec;
class DistortionScoreProducer;
-class TranslationTask;
+class TranslationTask;
/** base class for all feature functions.
*/
@@ -43,12 +43,15 @@ protected:
bool m_requireSortingAfterSourceContext;
size_t m_verbosity;
size_t m_numScoreComponents;
+ size_t m_index; // index into vector covering ALL feature function values
std::vector<bool> m_tuneableComponents;
size_t m_numTuneableComponents;
//In case there's multiple producers with the same description
static std::multiset<std::string> description_counts;
- void Initialize(const std::string &line);
+ void Register();
+private:
+ // void Initialize(const std::string &line);
void ParseLine(const std::string &line);
public:
@@ -62,7 +65,7 @@ public:
static void CallChangeSource(InputType * const&input);
// see my note in FeatureFunction.cpp --- UG
- FeatureFunction(const std::string &line);
+ FeatureFunction(const std::string &line, bool initializeNow);
FeatureFunction(size_t numScoreComponents, const std::string &line);
virtual bool IsStateless() const = 0;
virtual ~FeatureFunction();
@@ -114,30 +117,44 @@ public:
virtual std::vector<float> DefaultWeights() const;
+ size_t GetIndex() const;
+ size_t SetIndex(size_t const idx);
+
+protected:
+ virtual void
+ InitializeForInput(InputType const& source) { }
+ virtual void
+ CleanUpAfterSentenceProcessing(InputType const& source) { }
+
+public:
//! Called before search and collecting of translation options
- virtual void InitializeForInput(InputType const& source) {
- }
+ virtual void
+ InitializeForInput(ttasksptr const& ttask);
// clean up temporary memory, called after processing each sentence
- virtual void CleanUpAfterSentenceProcessing(const InputType& source) {
- }
+ virtual void
+ CleanUpAfterSentenceProcessing(ttasksptr const& ttask);
- const std::string &GetArgLine() const {
- return m_argLine;
- }
+ const std::string &
+ GetArgLine() const { return m_argLine; }
// given a target phrase containing only factors specified in mask
// return true if the feature function can be evaluated
virtual bool IsUseable(const FactorMask &mask) const = 0;
- // used by stateless ff and stateful ff. Calculate initial score estimate during loading of phrase table
- // source phrase is the substring that the phrase table uses to look up the target phrase,
+ // used by stateless ff and stateful ff. Calculate initial score
+ // estimate during loading of phrase table
+ //
+ // source phrase is the substring that the phrase table uses to look
+ // up the target phrase,
+ //
// may have more factors than actually need, but not guaranteed.
- // For SCFG decoding, the source contains non-terminals, NOT the raw source from the input sentence
- virtual void EvaluateInIsolation(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const = 0;
+ // For SCFG decoding, the source contains non-terminals, NOT the raw
+ // source from the input sentence
+ virtual void
+ EvaluateInIsolation(const Phrase &source, const TargetPhrase &targetPhrase,
+ ScoreComponentCollection& scoreBreakdown,
+ ScoreComponentCollection& estimatedFutureScore) const = 0;
// override this method if you want to change the input before decoding
virtual void ChangeSource(InputType * const&input) const { }
diff --git a/moses/FF/InputFeature.cpp b/moses/FF/InputFeature.cpp
index 7cbb428a6..6acb432ff 100644
--- a/moses/FF/InputFeature.cpp
+++ b/moses/FF/InputFeature.cpp
@@ -13,7 +13,7 @@ namespace Moses
InputFeature *InputFeature::s_instance = NULL;
InputFeature::InputFeature(const std::string &line)
- : StatelessFeatureFunction(line)
+ : StatelessFeatureFunction(line,true)
, m_numRealWordCount(0)
{
m_numInputScores = this->m_numScoreComponents;
diff --git a/moses/FF/LexicalReordering/LexicalReordering.cpp b/moses/FF/LexicalReordering/LexicalReordering.cpp
index 894080029..c67a16076 100644
--- a/moses/FF/LexicalReordering/LexicalReordering.cpp
+++ b/moses/FF/LexicalReordering/LexicalReordering.cpp
@@ -17,7 +17,7 @@ namespace Moses
{
LexicalReordering::
LexicalReordering(const std::string &line)
- : StatefulFeatureFunction(line)
+ : StatefulFeatureFunction(line,false)
{
VERBOSE(1, "Initializing Lexical Reordering Feature.." << std::endl);
@@ -65,13 +65,17 @@ LexicalReordering(const std::string &line)
}
// sanity check: number of default scores
- size_t numScores = m_configuration->GetNumScoreComponents();
+ size_t numScores
+ = m_numScoreComponents
+ = m_numTuneableComponents
+ = m_configuration->GetNumScoreComponents();
UTIL_THROW_IF2(m_haveDefaultScores && m_defaultScores.size() != numScores,
"wrong number of default scores (" << m_defaultScores.size()
<< ") for lexicalized reordering model (expected "
<< m_configuration->GetNumScoreComponents() << ")");
m_configuration->ConfigureSparse(sparseArgs, this);
+ this->Register();
}
LexicalReordering::
@@ -83,8 +87,9 @@ LexicalReordering::
Load()
{
typedef LexicalReorderingTable LRTable;
- m_table.reset(LRTable::LoadAvailable(m_filePath, m_factorsF,
- m_factorsE, std::vector<FactorType>()));
+ if (m_filePath.size())
+ m_table.reset(LRTable::LoadAvailable(m_filePath, m_factorsF,
+ m_factorsE, std::vector<FactorType>()));
}
Scores
@@ -132,16 +137,27 @@ void
LexicalReordering::
SetCache(TranslationOption& to) const
{
+ if (to.GetLexReorderingScores(this)) return;
+ // Scores were were set already (e.g., by sampling phrase table)
+
Phrase const& sphrase = to.GetInputPath().GetPhrase();
Phrase const& tphrase = to.GetTargetPhrase();
to.CacheLexReorderingScores(*this, this->GetProb(sphrase,tphrase));
}
+LRModel const&
+LexicalReordering
+::GetModel() const
+{
+ return *m_configuration;
+}
+
+
void
LexicalReordering::
SetCache(TranslationOptionList& tol) const
{
- BOOST_FOREACH(TranslationOption* to, tol)
+ BOOST_FOREACH(TranslationOption* to, tol)
this->SetCache(*to);
}
diff --git a/moses/FF/LexicalReordering/LexicalReordering.h b/moses/FF/LexicalReordering/LexicalReordering.h
index d1555a9c5..f39374622 100644
--- a/moses/FF/LexicalReordering/LexicalReordering.h
+++ b/moses/FF/LexicalReordering/LexicalReordering.h
@@ -45,7 +45,7 @@ public:
void
InitializeForInput(const InputType& i) {
- m_table->InitializeForInput(i);
+ if (m_table) m_table->InitializeForInput(i);
}
Scores
@@ -118,6 +118,8 @@ private:
std::string m_filePath;
bool m_haveDefaultScores;
Scores m_defaultScores;
+public:
+ LRModel const& GetModel() const;
};
}
diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.cpp b/moses/FF/LexicalReordering/LexicalReorderingState.cpp
index 92b2f61ba..48fd577f1 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingState.cpp
+++ b/moses/FF/LexicalReordering/LexicalReorderingState.cpp
@@ -101,7 +101,7 @@ GetOrientation(int const reoDistance) const
// this one is for HierarchicalReorderingBackwardState
return ((m_modelType == LeftRight)
? (reoDistance >= 1) ? R : L
- : (reoDistance == 1) ? M
+ : (reoDistance == 1) ? M
: (m_modelType == Monotonic) ? NM
: (reoDistance == -1) ? S
: (m_modelType == MSD) ? D
@@ -115,7 +115,7 @@ GetOrientation(WordsRange const& prev, WordsRange const& cur,
{
return ((m_modelType == LeftRight)
? cur.GetStartPos() > prev.GetEndPos() ? R : L
- : IsMonotonicStep(prev,cur,cov) ? M
+ : IsMonotonicStep(prev,cur,cov) ? M
: (m_modelType == Monotonic) ? NM
: IsSwap(prev,cur,cov) ? S
: (m_modelType == MSD) ? D
@@ -263,7 +263,7 @@ CopyScores(ScoreComponentCollection* accum,
const SparseReordering* sparse = m_configuration.GetSparseReordering();
if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
- m_direction, accum);
+ m_direction, accum);
}
diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.h b/moses/FF/LexicalReordering/LexicalReorderingState.h
index 48bf4698a..1e488fc41 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingState.h
+++ b/moses/FF/LexicalReordering/LexicalReorderingState.h
@@ -44,7 +44,7 @@ public:
static const ReorderingType L = 1; // left
static const ReorderingType MAX = 3; // largest possible
#else
- enum ReorderingType
+ enum ReorderingType
{
M = 0, // monotonic
NM = 1, // non-monotonic
diff --git a/moses/FF/LexicalReordering/SparseReordering.cpp b/moses/FF/LexicalReordering/SparseReordering.cpp
index 040b94988..5397dcb10 100644
--- a/moses/FF/LexicalReordering/SparseReordering.cpp
+++ b/moses/FF/LexicalReordering/SparseReordering.cpp
@@ -13,8 +13,11 @@
#include "LexicalReordering.h"
#include "SparseReordering.h"
+#include <boost/algorithm/string/predicate.hpp>
+
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -57,6 +60,7 @@ const std::string& SparseReorderingFeatureKey::Name (const string& wordListId)
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
: m_producer(producer)
+ , m_useWeightMap(false)
{
static const string kSource= "source";
static const string kTarget = "target";
@@ -80,6 +84,14 @@ SparseReordering::SparseReordering(const map<string,string>& config, const Lexic
} else {
UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
}
+ } else if (fields[0] == "weights") {
+ ReadWeightMap(i->second);
+ m_useWeightMap = true;
+ for (int reoType=0; reoType<=LRModel::MAX; ++reoType) {
+ ostringstream buf;
+ buf << reoType;
+ m_featureMap2.push_back(m_producer->GetFeatureName(buf.str()));
+ }
} else if (fields[0] == "phrase") {
m_usePhrase = true;
@@ -101,10 +113,10 @@ void SparseReordering::PreCalculateFeatureNames(size_t index, const string& id,
for (size_t position = SparseReorderingFeatureKey::First;
position <= SparseReorderingFeatureKey::Last; ++position) {
for (int reoType = 0; reoType <= LRModel::MAX; ++reoType) {
- SparseReorderingFeatureKey
- key(index, static_cast<SparseReorderingFeatureKey::Type>(type),
- factor, isCluster,
- static_cast<SparseReorderingFeatureKey::Position>(position),
+ SparseReorderingFeatureKey
+ key(index, static_cast<SparseReorderingFeatureKey::Type>(type),
+ factor, isCluster,
+ static_cast<SparseReorderingFeatureKey::Position>(position),
side, static_cast<LRModel::ReorderingType>(reoType));
m_featureMap.insert(pair<SparseReorderingFeatureKey, FName>(key,m_producer->GetFeatureName(key.Name(id))));
}
@@ -175,7 +187,16 @@ void SparseReordering::AddFeatures(
SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
FeatureMap::const_iterator fmi = m_featureMap.find(key);
assert(fmi != m_featureMap.end());
- scores->SparsePlusEquals(fmi->second, 1.0);
+ if (m_useWeightMap) {
+ WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
+ if (wmi != m_weightMap.end()) {
+ if (wmi->second != 0) {
+ scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
+ }
+ }
+ } else {
+ scores->SparsePlusEquals(fmi->second, 1.0);
+ }
}
for (size_t id = 0; id < clusterMaps->size(); ++id) {
@@ -186,7 +207,16 @@ void SparseReordering::AddFeatures(
SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
FeatureMap::const_iterator fmi = m_featureMap.find(key);
assert(fmi != m_featureMap.end());
- scores->SparsePlusEquals(fmi->second, 1.0);
+ if (m_useWeightMap) {
+ WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
+ if (wmi != m_weightMap.end()) {
+ if (wmi->second != 0) {
+ scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
+ }
+ }
+ } else {
+ scores->SparsePlusEquals(fmi->second, 1.0);
+ }
}
}
@@ -256,5 +286,29 @@ void SparseReordering::CopyScores(
}
+
+void SparseReordering::ReadWeightMap(const string& filename)
+{
+ util::FilePiece file(filename.c_str());
+ StringPiece line;
+ while (true) {
+ try {
+ line = file.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+ util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter(' '));
+ UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
+ const std::string& name = lineIter->as_string();
+ ++lineIter;
+ UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
+ float weight = Moses::Scan<float>(lineIter->as_string());
+
+ std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) );
+ UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'");
+ }
+}
+
+
} //namespace
diff --git a/moses/FF/LexicalReordering/SparseReordering.h b/moses/FF/LexicalReordering/SparseReordering.h
index 8a2495ce8..ada17d1b2 100644
--- a/moses/FF/LexicalReordering/SparseReordering.h
+++ b/moses/FF/LexicalReordering/SparseReordering.h
@@ -112,10 +112,16 @@ private:
typedef boost::unordered_map<SparseReorderingFeatureKey, FName, HashSparseReorderingFeatureKey, EqualsSparseReorderingFeatureKey> FeatureMap;
FeatureMap m_featureMap;
+ typedef boost::unordered_map<std::string, float> WeightMap;
+ WeightMap m_weightMap;
+ bool m_useWeightMap;
+ std::vector<FName> m_featureMap2;
+
void ReadWordList(const std::string& filename, const std::string& id,
SparseReorderingFeatureKey::Side side, std::vector<WordList>* pWordLists);
void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector<ClusterMap>* pClusterMaps);
void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster);
+ void ReadWeightMap(const std::string& filename);
void AddFeatures(
SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp
index 38883c12e..6f6552461 100644
--- a/moses/FF/Model1Feature.cpp
+++ b/moses/FF/Model1Feature.cpp
@@ -19,7 +19,7 @@ Model1Vocabulary::Model1Vocabulary()
Store(m_NULL,0);
}
-bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
+bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
{
boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
if ( iter != m_lookup.end() ) {
@@ -33,7 +33,7 @@ bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
return true;
}
-unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
+unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
{
boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
@@ -47,7 +47,7 @@ unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
return id;
}
-unsigned Model1Vocabulary::GetWordID(const Factor* word) const
+unsigned Model1Vocabulary::GetWordID(const Factor* word) const
{
boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
if ( iter == m_lookup.end() ) {
@@ -56,7 +56,7 @@ unsigned Model1Vocabulary::GetWordID(const Factor* word) const
return iter->second;
}
-const Factor* Model1Vocabulary::GetWord(unsigned id) const
+const Factor* Model1Vocabulary::GetWord(unsigned id) const
{
if (id >= m_vocab.size()) {
return NULL;
@@ -64,7 +64,7 @@ const Factor* Model1Vocabulary::GetWord(unsigned id) const
return m_vocab[ id ];
}
-void Model1Vocabulary::Load(const std::string& fileName)
+void Model1Vocabulary::Load(const std::string& fileName)
{
InputFileStream inFile(fileName);
FactorCollection &factorCollection = FactorCollection::Instance();
@@ -84,7 +84,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
}
}
- while ( getline(inFile, line) )
+ while ( getline(inFile, line) )
{
++i;
std::vector<std::string> tokens = Tokenize(line);
@@ -104,7 +104,7 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular
std::string line;
unsigned i = 0;
- while ( getline(inFile, line) )
+ while ( getline(inFile, line) )
{
++i;
std::vector<std::string> tokens = Tokenize(line);
@@ -126,8 +126,8 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular
float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
{
float prob = m_floor;
-
- boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
+
+ boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
if ( iter1 != m_ltable.end() ) {
boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
@@ -193,10 +193,10 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
float score = 0.0;
float norm = TransformScore(1+sentence.GetSize());
- for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
+ for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
{
const Word &wordT = targetPhrase.GetWord(posT);
- if ( !wordT.IsNonTerminal() )
+ if ( !wordT.IsNonTerminal() )
{
float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word
@@ -231,7 +231,7 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
float thisWordScore = TransformScore(thisWordProb) - norm;
FEATUREVERBOSE(3, "score( " << wordT << " ) = " << thisWordScore << std::endl);
{
- #ifdef WITH_THREADS
+ #ifdef WITH_THREADS
// need to update cache; write lock
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
@@ -240,14 +240,14 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
score += thisWordScore;
}
}
- }
+ }
scoreBreakdown.PlusEquals(this, score);
}
-
-void Model1Feature::CleanUpAfterSentenceProcessing(const InputType& source)
+
+void Model1Feature::CleanUpAfterSentenceProcessing(const InputType& source)
{
- #ifdef WITH_THREADS
+ #ifdef WITH_THREADS
// need to update cache; write lock
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
diff --git a/moses/FF/Model1Feature.h b/moses/FF/Model1Feature.h
index d526d165a..9c380e3ae 100644
--- a/moses/FF/Model1Feature.h
+++ b/moses/FF/Model1Feature.h
@@ -37,7 +37,7 @@ protected:
class Model1LexicalTable
{
public:
- Model1LexicalTable(float floor=1e-7) : m_floor(floor)
+ Model1LexicalTable(float floor=1e-7) : m_floor(floor)
{}
void Load(const std::string& fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT);
@@ -100,7 +100,7 @@ private:
const Factor* m_emptyWord;
void Load();
-
+
// cache
mutable boost::unordered_map<const InputType*, boost::unordered_map<const Factor*, float> > m_cache;
#ifdef WITH_THREADS
diff --git a/moses/FF/NieceTerminal.cpp b/moses/FF/NieceTerminal.cpp
index 3473790c1..6bd65f37c 100644
--- a/moses/FF/NieceTerminal.cpp
+++ b/moses/FF/NieceTerminal.cpp
@@ -11,7 +11,7 @@ using namespace std;
namespace Moses
{
NieceTerminal::NieceTerminal(const std::string &line)
- :StatelessFeatureFunction(line)
+ :StatelessFeatureFunction(line,true)
,m_hardConstraint(false)
{
ReadParameters();
diff --git a/moses/FF/OSM-Feature/osmHyp.cpp b/moses/FF/OSM-Feature/osmHyp.cpp
index 422b7c933..f971bbe8c 100644
--- a/moses/FF/OSM-Feature/osmHyp.cpp
+++ b/moses/FF/OSM-Feature/osmHyp.cpp
@@ -128,7 +128,7 @@ void osmHypothesis :: calculateOSMProb(OSMLM& ptrOp)
State currState = lmState;
State temp;
- for (int i = 0; i<operations.size(); i++) {
+ for (size_t i = 0; i<operations.size(); i++) {
temp = currState;
opProb += ptrOp.Score(temp,operations[i],currState);
}
@@ -368,7 +368,6 @@ void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageV
string english;
string source;
int j1;
- int start = 0;
int targetIndex = 0;
doneTargetIndexes.clear();
@@ -391,7 +390,7 @@ void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageV
}
- for (int i = 0; i < ceptsInPhrase.size(); i++) {
+ for (size_t i = 0; i < ceptsInPhrase.size(); i++) {
source = "";
english = "";
@@ -462,7 +461,7 @@ void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <
for (iter = eSide.begin(); iter != eSide.end(); iter++) {
t = tS[*iter];
- for (int i = 0; i < t.size(); i++) {
+ for (size_t i = 0; i < t.size(); i++) {
fSide.insert(t[i]);
}
@@ -472,7 +471,7 @@ void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <
t = sT[*iter];
- for (int i = 0 ; i<t.size(); i++) {
+ for (size_t i = 0 ; i<t.size(); i++) {
eSide.insert(t[i]);
}
@@ -498,7 +497,7 @@ void osmHypothesis :: constructCepts(vector <int> & align , int startIndex , int
int tgt;
- for (int i = 0; i < align.size(); i+=2) {
+ for (size_t i = 0; i < align.size(); i+=2) {
src = align[i];
tgt = align[i+1];
tS[tgt].push_back(src);
diff --git a/moses/FF/PhraseOrientationFeature.cpp b/moses/FF/PhraseOrientationFeature.cpp
index 5cb252270..2a59340ea 100644
--- a/moses/FF/PhraseOrientationFeature.cpp
+++ b/moses/FF/PhraseOrientationFeature.cpp
@@ -197,7 +197,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
FEATUREVERBOSE(4, "lastNonTerminalPreviousSourceSpanIsAligned== " << reoClassData->lastNonTerminalPreviousSourceSpanIsAligned << std::endl);
FEATUREVERBOSE(4, "lastNonTerminalFollowingSourceSpanIsAligned== " << reoClassData->lastNonTerminalFollowingSourceSpanIsAligned << std::endl;);
- if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned &&
+ if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
r2lOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
@@ -221,7 +221,7 @@ void PhraseOrientationFeature::LookaheadScore(const OrientationPhraseProperty *o
ScoreComponentCollection &scoreBreakdown,
bool subtract) const
{
- size_t ffScoreIndex = scoreBreakdown.GetIndexes(this).first;
+ size_t ffScoreIndex = m_index;
std::vector<float> scoresL2R;
scoresL2R.push_back( TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono()) );
@@ -467,7 +467,7 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
if ( (nNT == currTarPhr.GetAlignNonTerm().GetSize()-1) && reoClassData->lastNonTerminalIsBoundary ) {
// delay right-to-left scoring
-
+
FEATUREVERBOSE(3, "Delaying right-to-left scoring" << std::endl);
std::bitset<3> possibleFutureOrientationsR2L(0x7);
diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp
index c58c79c36..bb806f8e7 100644
--- a/moses/FF/PhrasePairFeature.cpp
+++ b/moses/FF/PhrasePairFeature.cpp
@@ -134,9 +134,9 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
- const Sentence& input = static_cast<const Sentence&>(input);
- const bool use_topicid = input.GetUseTopicId();
- const bool use_topicid_prob = input.GetUseTopicIdAndProb();
+ const Sentence& isnt = static_cast<const Sentence&>(input);
+ const bool use_topicid = isnt.GetUseTopicId();
+ const bool use_topicid_prob = isnt.GetUseTopicIdAndProb();
// compute pair
ostringstream pair;
@@ -157,7 +157,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
// use topicid as trigger
- const long topicid = input.GetTopicId();
+ const long topicid = isnt.GetTopicId();
stringstream feature;
feature << "pp_";
if (topicid == -1)
@@ -170,7 +170,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
- const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
+ const vector<string> &topicid_prob = *(isnt.GetTopicIdAndProb());
if (atol(topicid_prob[0].c_str()) == -1) {
stringstream feature;
feature << "pp_unk_";
@@ -189,7 +189,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
}
} else {
// range over domain trigger words
- const long docid = input.GetDocumentId();
+ const long docid = isnt.GetDocumentId();
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
string sourceTrigger = *p;
ostringstream namestr;
@@ -202,11 +202,11 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
}
}
if (m_sourceContext) {
- const Sentence& input = static_cast<const Sentence&>(input);
+ const Sentence& isnt = static_cast<const Sentence&>(input);
// range over source words to get context
- for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
+ for(size_t contextIndex = 0; contextIndex < isnt.GetSize(); contextIndex++ ) {
+ StringPiece sourceTrigger = isnt.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
char firstChar = sourceTrigger[0];
diff --git a/moses/FF/RulePairUnlexicalizedSource.cpp b/moses/FF/RulePairUnlexicalizedSource.cpp
index c31978423..148d54052 100644
--- a/moses/FF/RulePairUnlexicalizedSource.cpp
+++ b/moses/FF/RulePairUnlexicalizedSource.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
-
+
RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line)
: StatelessFeatureFunction(0, line)
, m_glueRules(false)
@@ -51,7 +51,7 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
return;
}
- for (size_t posS=0; posS<source.GetSize(); ++posS)
+ for (size_t posS=0; posS<source.GetSize(); ++posS)
{
const Word &wordS = source.GetWord(posS);
if ( !wordS.IsNonTerminal() ) {
@@ -61,7 +61,7 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
ostringstream namestr;
- for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
+ for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
{
const Word &wordT = targetPhrase.GetWord(posT);
const Factor* factorT = wordT[0];
@@ -78,7 +78,7 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
namestr << targetPhraseLHS->GetString() << "|";
for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
- it!=targetPhrase.GetAlignNonTerm().end(); ++it)
+ it!=targetPhrase.GetAlignNonTerm().end(); ++it)
{
namestr << "|" << it->first << "-" << it->second;
}
diff --git a/moses/FF/RuleScope.cpp b/moses/FF/RuleScope.cpp
index bc1cb3ebd..08987537d 100644
--- a/moses/FF/RuleScope.cpp
+++ b/moses/FF/RuleScope.cpp
@@ -70,11 +70,11 @@ void RuleScope::EvaluateInIsolation(const Phrase &source
estimatedFutureScore.PlusEquals(this, scores);
}
else {
- scoreBreakdown.PlusEquals(this, scores);
+ scoreBreakdown.PlusEquals(this, scores);
}
}
else if (m_futureCostOnly) {
- estimatedFutureScore.PlusEquals(this, score);
+ estimatedFutureScore.PlusEquals(this, score);
}
else {
scoreBreakdown.PlusEquals(this, score);
diff --git a/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp b/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
index d57c42d99..f788f8e53 100644
--- a/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
+++ b/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
@@ -556,8 +556,8 @@ void SoftSourceSyntacticConstraintsFeature::EvaluateWithSourceContext(const Inpu
for (boost::unordered_set<size_t>::const_iterator treeInputLabelsLHSIt = treeInputLabelsLHS.begin();
treeInputLabelsLHSIt != treeInputLabelsLHS.end(); ++treeInputLabelsLHSIt) {
- scoreBreakdown.PlusEquals(this,
- "LHSPAIR_" + targetLHS->GetString().as_string() + "_" + m_sourceLabelsByIndex[*treeInputLabelsLHSIt],
+ scoreBreakdown.PlusEquals(this,
+ "LHSPAIR_" + targetLHS->GetString().as_string() + "_" + m_sourceLabelsByIndex[*treeInputLabelsLHSIt],
(float)1/treeInputLabelsLHS.size());
if (!m_targetSourceLHSJointCountFile.empty()) {
@@ -567,8 +567,8 @@ void SoftSourceSyntacticConstraintsFeature::EvaluateWithSourceContext(const Inpu
}
}
if ( treeInputLabelsLHS.size() == 0 ) {
- scoreBreakdown.PlusEquals(this,
- "LHSPAIR_" + targetLHS->GetString().as_string() + "_" + outputDefaultNonTerminal[0]->GetString().as_string(),
+ scoreBreakdown.PlusEquals(this,
+ "LHSPAIR_" + targetLHS->GetString().as_string() + "_" + outputDefaultNonTerminal[0]->GetString().as_string(),
1);
if (!m_targetSourceLHSJointCountFile.empty()) {
t2sLabelsScore = TransformScore(m_floor);
diff --git a/moses/FF/SpanLength.cpp b/moses/FF/SpanLength.cpp
index 0e14069ee..91ac3ff89 100644
--- a/moses/FF/SpanLength.cpp
+++ b/moses/FF/SpanLength.cpp
@@ -45,8 +45,7 @@ void SpanLength::EvaluateWithSourceContext(const InputType &input
const SpanLengthPhraseProperty *slProp = static_cast<const SpanLengthPhraseProperty*>(property);
- const Phrase *ruleSource = targetPhrase.GetRuleSource();
- assert(ruleSource);
+ assert(targetPhrase.GetRuleSource());
float score = 0;
for (size_t i = 0; i < stackVec->size(); ++i) {
diff --git a/moses/FF/StatefulFeatureFunction.cpp b/moses/FF/StatefulFeatureFunction.cpp
index 9e61ed05f..bfb56f88c 100644
--- a/moses/FF/StatefulFeatureFunction.cpp
+++ b/moses/FF/StatefulFeatureFunction.cpp
@@ -5,13 +5,15 @@ namespace Moses
std::vector<const StatefulFeatureFunction*> StatefulFeatureFunction::m_statefulFFs;
-StatefulFeatureFunction::StatefulFeatureFunction(const std::string &line)
- : FeatureFunction(line)
+StatefulFeatureFunction
+::StatefulFeatureFunction(const std::string &line, bool registerNow)
+ : FeatureFunction(line, registerNow)
{
m_statefulFFs.push_back(this);
}
-StatefulFeatureFunction::StatefulFeatureFunction(size_t numScoreComponents, const std::string &line)
+StatefulFeatureFunction
+::StatefulFeatureFunction(size_t numScoreComponents, const std::string &line)
: FeatureFunction(numScoreComponents, line)
{
m_statefulFFs.push_back(this);
diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h
index c5364fd0d..c12f9516f 100644
--- a/moses/FF/StatefulFeatureFunction.h
+++ b/moses/FF/StatefulFeatureFunction.h
@@ -22,7 +22,7 @@ public:
return m_statefulFFs;
}
- StatefulFeatureFunction(const std::string &line);
+ StatefulFeatureFunction(const std::string &line, bool registerNow);
StatefulFeatureFunction(size_t numScoreComponents, const std::string &line);
/**
diff --git a/moses/FF/StatelessFeatureFunction.cpp b/moses/FF/StatelessFeatureFunction.cpp
index ecad23e6f..15d97e4bc 100644
--- a/moses/FF/StatelessFeatureFunction.cpp
+++ b/moses/FF/StatelessFeatureFunction.cpp
@@ -5,14 +5,16 @@ namespace Moses
std::vector<const StatelessFeatureFunction*> StatelessFeatureFunction::m_statelessFFs;
-StatelessFeatureFunction::StatelessFeatureFunction(const std::string &line)
- :FeatureFunction(line)
+StatelessFeatureFunction
+::StatelessFeatureFunction(const std::string &line, bool registerNow)
+ : FeatureFunction(line, registerNow)
{
m_statelessFFs.push_back(this);
}
-StatelessFeatureFunction::StatelessFeatureFunction(size_t numScoreComponents, const std::string &line)
- :FeatureFunction(numScoreComponents, line)
+StatelessFeatureFunction
+::StatelessFeatureFunction(size_t numScoreComponents, const std::string &line)
+ : FeatureFunction(numScoreComponents, line)
{
m_statelessFFs.push_back(this);
}
diff --git a/moses/FF/StatelessFeatureFunction.h b/moses/FF/StatelessFeatureFunction.h
index e5d3f3812..e1a22f5f4 100644
--- a/moses/FF/StatelessFeatureFunction.h
+++ b/moses/FF/StatelessFeatureFunction.h
@@ -20,7 +20,7 @@ public:
return m_statelessFFs;
}
- StatelessFeatureFunction(const std::string &line);
+ StatelessFeatureFunction(const std::string &line, bool registerNow);
StatelessFeatureFunction(size_t numScoreComponents, const std::string &line);
/**
diff --git a/moses/FF/SyntaxRHS.cpp b/moses/FF/SyntaxRHS.cpp
index 80f9b21bc..a06477855 100644
--- a/moses/FF/SyntaxRHS.cpp
+++ b/moses/FF/SyntaxRHS.cpp
@@ -29,10 +29,6 @@ void SyntaxRHS::EvaluateWithSourceContext(const InputType &input
, ScoreComponentCollection *estimatedFutureScore) const
{
assert(stackVec);
- for (size_t i = 0; i < stackVec->size(); ++i) {
- const ChartCellLabel &cell = *stackVec->at(i);
-
- }
if (targetPhrase.GetNumNonTerminals()) {
vector<float> newScores(m_numScoreComponents);
diff --git a/moses/FF/UnalignedWordCountFeature.cpp b/moses/FF/UnalignedWordCountFeature.cpp
index 9f0fe10db..64ea36a72 100644
--- a/moses/FF/UnalignedWordCountFeature.cpp
+++ b/moses/FF/UnalignedWordCountFeature.cpp
@@ -23,8 +23,6 @@ void UnalignedWordCountFeature::EvaluateInIsolation(const Phrase &source
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
- const size_t ffScoreIndex(scoreBreakdown.GetIndexes(this).first);
-
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
const size_t sourceLength = source.GetSize();
const size_t targetLength = targetPhrase.GetSize();
@@ -57,8 +55,8 @@ void UnalignedWordCountFeature::EvaluateInIsolation(const Phrase &source
}
}
- scoreBreakdown.PlusEquals(ffScoreIndex, sourceUnalignedCount);
- scoreBreakdown.PlusEquals(ffScoreIndex+1, targetUnalignedCount);
+ scoreBreakdown.PlusEquals(m_index, sourceUnalignedCount);
+ scoreBreakdown.PlusEquals(m_index+1, targetUnalignedCount);
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, source << std::endl);
diff --git a/moses/FF/VW/VW.h b/moses/FF/VW/VW.h
index dd9d0b858..c94791c32 100644
--- a/moses/FF/VW/VW.h
+++ b/moses/FF/VW/VW.h
@@ -165,7 +165,7 @@ public:
const std::vector<VWFeatureBase*>& sourceFeatures =
VWFeatureBase::GetSourceFeatures(GetScoreProducerDescription());
- const std::vector<VWFeatureBase*>& targetFeatures =
+ const std::vector<VWFeatureBase*>& targetFeatures =
VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription());
const WordsRange &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange();
@@ -229,7 +229,7 @@ public:
//
// predict using a trained classifier, use this in decoding (=at test time)
//
-
+
std::vector<float> losses(translationOptionList.size());
// extract source side features
@@ -296,7 +296,7 @@ public:
// classifier (squared/logistic/hinge/...), hence the name "loss"
if (value == "logistic") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
- } else if (value == "squared") {
+ } else if (value == "squared") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::SquaredLossNormalizer();
} else {
UTIL_THROW2("Unknown loss type:" << value);
@@ -317,7 +317,7 @@ public:
const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source);
UTIL_THROW_IF2(tabbedSentence.GetColumns().size() < 2,
"TabbedSentence must contain target<tab>alignment");
-
+
// target sentence represented as a phrase
Phrase *target = new Phrase();
target->CreateFromString(
@@ -431,7 +431,7 @@ private:
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
// extract raw counts from phrase-table property
- const CountsPhraseProperty *property =
+ const CountsPhraseProperty *property =
static_cast<const CountsPhraseProperty *>(targetPhrase.GetProperty("Counts"));
if (! property) {
diff --git a/moses/GenerationDictionary.cpp b/moses/GenerationDictionary.cpp
index 67e3b9108..ddb1428d9 100644
--- a/moses/GenerationDictionary.cpp
+++ b/moses/GenerationDictionary.cpp
@@ -36,7 +36,7 @@ namespace Moses
std::vector<GenerationDictionary*> GenerationDictionary::s_staticColl;
GenerationDictionary::GenerationDictionary(const std::string &line)
- : DecodeFeature(line)
+ : DecodeFeature(line, true)
{
s_staticColl.push_back(this);
diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp
index 79a469523..b792d11f8 100644
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -86,7 +86,7 @@ namespace Moses
, m_sourceInput(prevHypo.m_sourceInput)
, m_currSourceWordsRange(transOpt.GetSourceWordsRange())
, m_currTargetWordsRange(prevHypo.m_currTargetWordsRange.GetEndPos() + 1,
- prevHypo.m_currTargetWordsRange.GetEndPos()
+ prevHypo.m_currTargetWordsRange.GetEndPos()
+ transOpt.GetTargetPhrase().GetSize())
, m_wordDeleted(false)
, m_totalScore(0.0f)
@@ -127,7 +127,7 @@ namespace Moses
}
}
- void
+ void
Hypothesis::
AddArc(Hypothesis *loserHypo)
{
@@ -156,7 +156,7 @@ namespace Moses
/***
* return the subclass of Hypothesis most appropriate to the given translation option
*/
- Hypothesis*
+ Hypothesis*
Hypothesis::
CreateNext(const TranslationOption &transOpt) const
{
@@ -166,7 +166,7 @@ namespace Moses
/***
* return the subclass of Hypothesis most appropriate to the given translation option
*/
- Hypothesis*
+ Hypothesis*
Hypothesis::
Create(const Hypothesis &prevHypo, const TranslationOption &transOpt)
{
@@ -182,9 +182,9 @@ namespace Moses
* return the subclass of Hypothesis most appropriate to the given target phrase
*/
- Hypothesis*
+ Hypothesis*
Hypothesis::
- Create(Manager& manager, InputType const& m_source,
+ Create(Manager& manager, InputType const& m_source,
const TranslationOption &initialTransOpt)
{
#ifdef USE_HYPO_POOL
@@ -200,7 +200,7 @@ namespace Moses
keep an ordered list of hypotheses. This makes recombination
much quicker.
*/
- int
+ int
Hypothesis::
RecombineCompare(const Hypothesis &compare) const
{
@@ -223,22 +223,22 @@ namespace Moses
return 0;
}
- void
+ void
Hypothesis::
EvaluateWhenApplied(StatefulFeatureFunction const& sfff,
int state_idx)
{
const StaticData &staticData = StaticData::Instance();
- if (! staticData.IsFeatureFunctionIgnored( sfff ))
+ if (! staticData.IsFeatureFunctionIgnored( sfff ))
{
- m_ffStates[state_idx]
+ m_ffStates[state_idx]
= sfff.EvaluateWhenApplied
(*this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
&m_currScoreBreakdown);
}
}
- void
+ void
Hypothesis::
EvaluateWhenApplied(const StatelessFeatureFunction& slff)
{
@@ -251,7 +251,7 @@ namespace Moses
/***
* calculate the logarithm of our total translation score (sum up components)
*/
- void
+ void
Hypothesis::
EvaluateWhenApplied(const SquareMatrix &futureScore)
{
@@ -309,7 +309,7 @@ namespace Moses
/**
* print hypothesis information for pharaoh-style logging
*/
- void
+ void
Hypothesis::
PrintHypothesis() const
{
@@ -346,7 +346,7 @@ namespace Moses
//PrintLMScores();
}
- void
+ void
Hypothesis::
CleanupArcList()
{
@@ -361,27 +361,27 @@ namespace Moses
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
- bool distinctNBest = (staticData.GetDistinctNBest() ||
- staticData.GetLatticeSamplesSize() ||
- staticData.UseMBR() ||
- staticData.GetOutputSearchGraph() ||
- staticData.GetOutputSearchGraphSLF() ||
- staticData.GetOutputSearchGraphHypergraph() ||
+ bool distinctNBest = (staticData.GetDistinctNBest() ||
+ staticData.GetLatticeSamplesSize() ||
+ staticData.UseMBR() ||
+ staticData.GetOutputSearchGraph() ||
+ staticData.GetOutputSearchGraphSLF() ||
+ staticData.GetOutputSearchGraphHypergraph() ||
staticData.UseLatticeMBR());
- if (!distinctNBest && m_arcList->size() > nBestSize * 5)
+ if (!distinctNBest && m_arcList->size() > nBestSize * 5)
{
// prune arc list only if there too many arcs
NTH_ELEMENT4(m_arcList->begin(), m_arcList->begin() + nBestSize - 1,
m_arcList->end(), CompareHypothesisTotalScore());
-
+
// delete bad ones
ArcList::iterator iter;
- for (iter = m_arcList->begin() + nBestSize; iter != m_arcList->end() ; ++iter)
+ for (iter = m_arcList->begin() + nBestSize; iter != m_arcList->end() ; ++iter)
FREEHYPO(*iter);
m_arcList->erase(m_arcList->begin() + nBestSize, m_arcList->end());
}
-
+
// set all arc's main hypo variable to this hypo
ArcList::iterator iter = m_arcList->begin();
for (; iter != m_arcList->end() ; ++iter) {
@@ -395,15 +395,15 @@ namespace Moses
GetCurrTargetPhrase() const
{ return m_transOpt.GetTargetPhrase(); }
- void
+ void
Hypothesis::
GetOutputPhrase(Phrase &out) const
{
- if (m_prevHypo != NULL)
+ if (m_prevHypo != NULL)
m_prevHypo->GetOutputPhrase(out);
out.Append(GetCurrTargetPhrase());
}
-
+
TO_STRING_BODY(Hypothesis)
// friend
@@ -424,37 +424,37 @@ namespace Moses
}
- std::string
+ std::string
Hypothesis::
GetSourcePhraseStringRep(const vector<FactorType> factorsToPrint) const
{ return m_transOpt.GetInputPath().GetPhrase().GetStringRep(factorsToPrint); }
- std::string
+ std::string
Hypothesis::
GetTargetPhraseStringRep(const vector<FactorType> factorsToPrint) const
{ return (m_prevHypo ? GetCurrTargetPhrase().GetStringRep(factorsToPrint) : ""); }
- std::string
+ std::string
Hypothesis::
GetSourcePhraseStringRep() const
{
vector<FactorType> allFactors(MAX_NUM_FACTORS);
- for(size_t i=0; i < MAX_NUM_FACTORS; i++)
+ for(size_t i=0; i < MAX_NUM_FACTORS; i++)
allFactors[i] = i;
return GetSourcePhraseStringRep(allFactors);
}
- std::string
+ std::string
Hypothesis::
GetTargetPhraseStringRep() const
{
vector<FactorType> allFactors(MAX_NUM_FACTORS);
- for(size_t i=0; i < MAX_NUM_FACTORS; i++)
+ for(size_t i=0; i < MAX_NUM_FACTORS; i++)
allFactors[i] = i;
return GetTargetPhraseStringRep(allFactors);
}
- void
+ void
Hypothesis::
OutputAlignment(std::ostream &out) const
{
@@ -464,32 +464,32 @@ namespace Moses
edges.push_back(currentHypo);
currentHypo = currentHypo->GetPrevHypo();
}
-
+
OutputAlignment(out, edges);
-
+
}
-
- void
+
+ void
Hypothesis::
OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
{
size_t targetOffset = 0;
-
+
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
const Hypothesis &edge = *edges[currEdge];
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
-
+
OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
-
+
targetOffset += tp.GetSize();
}
// Used by --print-alignment-info, so no endl
}
- void
+ void
Hypothesis::
- OutputAlignment(ostream &out, const AlignmentInfo &ai,
+ OutputAlignment(ostream &out, const AlignmentInfo &ai,
size_t sourceOffset, size_t targetOffset)
{
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
@@ -500,20 +500,20 @@ namespace Moses
const std::pair<size_t,size_t> &alignment = **it;
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
}
-
+
}
- void
+ void
Hypothesis::
OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
{
if (!hypo->GetPrevHypo()) return;
OutputInput(map, hypo->GetPrevHypo());
- map[hypo->GetCurrSourceWordsRange().GetStartPos()]
+ map[hypo->GetCurrSourceWordsRange().GetStartPos()]
= &hypo->GetTranslationOption().GetInputPath().GetPhrase();
}
- void
+ void
Hypothesis::
OutputInput(std::ostream& os) const
{
@@ -523,13 +523,13 @@ namespace Moses
for (size_t i=0; i<len; ++i)
if (inp_phrases[i]) os << *inp_phrases[i];
}
-
- void
+
+ void
Hypothesis::
OutputBestSurface(std::ostream &out, const std::vector<FactorType> &outputFactorOrder,
char reportSegmentation, bool reportAllFactors) const
{
- if (m_prevHypo)
+ if (m_prevHypo)
{ // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
m_prevHypo->OutputBestSurface(out, outputFactorOrder, reportSegmentation, reportAllFactors);
}
@@ -540,9 +540,9 @@ namespace Moses
/***
* print surface factor only for the given phrase
*/
- void
+ void
Hypothesis::
- OutputSurface(std::ostream &out, const Hypothesis &edge,
+ OutputSurface(std::ostream &out, const Hypothesis &edge,
const std::vector<FactorType> &outputFactorOrder,
char reportSegmentation, bool reportAllFactors) const
{
@@ -616,15 +616,15 @@ namespace Moses
}
}
- std::map<size_t, const Factor*>
+ std::map<size_t, const Factor*>
Hypothesis::
GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor) const
{
const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
const Phrase &inputPhrase = inputPath.GetPhrase();
-
+
std::map<size_t, const Factor*> ret;
-
+
for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
if (factor) {
@@ -634,7 +634,7 @@ namespace Moses
ret[*targetPos.begin()] = factor;
}
}
-
+
return ret;
}
@@ -646,8 +646,8 @@ namespace Moses
using namespace std;
WordsRange const& src = this->GetCurrSourceWordsRange();
WordsRange const& trg = this->GetCurrTargetWordsRange();
-
- vector<pair<size_t,size_t> const* > a
+
+ vector<pair<size_t,size_t> const* > a
= this->GetCurrTargetPhrase().GetAlignTerm().GetSortedAlignments();
typedef pair<size_t,size_t> item;
map<string, xmlrpc_c::value> M;
@@ -659,7 +659,7 @@ namespace Moses
}
}
- void
+ void
Hypothesis::
OutputWordAlignment(vector<xmlrpc_c::value>& out) const
{
@@ -671,7 +671,7 @@ namespace Moses
}
#endif
-
-
+
+
}
diff --git a/moses/Hypothesis.h b/moses/Hypothesis.h
index ddd0d9af3..0ce75b83c 100644
--- a/moses/Hypothesis.h
+++ b/moses/Hypothesis.h
@@ -41,7 +41,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifdef HAVE_XMLRPC_C
#include <xmlrpc-c/base.hpp>
-#endif
+#endif
namespace Moses
{
@@ -291,10 +291,10 @@ public:
#ifdef HAVE_XMLRPC_C
void OutputWordAlignment(std::vector<xmlrpc_c::value>& out) const;
void OutputLocalWordAlignment(std::vector<xmlrpc_c::value>& dest) const;
-#endif
+#endif
+
-
};
std::ostream& operator<<(std::ostream& out, const Hypothesis& hypothesis);
@@ -313,7 +313,7 @@ struct CompareHypothesisTotalScore {
ObjectPool<Hypothesis> &pool = Hypothesis::GetObjectPool(); \
pool.freeObject(hypo); \
} \
-
+
#else
#define FREEHYPO(hypo) delete hypo
#endif
diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp
index 92994e234..57717e880 100644
--- a/moses/IOWrapper.cpp
+++ b/moses/IOWrapper.cpp
@@ -87,7 +87,7 @@ IOWrapper::IOWrapper()
{
const StaticData &staticData = StaticData::Instance();
- m_inputType = staticData.GetInputType();
+ m_inputType = staticData.GetInputType();
m_currentLine = staticData.GetStartTranslationId();
m_inputFactorOrder = &staticData.GetInputFactorOrder();
@@ -269,7 +269,7 @@ IOWrapper::ReadInput()
#endif
if (source->Read(*m_inputStream, *m_inputFactorOrder))
source->SetTranslationId(m_currentLine++);
- else
+ else
source.reset();
return source;
}
diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp
index 49573664e..5e6139d3b 100644
--- a/moses/Incremental.cpp
+++ b/moses/Incremental.cpp
@@ -203,15 +203,15 @@ struct ChartCellBaseFactory {
} // namespace
-Manager::Manager(const InputType &source) :
- BaseManager(source),
- cells_(source, ChartCellBaseFactory()),
- parser_(source, cells_),
- n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize())) {}
+Manager::Manager(ttasksptr const& ttask)
+ : BaseManager(ttask)
+ , cells_(m_source, ChartCellBaseFactory(), parser_)
+ , parser_(ttask, cells_)
+ , n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize()))
+{ }
Manager::~Manager()
-{
-}
+{ }
template <class Model, class Best> search::History Manager::PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out)
{
diff --git a/moses/Incremental.h b/moses/Incremental.h
index c1f5e40b3..91b5dc5a0 100644
--- a/moses/Incremental.h
+++ b/moses/Incremental.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
#pragma once
#include "lm/word_index.hh"
@@ -24,7 +25,7 @@ namespace Incremental
class Manager : public BaseManager
{
public:
- Manager(const InputType &source);
+ Manager(ttasksptr const& ttask);
~Manager();
diff --git a/moses/InputType.cpp b/moses/InputType.cpp
index a06c106bd..d01fdd46e 100644
--- a/moses/InputType.cpp
+++ b/moses/InputType.cpp
@@ -29,7 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-InputType::InputType(long translationId)
+InputType::InputType(long translationId)
: m_translationId(translationId)
{
m_frontSpanCoveredLength = 0;
diff --git a/moses/InputType.h b/moses/InputType.h
index ea03ec23e..24c7ef4fb 100644
--- a/moses/InputType.h
+++ b/moses/InputType.h
@@ -180,8 +180,8 @@ public:
virtual void Print(std::ostream&) const =0;
//! create trans options specific to this InputType
- virtual TranslationOptionCollection*
- CreateTranslationOptionCollection() const=0;
+ virtual TranslationOptionCollection*
+ CreateTranslationOptionCollection(ttasksptr const& ttask) const=0;
//! return substring. Only valid for Sentence class. TODO - get rid of this fn
virtual Phrase GetSubString(const WordsRange&) const =0;
diff --git a/moses/Jamfile b/moses/Jamfile
index 822645505..c94750ec6 100644
--- a/moses/Jamfile
+++ b/moses/Jamfile
@@ -1,3 +1,4 @@
+# -*- jam -*-
max-factors = [ option.get "max-factors" : 4 : 4 ] ;
path-constant FACTOR-LOG : bin/factor.log ;
update-if-changed $(FACTOR-LOG) $(max-factors) ;
@@ -58,23 +59,16 @@ obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm mmlib : <depen
# if yes, include server capabilities in the moses executable
# include $(TOP)/jam-files/server.jam ;
-xmlrpc-c-config = [ build_server ] ;
-if $(xmlrpc-c-config) = no
+if [ xmlrpc ]
{
- echo "NOT BUILDING MOSES SERVER!" ;
- alias mserver ;
- alias xmlrpc-linkflags ;
+ echo "BUILDING MOSES SERVER!" ;
+ alias mserver : [ glob server/*.cpp ] ;
}
else
{
- xmlprc-version = [ shell_or_die "$(xmlrpc-c-config) --version" ] ;
- # xmlprc-linkflags = [ shell_or_die "$(xmlrpc-c-config) --version" ] ;
- echo "building moses server with xmlrpc-c version $(xmlrpc-c-version)" ;
- alias mserver : [ glob server/*.cpp ] ;
-}
-
-
-
+ echo "NOT BUILDING MOSES SERVER!" ;
+ alias mserver ;
+}
if [ option.get "with-mm" : no : yes ] = yes
{
diff --git a/moses/LM/BilingualLM.cpp b/moses/LM/BilingualLM.cpp
index fb59696d4..d881c4616 100644
--- a/moses/LM/BilingualLM.cpp
+++ b/moses/LM/BilingualLM.cpp
@@ -106,7 +106,7 @@ size_t BilingualLM::selectMiddleAlignment(
{
set<size_t>::iterator it = alignment_links.begin();
- for (int i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
+ for (size_t i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
++it;
}
diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp
index f531ade28..179b67095 100644
--- a/moses/LM/RDLM.cpp
+++ b/moses/LM/RDLM.cpp
@@ -50,7 +50,7 @@ void RDLM::Load() {
UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(),
"Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_head);
- UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
+ UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
"Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_label);
//get int value of commonly used tokens
@@ -96,10 +96,10 @@ void RDLM::Load() {
// TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
// TreePointer mytree4 (new InternalTree("[pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA]] [NN Zeit]]]"));
// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred]]"));
-//
+//
// std::vector<int> ancestor_heads;
// std::vector<int> ancestor_labels;
-//
+//
// size_t boundary_hash(0);
// boost::array<float, 4> score;
// score.fill(0);
@@ -108,48 +108,48 @@ void RDLM::Load() {
// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// previous_trees.push_back(mytree3);
// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
// std::cerr << "scoring: " << mytree4->GetString() << std::endl;
// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// mytree4->Combine(previous_trees);
// previous_trees.clear();
// previous_trees.push_back(mytree4);
// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
// std::cerr << "scoring: " << mytree2->GetString() << std::endl;
-//
+//
// score[1] = 0;
// score[3] = 0;
// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// score[0] = 0;
// score[1] = 0;
// score[2] = 0;
// score[3] = 0;
// std::cerr << "scoring: " << mytree->GetString() << std::endl;
-//
+//
// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// }
// UTIL_THROW2("Finished");
-//
+//
// }
-//
+//
// {
// std::cerr << "BINARIZED\n\n";
// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA europäische]] [NN Zeit]]]]]]"));
// TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
// TreePointer mytree4 (new InternalTree("[^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA]] [NN Zeit]]]"));
// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred]]]]"));
-//
+//
// std::vector<int> ancestor_heads;
// std::vector<int> ancestor_labels;
-//
+//
// size_t boundary_hash(0);
// boost::array<float, 4> score;
// score.fill(0);
@@ -158,33 +158,33 @@ void RDLM::Load() {
// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// previous_trees.push_back(mytree3);
// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
// std::cerr << "scoring: " << mytree4->GetString() << std::endl;
// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// mytree4->Combine(previous_trees);
// previous_trees.clear();
// previous_trees.push_back(mytree4);
// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
// std::cerr << "scoring: " << mytree2->GetString() << std::endl;
-//
+//
// score[1] = 0;
// score[3] = 0;
// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// score[0] = 0;
// score[1] = 0;
// score[2] = 0;
// score[3] = 0;
// std::cerr << "scoring: " << mytree->GetString() << std::endl;
-//
+//
// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
-//
+//
// }
// UTIL_THROW2("Finished");
@@ -790,7 +790,7 @@ FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
prev_approx_label -= prev->GetApproximateScoreLabel();
}
}
- size_t ff_idx = accumulator->GetIndexes(this).first;
+ size_t ff_idx = m_index; // accumulator->GetIndexes(this).first;
accumulator->PlusEquals(ff_idx, prev_approx_head);
accumulator->PlusEquals(ff_idx+1, prev_approx_label);
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index da0661aee..8daaa6c8e 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -44,6 +44,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/LM/Base.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/TranslationAnalysis.h"
+#include "moses/TranslationTask.h"
#include "moses/HypergraphOutput.h"
#include "moses/mbr.h"
#include "moses/LatticeMBR.h"
@@ -54,33 +55,40 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
#include "util/exception.hh"
+#include "util/random.hh"
using namespace std;
namespace Moses
{
-Manager::Manager(InputType const& source)
- :BaseManager(source)
- ,m_transOptColl(source.CreateTranslationOptionCollection())
- ,interrupted_flag(0)
- ,m_hypoId(0)
+
+Manager::Manager(ttasksptr const& ttask)
+ : BaseManager(ttask)
+ , interrupted_flag(0)
+ , m_hypoId(0)
{
+ boost::shared_ptr<InputType> source = ttask->GetSource();
+ m_transOptColl = source->CreateTranslationOptionCollection(ttask);
+
const StaticData &staticData = StaticData::Instance();
SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
- m_search = Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl);
+ m_search = Search::CreateSearch(*this, *source, searchAlgorithm,
+ *m_transOptColl);
- StaticData::Instance().InitializeForInput(m_source);
+ StaticData::Instance().InitializeForInput(ttask);
}
Manager::~Manager()
{
delete m_transOptColl;
delete m_search;
- // this is a comment ...
-
- StaticData::Instance().CleanUpAfterSentenceProcessing(m_source);
+ StaticData::Instance().CleanUpAfterSentenceProcessing(m_ttask.lock());
}
+const InputType&
+Manager::GetSource() const
+{ return m_source ; }
+
/**
* Main decoder loop that translates a sentence by expanding
* hypotheses stack by stack, until the end of the sentence.
@@ -121,7 +129,8 @@ void Manager::Decode()
Timer searchTime;
searchTime.start();
m_search->Decode();
- VERBOSE(1, "Line " << m_source.GetTranslationId() << ": Search took " << searchTime << " seconds" << endl);
+ VERBOSE(1, "Line " << m_source.GetTranslationId()
+ << ": Search took " << searchTime << " seconds" << endl);
IFVERBOSE(2) {
GetSentenceStats().StopTimeTotal();
TRACE_ERR(GetSentenceStats());
@@ -418,7 +427,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
//cerr << endl;
//draw the sample
- float frandom = log((float)rand()/RAND_MAX);
+ const float frandom = log(util::rand_incl(0.0f, 1.0f));
size_t position = 1;
float sum = candidateScores[0];
for (; position < candidateScores.size() && sum < frandom; ++position) {
@@ -1637,7 +1646,7 @@ void Manager::OutputNBest(std::ostream& out
out << " |||";
// print scores with feature names
- path.GetScoreBreakdown().OutputAllFeatureScores(out );
+ path.GetScoreBreakdown()->OutputAllFeatureScores(out);
// total
out << " ||| " << path.GetTotalScore();
diff --git a/moses/Manager.h b/moses/Manager.h
index 4de0f5f95..398d456c6 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -151,7 +151,8 @@ protected:
void OutputAlignment(std::ostringstream &out, const TrellisPath &path) const;
public:
- Manager(InputType const& source);
+ // Manager(InputType const& source);
+ Manager(ttasksptr const& ttask);
~Manager();
const TranslationOptionCollection* getSntTranslationOptions();
@@ -180,9 +181,8 @@ public:
void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
- const InputType& GetSource() const {
- return m_source;
- }
+
+ const InputType& GetSource() const;
/***
* to be called after processing a sentence (which may consist of more than just calling ProcessSentence() )
diff --git a/moses/MockHypothesis.cpp b/moses/MockHypothesis.cpp
index 999124a90..a087400be 100644
--- a/moses/MockHypothesis.cpp
+++ b/moses/MockHypothesis.cpp
@@ -17,13 +17,12 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-
-
#include "MockHypothesis.h"
+#include "TranslationOption.h"
+#include "TranslationTask.h"
#include <boost/test/unit_test.hpp>
-#include "TranslationOption.h"
using namespace Moses;
using namespace std;
@@ -31,29 +30,23 @@ using namespace std;
namespace MosesTest
{
-
-MockHypothesisGuard::MockHypothesisGuard(
- const string& sourceSentence,
+MockHypothesisGuard
+::MockHypothesisGuard
+( const string& sourceSentence,
const vector<Alignment>& alignments,
const vector<string>& targetSegments)
- : m_initialTransOpt(),
- m_wp("WordPenalty"),
- m_uwp("UnknownWordPenalty"),
- m_dist("Distortion"),
- m_manager(m_sentence)
+ : m_initialTransOpt(), m_wp("WordPenalty"),
+ m_uwp("UnknownWordPenalty"), m_dist("Distortion")
{
BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size());
-
- std::vector<Moses::FactorType> factors;
- factors.push_back(0);
-
- stringstream in(sourceSentence + "\n");
- m_sentence.Read(in,factors);
-
+ std::vector<Moses::FactorType> factors(1,0);
+ m_sentence.reset(new Sentence(0, sourceSentence, &factors));
+ m_ttask = TranslationTask::create(m_sentence);
+ m_manager.reset(new Manager(m_ttask));
//Initial empty hypothesis
- m_manager.ResetSentenceStats(m_sentence);
- m_hypothesis = Hypothesis::Create(m_manager, m_sentence, m_initialTransOpt);
+ m_manager->ResetSentenceStats(*m_sentence);
+ m_hypothesis = Hypothesis::Create(*m_manager, *m_sentence, m_initialTransOpt);
//create the chain
vector<Alignment>::const_iterator ai = alignments.begin();
diff --git a/moses/MockHypothesis.h b/moses/MockHypothesis.h
index 78cfc104e..34a313553 100644
--- a/moses/MockHypothesis.h
+++ b/moses/MockHypothesis.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
@@ -45,10 +46,11 @@ class MockHypothesisGuard
public:
/** Creates a phrase-based hypothesis.
*/
- MockHypothesisGuard(
- const std::string& sourceSentence,
+ MockHypothesisGuard
+ ( const std::string& sourceSentence,
const std::vector<Alignment>& alignments,
const std::vector<std::string>& targetSegments);
+
Moses::Hypothesis* operator*() const {
return m_hypothesis;
}
@@ -58,11 +60,12 @@ public:
private:
Moses::TranslationOption m_initialTransOpt;
- Moses::Sentence m_sentence;
+ boost::shared_ptr<Moses::Sentence> m_sentence;
Moses::WordPenaltyProducer m_wp;
Moses::UnknownWordPenaltyProducer m_uwp;
Moses::DistortionScoreProducer m_dist;
- Moses::Manager m_manager;
+ boost::shared_ptr<Moses::Manager> m_manager;
+ boost::shared_ptr<Moses::TranslationTask> m_ttask;
Moses::Hypothesis* m_hypothesis;
std::vector<Moses::TargetPhrase> m_targetPhrases;
std::vector<Moses::TranslationOption*> m_toptions;
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index e43c69d22..5b5d76828 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -31,6 +31,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "InputFileStream.h"
#include "StaticData.h"
#include "util/exception.hh"
+#include "util/random.hh"
#include <boost/program_options.hpp>
@@ -46,7 +47,7 @@ Parameter::Parameter()
{
///////////////////////////////////////////////////////////////////////////////////////
// general options
- po::options_description main_opts("Main Options");
+ po::options_description main_opts("Main Options");
AddParam(main_opts,"config", "f", "location of the configuration file");
AddParam(main_opts,"input-file", "i", "location of the input file to be translated");
@@ -56,7 +57,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// factorization options
- po::options_description factor_opts("General Factorization Options");
+ po::options_description factor_opts("General Factorization Options");
AddParam(factor_opts,"factor-delimiter", "fd", "specify a different factor delimiter than the default");
// one should be able to specify different factor delimiters for intput and output
AddParam(factor_opts,"mapping", "description of decoding steps"); // whatever that means ...
@@ -64,7 +65,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// general search options
- po::options_description search_opts("Search Options");
+ po::options_description search_opts("Search Options");
string desc = "Which search algorithm to use.\n";
desc += "0=normal stack (default)\n";
desc += "1=cube pruning\n";
@@ -119,7 +120,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// minimum bayes risk decoding
po::options_description mbr_opts("Minimum Bayes Risk (MBR), Lattice MBR, and Consensus decoding");
-
+
AddParam(mbr_opts,"minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
AddParam(mbr_opts,"mbr-size", "number of translation candidates considered in MBR decoding (default 200)");
AddParam(mbr_opts,"mbr-scale", "scaling factor to convert log linear score probability in MBR decoding (default 1.0)");
@@ -137,7 +138,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// OOV handling options
- po::options_description oov_opts("OOV Handling Options");
+ po::options_description oov_opts("OOV Handling Options");
AddParam(oov_opts,"drop-unknown", "du", "drop unknown words instead of copying them");
AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output");
AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model");
@@ -145,7 +146,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// input options
- po::options_description input_opts("Input Format Options");
+ po::options_description input_opts("Input Format Options");
AddParam(input_opts,"input-factors", "list of factors in the input");
AddParam(input_opts,"inputtype", "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
AddParam(input_opts,"xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'constraint', 'ignore'");
@@ -155,7 +156,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// output options
- po::options_description output_opts("Output Options");
+ po::options_description output_opts("Output Options");
AddParam(output_opts,"report-all-factors", "report all factors in output, not just first");
AddParam(output_opts,"output-factors", "list if factors in the output");
AddParam(output_opts,"print-id", "prefix translations with id. Default if false");
@@ -191,7 +192,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// nbest-options
- po::options_description nbest_opts("N-best Options");
+ po::options_description nbest_opts("N-best Options");
AddParam(nbest_opts,"n-best-list", "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
// AddParam(nbest_opts,"n-best-list-file", "file of n-best-list to be generated; specify - as the file in order to write to STDOUT");
// AddParam(nbest_opts,"n-best-list-size", "size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
@@ -201,22 +202,22 @@ Parameter::Parameter()
AddParam(nbest_opts,"report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
AddParam(nbest_opts,"lattice-samples", "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list");
AddParam(nbest_opts,"include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
- AddParam(nbest_opts,"print-alignment-info-in-n-best",
+ AddParam(nbest_opts,"print-alignment-info-in-n-best",
"Include word-to-word alignment in the n-best list. Word-to-word alignments are taken from the phrase table if any. Default is false");
///////////////////////////////////////////////////////////////////////////////////////
// server options
- po::options_description server_opts("Moses Server Options");
+ po::options_description server_opts("Moses Server Options");
AddParam(server_opts,"server", "Run moses as a translation server.");
AddParam(server_opts,"server-port", "Port for moses server");
AddParam(server_opts,"server-log", "Log destination for moses server");
AddParam(server_opts,"serial", "Run server in serial mode, processing only one request at a time.");
- po::options_description irstlm_opts("IRSTLM Options");
- AddParam(irstlm_opts,"clean-lm-cache",
+ po::options_description irstlm_opts("IRSTLM Options");
+ AddParam(irstlm_opts,"clean-lm-cache",
"clean language model caches after N translations (default N=1)");
- po::options_description chart_opts("Chart Decoding Options");
+ po::options_description chart_opts("Chart Decoding Options");
AddParam(chart_opts,"max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
AddParam(chart_opts,"non-terminals", "list of non-term symbols, space separated");
AddParam(chart_opts,"rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
@@ -239,12 +240,13 @@ Parameter::Parameter()
AddParam(misc_opts,"feature-name-overwrite", "Override feature name (NOT arguments). Eg. SRILM-->KENLM, PhraseDictionaryMemory-->PhraseDictionaryScope3");
AddParam(misc_opts,"feature", "All the feature functions should be here");
+ AddParam(misc_opts,"context-string", "A (tokenized) string containing context words for context-sensitive translation.");
// Compact phrase table and reordering table.
po::options_description cpt_opts("Options when using compact phrase and reordering tables.");
AddParam(cpt_opts,"minphr-memory", "Load phrase table in minphr format into memory");
AddParam(cpt_opts,"minlexr-memory", "Load lexical reordering table in minlexr format into memory");
-
+
po::options_description spe_opts("Simulated Post-editing Options");
AddParam(spe_opts,"spe-src", "Simulated post-editing. Source filename");
AddParam(spe_opts,"spe-trg", "Simulated post-editing. Target filename");
@@ -252,7 +254,7 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// DEPRECATED options
- po::options_description deprec_opts("Deprecated Options");
+ po::options_description deprec_opts("Deprecated Options");
AddParam(deprec_opts,"link-param-count", "DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)");
AddParam(deprec_opts,"weight-slm", "slm", "DEPRECATED. DO NOT USE. weight(s) for syntactic language model");
AddParam(deprec_opts,"weight-bl", "bl", "DEPRECATED. DO NOT USE. weight for bleu score feature");
@@ -294,7 +296,7 @@ Parameter::Parameter()
AddParam(deprec_opts,"source-word-deletion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned source word");
AddParam(deprec_opts,"word-translation-feature", "DEPRECATED. DO NOT USE. Count feature for word translation according to word alignment");
- po::options_description zombie_opts("Zombie Options");
+ po::options_description zombie_opts("Zombie Options");
AddParam(zombie_opts,"distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
@@ -341,10 +343,10 @@ const PARAM_VEC *Parameter::GetParam(const std::string &paramName) const
}
/** initialize a parameter, sub of constructor */
-void
+void
Parameter::
AddParam(po::options_description& optgroup,
- string const& paramName,
+ string const& paramName,
string const& description)
{
m_valid[paramName] = true;
@@ -353,11 +355,11 @@ AddParam(po::options_description& optgroup,
}
/** initialize a parameter (including abbreviation), sub of constructor */
-void
+void
Parameter::
AddParam(po::options_description& optgroup,
- string const& paramName,
- string const& abbrevName,
+ string const& paramName,
+ string const& abbrevName,
string const& description)
{
m_valid[paramName] = true;
@@ -375,14 +377,14 @@ AddParam(po::options_description& optgroup,
}
/** print descriptions of all parameters */
-void
+void
Parameter::
Explain()
{
cerr << "Usage:" << endl;
cerr << m_options << endl;
- // for(PARAM_STRING::const_iterator iterParam = m_description.begin();
- // iterParam != m_description.end(); iterParam++)
+ // for(PARAM_STRING::const_iterator iterParam = m_description.begin();
+ // iterParam != m_description.end(); iterParam++)
// {
// const string paramName = iterParam->first;
// const string paramDescription = iterParam->second;
@@ -397,7 +399,7 @@ Explain()
/** check whether an item on the command line is a switch or a value
* \param token token on the command line to checked **/
-bool
+bool
Parameter::
isOption(const char* token)
{
@@ -411,7 +413,7 @@ isOption(const char* token)
}
/** load all parameters from the configuration file and the command line switches */
-bool
+bool
Parameter::
LoadParam(const string &filePath)
{
@@ -420,11 +422,11 @@ LoadParam(const string &filePath)
}
/** load all parameters from the configuration file and the command line switches */
-bool
+bool
Parameter::
LoadParam(int argc, char* xargv[])
{
- // legacy parameter handling: all parameters are expected
+ // legacy parameter handling: all parameters are expected
// to start with a single dash
char* argv[argc+1];
for (int i = 0; i < argc; ++i)
@@ -433,7 +435,7 @@ LoadParam(int argc, char* xargv[])
if (strlen(argv[i]) > 2 && argv[i][0] == '-' && argv[i][1] == '-')
++argv[i];
}
-
+
// config file (-f) arg mandatory
string configPath;
if ( (configPath = FindParam("-f", argc, argv)) == ""
@@ -532,7 +534,7 @@ LoadParam(int argc, char* xargv[])
return Validate() && noErrorFlag;
}
-void
+void
Parameter::
AddFeaturesCmd()
{
@@ -548,7 +550,7 @@ AddFeaturesCmd()
}
}
-std::vector<float>
+std::vector<float>
Parameter::
GetWeights(const std::string &name)
{
@@ -562,7 +564,7 @@ GetWeights(const std::string &name)
return ret;
}
-void
+void
Parameter::
SetWeight(const std::string &name, size_t ind, float weight)
{
@@ -605,7 +607,7 @@ AddWeight(const std::string &name, size_t ind,
SetWeight(name, ind, weights);
}
-void
+void
Parameter::
ConvertWeightArgsSingleWeight(const string &oldWeightName, const string &newWeightName)
{
@@ -623,7 +625,7 @@ ConvertWeightArgsSingleWeight(const string &oldWeightName, const string &newWeig
}
}
-void
+void
Parameter::
ConvertWeightArgsPhraseModel(const string &oldWeightName)
{
@@ -796,7 +798,7 @@ ConvertWeightArgsPhraseModel(const string &oldWeightName)
}
-void
+void
Parameter::
AddFeature(const std::string &line)
{
@@ -804,7 +806,7 @@ AddFeature(const std::string &line)
features.push_back(line);
}
-void
+void
Parameter::
ConvertWeightArgsDistortion()
{
@@ -871,7 +873,7 @@ ConvertWeightArgsDistortion()
}
-void
+void
Parameter::
ConvertWeightArgsLM()
{
@@ -965,7 +967,7 @@ ConvertWeightArgsLM()
m_setting.erase(oldFeatureName);
}
-void
+void
Parameter::
ConvertWeightArgsGeneration(const std::string &oldWeightName, const std::string &newWeightName)
{
@@ -1009,7 +1011,7 @@ ConvertWeightArgsGeneration(const std::string &oldWeightName, const std::string
m_setting.erase(oldFeatureName);
}
-void
+void
Parameter::
ConvertWeightArgsWordPenalty()
{
@@ -1044,7 +1046,7 @@ ConvertWeightArgsWordPenalty()
}
-void
+void
Parameter::
ConvertPhrasePenalty()
{
@@ -1061,7 +1063,7 @@ ConvertPhrasePenalty()
}
}
-void
+void
Parameter::
ConvertWeightArgs()
{
@@ -1105,7 +1107,7 @@ ConvertWeightArgs()
}
-void
+void
Parameter::
CreateWeightsMap()
{
@@ -1113,7 +1115,7 @@ CreateWeightsMap()
CreateWeightsMap(m_setting["weight"]);
}
-void
+void
Parameter::
CreateWeightsMap(const PARAM_VEC &vec)
{
@@ -1135,7 +1137,7 @@ CreateWeightsMap(const PARAM_VEC &vec)
}
}
-void
+void
Parameter::
WeightOverwrite()
{
@@ -1194,7 +1196,7 @@ WeightOverwrite()
}
/** check that parameter settings make sense */
-bool
+bool
Parameter::
Validate()
{
@@ -1255,9 +1257,9 @@ Validate()
}
/** check whether a file exists */
-bool
+bool
Parameter::
-FilesExist(const string &paramName, int fieldNo,
+FilesExist(const string &paramName, int fieldNo,
std::vector<std::string> const& extensions)
{
typedef std::vector<std::string> StringVec;
@@ -1301,7 +1303,7 @@ FilesExist(const string &paramName, int fieldNo,
/** look for a switch in arg, update parameter */
// TODO arg parsing like this does not belong in the library, it belongs
// in moses-cmd
-string
+string
Parameter::
FindParam(const string &paramSwitch, int argc, char* argv[])
{
@@ -1323,7 +1325,7 @@ FindParam(const string &paramSwitch, int argc, char* argv[])
* \param paramName full name of parameter
* \param argc number of arguments on command line
* \param argv values of paramters on command line */
-void
+void
Parameter::
OverwriteParam(const string &paramSwitch, const string &paramName, int argc, char* argv[])
{
@@ -1351,7 +1353,7 @@ OverwriteParam(const string &paramSwitch, const string &paramName, int argc, cha
/** read parameters from a configuration file */
-bool
+bool
Parameter::
ReadConfigFile(const string &filePath )
{
@@ -1392,7 +1394,7 @@ struct Credit {
this->contact = contact ;
this->currentPursuits = currentPursuits ;
this->areaResponsibility = areaResponsibility;
- this->sortId = rand() % 1000;
+ this->sortId = util::rand_excl(1000);
}
bool operator<(const Credit &other) const {
@@ -1421,7 +1423,7 @@ std::ostream& operator<<(std::ostream &os, const Credit &credit)
return os;
}
-void
+void
Parameter::
PrintCredit()
{
@@ -1511,7 +1513,7 @@ PrintCredit()
/** update parameter settings with command line switches
* \param paramName full name of parameter
* \param values inew values for paramName */
-void
+void
Parameter::
OverwriteParam(const string &paramName, PARAM_VEC values)
{
@@ -1536,14 +1538,14 @@ OverwriteParam(const string &paramName, PARAM_VEC values)
VERBOSE(2, std::endl);
}
-void
+void
Parameter::
PrintFF() const
{
StaticData::Instance().GetFeatureRegistry().PrintFF();
}
-std::set<std::string>
+std::set<std::string>
Parameter::
GetWeightNames() const
{
@@ -1556,7 +1558,7 @@ GetWeightNames() const
return ret;
}
-void
+void
Parameter::
Save(const std::string path)
{
@@ -1584,9 +1586,9 @@ Save(const std::string path)
}
template<>
-void
+void
Parameter::
-SetParameter<bool>(bool &parameter, std::string const& parameterName,
+SetParameter<bool>(bool &parameter, std::string const& parameterName,
bool const& defaultValue) const
{
const PARAM_VEC *params = GetParam(parameterName);
diff --git a/moses/Parameter.h b/moses/Parameter.h
index dd967e925..90b18c427 100644
--- a/moses/Parameter.h
+++ b/moses/Parameter.h
@@ -51,8 +51,8 @@ protected:
PARAM_STRING m_abbreviation;
PARAM_STRING m_description;
PARAM_STRING m_fullname;
- // std::map<char,std::set<std::string> > m_confusable;
- // stores long parameter names that start with a letter that is also a short option.
+ // std::map<char,std::set<std::string> > m_confusable;
+ // stores long parameter names that start with a letter that is also a short option.
options_description m_options;
std::map<std::string, std::vector<float> > m_weights;
@@ -64,30 +64,30 @@ protected:
bool isOption(const char* token);
bool Validate();
- void
+ void
AddParam(options_description& optgroup,
value_semantic const* optvalue,
- std::string const& paramName,
+ std::string const& paramName,
std::string const& description);
- void
+ void
AddParam(options_description& optgroup,
- std::string const &paramName,
+ std::string const &paramName,
std::string const &description);
- void
+ void
AddParam(options_description& optgroup,
value_semantic const* optvalue,
std::string const& paramName,
- std::string const& abbrevName,
+ std::string const& abbrevName,
std::string const& description);
- void
+ void
AddParam(options_description& optgroup,
std::string const& paramName,
- std::string const& abbrevName,
+ std::string const& abbrevName,
std::string const& description);
-
+
void PrintCredit();
void PrintFF() const;
diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp
index e656743ec..d07fb5f00 100644
--- a/moses/ScoreComponentCollection.cpp
+++ b/moses/ScoreComponentCollection.cpp
@@ -1,6 +1,8 @@
// $Id$
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
#include "util/exception.hh"
#include "ScoreComponentCollection.h"
#include "StaticData.h"
@@ -47,7 +49,7 @@ std::ostream& operator<<(std::ostream& os, const ScorePair& rhs)
return os;
}
-ScoreComponentCollection::ScoreIndexMap ScoreComponentCollection::s_scoreIndexes;
+//ScoreComponentCollection::ScoreIndexMap ScoreComponentCollection::s_scoreIndexes;
size_t ScoreComponentCollection::s_denseVectorSize = 0;
ScoreComponentCollection::
@@ -58,15 +60,14 @@ ScoreComponentCollection()
void
ScoreComponentCollection::
-RegisterScoreProducer(const FeatureFunction* scoreProducer)
+RegisterScoreProducer(FeatureFunction* scoreProducer)
{
size_t start = s_denseVectorSize;
- size_t end = start + scoreProducer->GetNumScoreComponents();
+ s_denseVectorSize = scoreProducer->SetIndex(s_denseVectorSize);
VERBOSE(1, "FeatureFunction: "
<< scoreProducer->GetScoreProducerDescription()
- << " start: " << start << " end: " << (end-1) << endl);
- s_scoreIndexes[scoreProducer] = pair<size_t,size_t>(start,end);
- s_denseVectorSize = end;
+ << " start: " << start
+ << " end: " << (s_denseVectorSize-1) << endl);
}
@@ -191,21 +192,23 @@ void ScoreComponentCollection::Save(ostream& out, bool multiline) const
sep = "=";
linesep = " ";
}
- ScoreIndexMap::const_iterator iter = s_scoreIndexes.begin();
- for (; iter != s_scoreIndexes.end(); ++iter ) {
- string name = iter->first->GetScoreProducerDescription();
- IndexPair ip = iter->second; // feature indices
- if (ip.second-ip.first == 1) {
- out << name << sep << m_scores[ip.first] << linesep;
- } else {
- for (size_t i=ip.first; i < ip.second; ++i) {
- ostringstream fullname;
- fullname << name << "_" << (i + 1 - ip.first);
- out << fullname.str() << sep << m_scores[i] << linesep;
- }
- }
- }
+ std::vector<FeatureFunction*> const& all_ff
+ = FeatureFunction::GetFeatureFunctions();
+ BOOST_FOREACH(FeatureFunction const* ff, all_ff)
+ {
+ string name = ff->GetScoreProducerDescription();
+ size_t i = ff->GetIndex();
+ if (ff->GetNumScoreComponents() == 1)
+ out << name << sep << m_scores[i] << linesep;
+ else
+ {
+ size_t stop = i + ff->GetNumScoreComponents();
+ boost::format fmt("%s_%d");
+ for (size_t k = 1; i < stop; ++i, ++k)
+ out << fmt % name % k << sep << m_scores[i] << linesep;
+ }
+ }
// write sparse features
m_scores.write(out,sep,linesep);
}
@@ -242,8 +245,8 @@ void
ScoreComponentCollection::
Assign(const FeatureFunction* sp, const std::vector<float>& scores)
{
- IndexPair indexes = GetIndexes(sp);
- size_t numScores = indexes.second - indexes.first;
+ size_t numScores = sp->GetNumScoreComponents();
+ size_t offset = sp->GetIndex();
if (scores.size() != numScores) {
UTIL_THROW(util::Exception, "Feature function "
@@ -253,7 +256,7 @@ Assign(const FeatureFunction* sp, const std::vector<float>& scores)
}
for (size_t i = 0; i < scores.size(); ++i) {
- m_scores[i + indexes.first] = scores[i];
+ m_scores[i + offset] = scores[i];
}
}
diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h
index ce285b59e..0dbdb366c 100644
--- a/moses/ScoreComponentCollection.h
+++ b/moses/ScoreComponentCollection.h
@@ -95,24 +95,24 @@ private:
FVector m_scores;
public:
- typedef std::pair<size_t,size_t> IndexPair;
+ // typedef std::pair<size_t,size_t> IndexPair;
private:
- typedef std::map<const FeatureFunction*,IndexPair> ScoreIndexMap;
- static ScoreIndexMap s_scoreIndexes;
+ // typedef std::map<const FeatureFunction*,IndexPair> ScoreIndexMap;
+ // static ScoreIndexMap s_scoreIndexes;
static size_t s_denseVectorSize;
public:
- static IndexPair GetIndexes(const FeatureFunction* sp) {
- ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp);
- if (indexIter == s_scoreIndexes.end()) {
- std::stringstream strme;
- strme << "ERROR: FeatureFunction: " << sp->GetScoreProducerDescription() <<
- " not registered with ScoreIndexMap" << std::endl;
- strme << "You must call ScoreComponentCollection.RegisterScoreProducer() " <<
- " for every FeatureFunction" << std::endl;
- UTIL_THROW2(strme.str());
- }
- return indexIter->second;
- }
+ // static IndexPair GetIndexes(const FeatureFunction* sp) {
+ // ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp);
+ // if (indexIter == s_scoreIndexes.end()) {
+ // std::stringstream strme;
+ // strme << "ERROR: FeatureFunction: " << sp->GetScoreProducerDescription() <<
+ // " not registered with ScoreIndexMap" << std::endl;
+ // strme << "You must call ScoreComponentCollection.RegisterScoreProducer() " <<
+ // " for every FeatureFunction" << std::endl;
+ // UTIL_THROW2(strme.str());
+ // }
+ // return indexIter->second;
+ // }
public:
static void ResetCounter() {
@@ -136,7 +136,7 @@ public:
* Register a ScoreProducer with a fixed number of scores, so that it can
* be allocated space in the dense part of the feature vector.
**/
- static void RegisterScoreProducer(const FeatureFunction* scoreProducer);
+ static void RegisterScoreProducer(FeatureFunction* scoreProducer);
/** Load from file */
bool Load(const std::string& filename) {
@@ -229,22 +229,23 @@ public:
//! Add scores from a single ScoreProducer only
//! The length of scores must be equal to the number of score components
//! produced by sp
- void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores) {
- IndexPair indexes = GetIndexes(sp);
- for (size_t i = indexes.first; i < indexes.second; ++i) {
- m_scores[i] += scores.m_scores[i];
- }
+ void
+ PlusEquals(const FeatureFunction* sp,
+ const ScoreComponentCollection& scores) {
+ size_t i = sp->GetIndex();
+ size_t stop = i + sp->GetNumScoreComponents();
+ for (;i < stop; ++i) m_scores[i] += scores.m_scores[i];
}
//! Add scores from a single FeatureFunction only
//! The length of scores must be equal to the number of score components
//! produced by sp
void PlusEquals(const FeatureFunction* sp, const std::vector<float>& scores) {
- IndexPair indexes = GetIndexes(sp);
- UTIL_THROW_IF2(scores.size() != indexes.second - indexes.first,
+ UTIL_THROW_IF2(scores.size() != sp->GetNumScoreComponents(),
"Number of scores is incorrect");
+ size_t offset = sp->GetIndex();
for (size_t i = 0; i < scores.size(); ++i) {
- m_scores[i + indexes.first] += scores[i];
+ m_scores[i + offset] += scores[i];
}
}
@@ -252,10 +253,9 @@ public:
//! to add the score from a single ScoreProducer that produces
//! a single value
void PlusEquals(const FeatureFunction* sp, float score) {
- IndexPair indexes = GetIndexes(sp);
- UTIL_THROW_IF2(1 != indexes.second - indexes.first,
+ UTIL_THROW_IF2(sp->GetNumScoreComponents() != 1,
"Number of scores is incorrect");
- m_scores[indexes.first] += score;
+ m_scores[sp->GetIndex()] += score;
}
//For features which have an unbounded number of components
@@ -287,10 +287,10 @@ public:
//! to add the score from a single ScoreProducer that produces
//! a single value
void Assign(const FeatureFunction* sp, float score) {
- IndexPair indexes = GetIndexes(sp);
- UTIL_THROW_IF2(1 != indexes.second - indexes.first,
+
+ UTIL_THROW_IF2(sp->GetNumScoreComponents() != 1,
"Feature function must must only contain 1 score");
- m_scores[indexes.first] = score;
+ m_scores[sp->GetIndex()] = score;
}
// Assign score by index
@@ -329,9 +329,9 @@ public:
size_t components = sp->GetNumScoreComponents();
std::vector<float> res(components);
- IndexPair indexes = GetIndexes(sp);
+ size_t offset = sp->GetIndex();
for (size_t i = 0; i < res.size(); ++i) {
- res[i] = m_scores[i + indexes.first];
+ res[i] = m_scores[i + offset];
}
return res;
}
@@ -364,18 +364,17 @@ public:
m_scores.capMin(minValue);
}
- std::pair<size_t,size_t> GetIndexesForProducer(const FeatureFunction* sp) const {
- IndexPair indexPair = GetIndexes(sp);
- return indexPair;
- }
+ // std::pair<size_t,size_t> GetIndexesForProducer(const FeatureFunction* sp) const {
+ // IndexPair indexPair = GetIndexes(sp);
+ // return indexPair;
+ // }
//! if a FeatureFunction produces a single score (for example, a language model score)
//! this will return it. If not, this method will throw
float GetScoreForProducer(const FeatureFunction* sp) const {
- IndexPair indexes = GetIndexes(sp);
- UTIL_THROW_IF2(indexes.second - indexes.first != 1,
+ UTIL_THROW_IF2(sp->GetNumScoreComponents() != 1,
"Feature function must must only contain 1 score");
- return m_scores[indexes.first];
+ return m_scores[sp->GetIndex()];
}
//For features which have an unbounded number of components
diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp
index a16eae3b2..cf866f933 100644
--- a/moses/Sentence.cpp
+++ b/moses/Sentence.cpp
@@ -43,7 +43,7 @@ Sentence::
Sentence() : Phrase(0) , InputType()
{
const StaticData& SD = StaticData::Instance();
- if (SD.IsSyntax())
+ if (SD.IsSyntax())
m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal());
}
@@ -60,7 +60,7 @@ aux_init_partial_translation(string& line)
string sourceCompletedStr;
int loc1 = line.find( "|||", 0 );
int loc2 = line.find( "|||", loc1 + 3 );
- if (loc1 > -1 && loc2 > -1)
+ if (loc1 > -1 && loc2 > -1)
{
m_initialTargetPhrase = Trim(line.substr(0, loc1));
string scov = Trim(line.substr(loc1 + 3, loc2 - loc1 - 3));
@@ -68,14 +68,14 @@ aux_init_partial_translation(string& line)
m_sourceCompleted.resize(scov.size());
int contiguous = 1;
- for (size_t i = 0; i < scov.size(); ++i)
+ for (size_t i = 0; i < scov.size(); ++i)
{
- if (sourceCompletedStr.at(i) == '1')
+ if (sourceCompletedStr.at(i) == '1')
{
m_sourceCompleted[i] = true;
if (contiguous) m_frontSpanCoveredLength++;
- }
- else
+ }
+ else
{
m_sourceCompleted[i] = false;
contiguous = 0;
@@ -92,48 +92,48 @@ aux_interpret_sgml_markup(string& line)
typedef std::map<std::string, std::string> metamap;
metamap meta = ProcessAndStripSGML(line);
metamap::const_iterator i;
- if ((i = meta.find("id")) != meta.end())
+ if ((i = meta.find("id")) != meta.end())
this->SetTranslationId(atol(i->second.c_str()));
- if ((i = meta.find("docid")) != meta.end())
+ if ((i = meta.find("docid")) != meta.end())
{
this->SetDocumentId(atol(i->second.c_str()));
this->SetUseTopicId(false);
this->SetUseTopicIdAndProb(false);
}
- if ((i = meta.find("topic")) != meta.end())
+ if ((i = meta.find("topic")) != meta.end())
{
vector<string> topic_params;
boost::split(topic_params, i->second, boost::is_any_of("\t "));
- if (topic_params.size() == 1)
+ if (topic_params.size() == 1)
{
this->SetTopicId(atol(topic_params[0].c_str()));
this->SetUseTopicId(true);
this->SetUseTopicIdAndProb(false);
- }
- else
+ }
+ else
{
this->SetTopicIdAndProb(topic_params);
this->SetUseTopicId(false);
this->SetUseTopicIdAndProb(true);
}
}
- if ((i = meta.find("weight-setting")) != meta.end())
+ if ((i = meta.find("weight-setting")) != meta.end())
{
this->SetWeightSetting(i->second);
this->SetSpecifiesWeightSetting(true);
- StaticData::Instance().SetWeightSetting(i->second);
+ StaticData::Instance().SetWeightSetting(i->second);
// oh this is so horrible! Why does this have to be propagated globally?
// --- UG
- }
+ }
else this->SetSpecifiesWeightSetting(false);
}
-void
+void
Sentence::
aux_interpret_dlt(string& line) // whatever DLT means ... --- UG
{
using namespace std;
- typedef map<string, string> str2str_map;
+ typedef map<string, string> str2str_map;
vector<str2str_map> meta = ProcessAndStripDLT(line);
BOOST_FOREACH(str2str_map const& M, meta)
{
@@ -148,7 +148,7 @@ aux_interpret_dlt(string& line) // whatever DLT means ... --- UG
cbtm = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
if (cbtm) cbtm->ExecuteDlt(M);
}
- if (i->second == "cblm")
+ if (i->second == "cblm")
{
DynamicCacheBasedLanguageModel* cblm;
cblm = DynamicCacheBasedLanguageModel::InstanceNonConst(id);
@@ -167,11 +167,11 @@ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
const StaticData &SD = StaticData::Instance();
using namespace std;
- if (SD.GetXmlInputType() != XmlPassThrough)
+ if (SD.GetXmlInputType() != XmlPassThrough)
{
int offset = SD.IsSyntax() ? 1 : 0;
- bool OK = ProcessAndStripXMLTags(line, m_xmlOptions,
- m_reorderingConstraint,
+ bool OK = ProcessAndStripXMLTags(line, m_xmlOptions,
+ m_reorderingConstraint,
xmlWalls, placeholders, offset,
SD.GetXmlBrackets().first,
SD.GetXmlBrackets().second);
@@ -179,7 +179,7 @@ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
}
}
-void
+void
Sentence::
init(string line, std::vector<FactorType> const& factorOrder)
{
@@ -189,15 +189,15 @@ init(string line, std::vector<FactorType> const& factorOrder)
m_frontSpanCoveredLength = 0;
m_sourceCompleted.resize(0);
- if (SD.ContinuePartialTranslation())
+ if (SD.ContinuePartialTranslation())
aux_init_partial_translation(line);
line = Trim(line);
aux_interpret_sgml_markup(line); // for "<seg id=..." markup
aux_interpret_dlt(line); // some poorly documented cache-based stuff
-
+
// if sentences is specified as "<passthrough tag1=""/>"
- if (SD.IsPassthroughEnabled() || SD.IsPassthroughInNBestEnabled())
+ if (SD.IsPassthroughEnabled() || SD.IsPassthroughInNBestEnabled())
{
string pthru = PassthroughSGML(line,"passthrough");
this->SetPassthroughInformation(pthru);
@@ -212,19 +212,19 @@ init(string line, std::vector<FactorType> const& factorOrder)
ProcessPlaceholders(placeholders);
if (SD.IsSyntax()) InitStartEndWord();
-
+
// now that we have final word positions in phrase (from
// CreateFromString), we can make input phrase objects to go with
// our XmlOptions and create TranslationOptions
// only fill the vector if we are parsing XML
- if (SD.GetXmlInputType() != XmlPassThrough)
+ if (SD.GetXmlInputType() != XmlPassThrough)
{
m_xmlCoverageMap.assign(GetSize(), false);
BOOST_FOREACH(XmlOption* o, m_xmlOptions)
{
WordsRange const& r = o->range;
- for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j)
+ for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j)
m_xmlCoverageMap[j]=true;
}
}
@@ -233,7 +233,7 @@ init(string line, std::vector<FactorType> const& factorOrder)
m_reorderingConstraint.InitializeWalls(GetSize());
// set reordering walls, if "-monotone-at-punction" is set
- if (SD.UseReorderingConstraint() && GetSize())
+ if (SD.UseReorderingConstraint() && GetSize())
{
WordsRange r(0, GetSize()-1);
m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(r));
@@ -244,10 +244,10 @@ init(string line, std::vector<FactorType> const& factorOrder)
if(xmlWalls[i] < GetSize()) // no buggy walls, please
m_reorderingConstraint.SetWall(xmlWalls[i], true);
m_reorderingConstraint.FinalizeWalls();
-
+
}
-int
+int
Sentence::
Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
@@ -258,7 +258,7 @@ Read(std::istream& in,const std::vector<FactorType>& factorOrder)
return 1;
}
-void
+void
Sentence::
ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
{
@@ -278,11 +278,13 @@ ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeho
TranslationOptionCollection*
Sentence::
-CreateTranslationOptionCollection() const
+CreateTranslationOptionCollection(ttasksptr const& ttask) const
{
size_t maxNoTransOptPerCoverage = StaticData::Instance().GetMaxNoTransOptPerCoverage();
float transOptThreshold = StaticData::Instance().GetTranslationOptionThreshold();
- TranslationOptionCollection *rv= new TranslationOptionCollectionText(*this, maxNoTransOptPerCoverage, transOptThreshold);
+ TranslationOptionCollection *rv
+ = new TranslationOptionCollectionText(ttask, *this, maxNoTransOptPerCoverage,
+ transOptThreshold);
assert(rv);
return rv;
}
@@ -383,10 +385,12 @@ CreateFromString(vector<FactorType> const& FOrder, string const& phraseString)
}
Sentence::
-Sentence(size_t const transId, string const& stext) : InputType(transId)
+Sentence(size_t const transId, string const& stext,
+ vector<FactorType> const* IFO)
+ : InputType(transId)
{
- vector<FactorType> const& IFO = StaticData::Instance().GetInputFactorOrder();
- init(stext, IFO);
+ if (IFO) init(stext, *IFO);
+ else init(stext, StaticData::Instance().GetInputFactorOrder());
}
}
diff --git a/moses/Sentence.h b/moses/Sentence.h
index 8e7e934a6..8a870f76b 100644
--- a/moses/Sentence.h
+++ b/moses/Sentence.h
@@ -63,7 +63,9 @@ namespace Moses
public:
Sentence();
- Sentence(size_t const transId, std::string const& stext);
+ Sentence(size_t const transId, std::string const& stext,
+ std::vector<FactorType> const* IFO = NULL);
+ // Sentence(size_t const transId, std::string const& stext);
~Sentence();
InputTypeEnum GetType() const {
@@ -96,18 +98,19 @@ namespace Moses
virtual int Read(std::istream& in,const std::vector<FactorType>& factorOrder);
void Print(std::ostream& out) const;
- TranslationOptionCollection* CreateTranslationOptionCollection() const;
+ TranslationOptionCollection*
+ CreateTranslationOptionCollection(ttasksptr const& ttask) const;
- virtual void
- CreateFromString(std::vector<FactorType> const &factorOrder,
- std::string const& phraseString);
+ virtual void
+ CreateFromString(std::vector<FactorType> const &factorOrder,
+ std::string const& phraseString);
const NonTerminalSet&
- GetLabelSet(size_t /*startPos*/, size_t /*endPos*/) const
+ GetLabelSet(size_t /*startPos*/, size_t /*endPos*/) const
{ return m_defaultLabelSet; }
- void
+ void
init(std::string line, std::vector<FactorType> const& factorOrder);
private:
@@ -117,13 +120,13 @@ namespace Moses
// void aux_interpret_xml (std::string& line, std::vector<size_t> & xmlWalls,
// std::vector<std::pair<size_t, std::string> >& placeholders);
- void
+ void
aux_interpret_sgml_markup(std::string& line);
- void
+ void
aux_interpret_dlt(std::string& line);
- void
+ void
aux_interpret_xml
(std::string& line, std::vector<size_t> & xmlWalls,
std::vector<std::pair<size_t, std::string> >& placeholders);
@@ -132,7 +135,7 @@ namespace Moses
aux_init_partial_translation(std::string& line);
};
-
+
}
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 8709d758f..420ad7a20 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -99,25 +99,55 @@ bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPat
return s_instance.LoadData(parameter);
}
-bool StaticData::LoadData(Parameter *parameter)
+void
+StaticData
+::initialize_features()
{
- ResetUserTime();
- m_parameter = parameter;
+ std::map<std::string, std::string> featureNameOverride = OverrideFeatureNames();
+ // all features
+ map<string, int> featureIndexMap;
- const PARAM_VEC *params;
+ const PARAM_VEC* params = m_parameter->GetParam("feature");
+ for (size_t i = 0; params && i < params->size(); ++i) {
+ const string &line = Trim(params->at(i));
+ VERBOSE(1,"line=" << line << endl);
+ if (line.empty())
+ continue;
- // verbose level
- m_parameter->SetParameter(m_verboseLevel, "verbose", (size_t) 1);
+ vector<string> toks = Tokenize(line);
- // to cube or not to cube
- m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal);
+ string &feature = toks[0];
+ std::map<std::string, std::string>::const_iterator iter
+ = featureNameOverride.find(feature);
+ if (iter == featureNameOverride.end()) {
+ // feature name not override
+ m_registry.Construct(feature, line);
+ } else {
+ // replace feature name with new name
+ string newName = iter->second;
+ feature = newName;
+ string newLine = Join(" ", toks);
+ m_registry.Construct(newName, newLine);
+ }
+ }
- if (IsSyntax())
- LoadChartDecodingParameters();
+ NoCache();
+ OverrideFeatures();
+
+}
+
+void
+StaticData
+::ini_input_options()
+{
+ const PARAM_VEC *params;
// input type has to be specified BEFORE loading the phrase tables!
m_parameter->SetParameter(m_inputType, "inputtype", SentenceInput);
+ m_parameter->SetParameter(m_continuePartialTranslation,
+ "continue-partial-translation", false );
+
std::string s_it = "text input";
if (m_inputType == 1) {
s_it = "confusion net";
@@ -130,31 +160,55 @@ bool StaticData::LoadData(Parameter *parameter)
}
VERBOSE(2,"input type is: "<<s_it<<"\n");
+ // use of xml in input
+ m_parameter->SetParameter<XmlInputType>(m_xmlInputType, "xml-input", XmlPassThrough);
+
+ // specify XML tags opening and closing brackets for XML option
+ params = m_parameter->GetParam("xml-brackets");
+ if (params && params->size()) {
+ std::vector<std::string> brackets = Tokenize(params->at(0));
+ if(brackets.size()!=2) {
+ cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
+ exit(1);
+ }
+ m_xmlBrackets.first= brackets[0];
+ m_xmlBrackets.second=brackets[1];
+ VERBOSE(1,"XML tags opening and closing brackets for XML input are: "
+ << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl);
+ }
+
+ m_parameter->SetParameter(m_defaultNonTermOnlyForEmptyRange,
+ "default-non-term-for-empty-range-only", false );
+
+}
+
+bool
+StaticData
+::ini_output_options()
+{
+ const PARAM_VEC *params;
+
+ // verbose level
+ m_parameter->SetParameter(m_verboseLevel, "verbose", (size_t) 1);
+
+
m_parameter->SetParameter(m_recoverPath, "recover-input-path", false);
if (m_recoverPath && m_inputType == SentenceInput) {
TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n");
m_recoverPath = false;
}
- // factor delimiter
- m_parameter->SetParameter<string>(m_factorDelimiter, "factor-delimiter", "|");
- if (m_factorDelimiter == "none") {
- m_factorDelimiter = "";
- }
-
- m_parameter->SetParameter( m_continuePartialTranslation, "continue-partial-translation", false );
- m_parameter->SetParameter( m_outputHypoScore, "output-hypo-score", false );
+ m_parameter->SetParameter(m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
// alignments
- m_parameter->SetParameter( m_PrintAlignmentInfo, "print-alignment-info", false );
+ m_parameter->SetParameter(m_PrintAlignmentInfo, "print-alignment-info", false );
if (m_PrintAlignmentInfo) {
m_needAlignmentInfo = true;
}
m_parameter->SetParameter(m_wordAlignmentSort, "sort-word-alignment", NoSort);
- m_parameter->SetParameter( m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;
}
@@ -169,37 +223,6 @@ bool StaticData::LoadData(Parameter *parameter)
m_parameter->SetParameter( m_PrintPassthroughInformation, "print-passthrough", false );
m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false );
- // n-best
- params = m_parameter->GetParam("n-best-list");
- if (params) {
- if (params->size() >= 2) {
- m_nBestFilePath = params->at(0);
- m_nBestSize = Scan<size_t>( params->at(1) );
- m_onlyDistinctNBest=(params->size()>2 && params->at(2)=="distinct");
- } else {
- std::cerr << "wrong format for switch -n-best-list file size [disinct]";
- return false;
- }
- } else {
- m_nBestSize = 0;
- }
-
- m_parameter->SetParameter<size_t>(m_nBestFactor, "n-best-factor", 20);
-
- //lattice samples
- params = m_parameter->GetParam("lattice-samples");
- if (params) {
- if (params->size() ==2 ) {
- m_latticeSamplesFilePath = params->at(0);
- m_latticeSamplesSize = Scan<size_t>(params->at(1));
- } else {
- std::cerr <<"wrong format for switch -lattice-samples file size";
- return false;
- }
- } else {
- m_latticeSamplesSize = 0;
- }
-
// word graph
params = m_parameter->GetParam("output-word-graph");
if (params && params->size() == 2)
@@ -260,12 +283,6 @@ bool StaticData::LoadData(Parameter *parameter)
m_parameter->SetParameter<string>(m_outputUnknownsFile, "output-unknowns", "");
- // include feature names in the n-best list
- m_parameter->SetParameter( m_labeledNBestList, "labeled-n-best-list", true );
-
- // include word alignment in the n-best list
- m_parameter->SetParameter( m_nBestIncludesSegmentation, "include-segmentation-in-n-best", false );
-
// printing source phrase spans
m_parameter->SetParameter( m_reportSegmentation, "report-segmentation", false );
m_parameter->SetParameter( m_reportSegmentationEnriched, "report-segmentation-enriched", false );
@@ -273,8 +290,159 @@ bool StaticData::LoadData(Parameter *parameter)
// print all factors of output translations
m_parameter->SetParameter( m_reportAllFactors, "report-all-factors", false );
+ //Print Translation Options
+ m_parameter->SetParameter(m_printTranslationOptions, "print-translation-option", false );
+
+ //Print All Derivations
+ m_parameter->SetParameter(m_printAllDerivations , "print-all-derivations", false );
+
+ // additional output
+ m_parameter->SetParameter<string>(m_detailedTranslationReportingFilePath, "translation-details", "");
+ m_parameter->SetParameter<string>(m_detailedTreeFragmentsTranslationReportingFilePath, "tree-translation-details", "");
+
+ //DIMw
+ m_parameter->SetParameter<string>(m_detailedAllTranslationReportingFilePath, "translation-all-details", "");
+
+ m_parameter->SetParameter<long>(m_startTranslationId, "start-translation-id", 0);
+
+
+ //lattice samples
+ params = m_parameter->GetParam("lattice-samples");
+ if (params) {
+ if (params->size() ==2 ) {
+ m_latticeSamplesFilePath = params->at(0);
+ m_latticeSamplesSize = Scan<size_t>(params->at(1));
+ } else {
+ std::cerr <<"wrong format for switch -lattice-samples file size";
+ return false;
+ }
+ } else {
+ m_latticeSamplesSize = 0;
+ }
+ return true;
+}
+
+
+bool
+StaticData
+::ini_nbest_options()
+{
+ const PARAM_VEC *params;
+ // n-best
+ params = m_parameter->GetParam("n-best-list");
+ if (params) {
+ if (params->size() >= 2) {
+ m_nBestFilePath = params->at(0);
+ m_nBestSize = Scan<size_t>( params->at(1) );
+ m_onlyDistinctNBest=(params->size()>2 && params->at(2)=="distinct");
+ } else {
+ std::cerr << "wrong format for switch -n-best-list file size [disinct]";
+ return false;
+ }
+ } else {
+ m_nBestSize = 0;
+ }
+
+ m_parameter->SetParameter<size_t>(m_nBestFactor, "n-best-factor", 20);
+
+
+ m_parameter->SetParameter(m_PrintAlignmentInfoNbest,
+ "print-alignment-info-in-n-best", false );
+
+ // include feature names in the n-best list
+ m_parameter->SetParameter(m_labeledNBestList, "labeled-n-best-list", true );
+
+ // include word alignment in the n-best list
+ m_parameter->SetParameter(m_nBestIncludesSegmentation,
+ "include-segmentation-in-n-best", false );
+
// print all factors of output translations
- m_parameter->SetParameter( m_reportAllFactorsNBest, "report-all-factors-in-n-best", false );
+ m_parameter->SetParameter(m_reportAllFactorsNBest,
+ "report-all-factors-in-n-best", false );
+
+ m_parameter->SetParameter(m_printNBestTrees, "n-best-trees", false );
+ return true;
+}
+
+void
+StaticData
+::ini_compact_table_options()
+{
+ // Compact phrase table and reordering model
+ m_parameter->SetParameter(m_minphrMemory, "minphr-memory", false );
+ m_parameter->SetParameter(m_minlexrMemory, "minlexr-memory", false );
+}
+
+void
+StaticData
+::ini_lm_options()
+{
+ m_parameter->SetParameter<size_t>(m_lmcache_cleanup_threshold, "clean-lm-cache", 1);
+}
+
+// threads, timeouts, etc.
+bool
+StaticData
+::ini_performance_options()
+{
+ const PARAM_VEC *params;
+ m_parameter->SetParameter<size_t>(m_timeout_threshold, "time-out", -1);
+ m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true;
+
+ m_threadCount = 1;
+ params = m_parameter->GetParam("threads");
+ if (params && params->size()) {
+ if (params->at(0) == "all") {
+#ifdef WITH_THREADS
+ m_threadCount = boost::thread::hardware_concurrency();
+ if (!m_threadCount) {
+ std::cerr << "-threads all specified but Boost doesn't know how many cores there are";
+ return false;
+ }
+#else
+ std::cerr << "-threads all specified but moses not built with thread support";
+ return false;
+#endif
+ } else {
+ m_threadCount = Scan<int>(params->at(0));
+ if (m_threadCount < 1) {
+ std::cerr << "Specify at least one thread.";
+ return false;
+ }
+#ifndef WITH_THREADS
+ if (m_threadCount > 1) {
+ std::cerr << "Error: Thread count of " << params->at(0)
+ << " but moses not built with thread support";
+ return false;
+ }
+#endif
+ }
+ }
+ return true;
+}
+
+void
+StaticData
+::ini_cube_pruning_options()
+{
+ m_parameter->SetParameter(m_cubePruningPopLimit, "cube-pruning-pop-limit",
+ DEFAULT_CUBE_PRUNING_POP_LIMIT);
+ m_parameter->SetParameter(m_cubePruningDiversity, "cube-pruning-diversity",
+ DEFAULT_CUBE_PRUNING_DIVERSITY);
+ m_parameter->SetParameter(m_cubePruningLazyScoring, "cube-pruning-lazy-scoring",
+ false);
+}
+
+void
+StaticData
+::ini_factor_maps()
+{
+ const PARAM_VEC *params;
+ // factor delimiter
+ m_parameter->SetParameter<string>(m_factorDelimiter, "factor-delimiter", "|");
+ if (m_factorDelimiter == "none") {
+ m_factorDelimiter = "";
+ }
//input factors
params = m_parameter->GetParam("input-factors");
@@ -294,31 +462,44 @@ bool StaticData::LoadData(Parameter *parameter)
// default. output factor 0
m_outputFactorOrder.push_back(0);
}
+}
+
+void
+StaticData
+::ini_oov_options()
+{
+ // unknown word processing
+ m_parameter->SetParameter(m_dropUnknown, "drop-unknown", false );
+ m_parameter->SetParameter(m_markUnknown, "mark-unknown", false );
+
+ m_parameter->SetParameter(m_lmEnableOOVFeature, "lmodel-oov-feature", false);
//source word deletion
m_parameter->SetParameter(m_wordDeletionEnabled, "phrase-drop-allowed", false );
- //Disable discarding
- m_parameter->SetParameter(m_disableDiscarding, "disable-discarding", false);
+}
- //Print Translation Options
- m_parameter->SetParameter(m_printTranslationOptions, "print-translation-option", false );
+void
+StaticData
+::ini_distortion_options()
+{
+ // reordering constraints
+ m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1);
- //Print All Derivations
- m_parameter->SetParameter(m_printAllDerivations , "print-all-derivations", false );
+ m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false );
- // additional output
- m_parameter->SetParameter<string>(m_detailedTranslationReportingFilePath, "translation-details", "");
- m_parameter->SetParameter<string>(m_detailedTreeFragmentsTranslationReportingFilePath, "tree-translation-details", "");
+ // early distortion cost
+ m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false );
- //DIMw
- m_parameter->SetParameter<string>(m_detailedAllTranslationReportingFilePath, "translation-all-details", "");
- // reordering constraints
- m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1);
- m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false );
+}
+bool
+StaticData
+::ini_stack_decoding_options()
+{
+ const PARAM_VEC *params;
// settings for pruning
m_parameter->SetParameter(m_maxHypoStackSize, "stack", DEFAULT_MAX_HYPOSTACK_SIZE);
@@ -341,42 +522,53 @@ bool StaticData::LoadData(Parameter *parameter)
m_parameter->SetParameter(m_earlyDiscardingThreshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD);
m_earlyDiscardingThreshold = TransformScore(m_earlyDiscardingThreshold);
+ return true;
+}
+void
+StaticData
+::ini_phrase_lookup_options()
+{
m_parameter->SetParameter(m_translationOptionThreshold, "translation-option-threshold", DEFAULT_TRANSLATION_OPTION_THRESHOLD);
m_translationOptionThreshold = TransformScore(m_translationOptionThreshold);
m_parameter->SetParameter(m_maxNoTransOptPerCoverage, "max-trans-opt-per-coverage", DEFAULT_MAX_TRANS_OPT_SIZE);
m_parameter->SetParameter(m_maxNoPartTransOpt, "max-partial-trans-opt", DEFAULT_MAX_PART_TRANS_OPT_SIZE);
m_parameter->SetParameter(m_maxPhraseLength, "max-phrase-length", DEFAULT_MAX_PHRASE_LENGTH);
- m_parameter->SetParameter(m_cubePruningPopLimit, "cube-pruning-pop-limit", DEFAULT_CUBE_PRUNING_POP_LIMIT);
- m_parameter->SetParameter(m_cubePruningDiversity, "cube-pruning-diversity", DEFAULT_CUBE_PRUNING_DIVERSITY);
- m_parameter->SetParameter(m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", false);
-
- // early distortion cost
- m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false );
+}
- // unknown word processing
- m_parameter->SetParameter(m_dropUnknown, "drop-unknown", false );
- m_parameter->SetParameter(m_markUnknown, "mark-unknown", false );
+void
+StaticData
+::ini_zombie_options()
+{
+ //Disable discarding
+ m_parameter->SetParameter(m_disableDiscarding, "disable-discarding", false);
- m_parameter->SetParameter(m_lmEnableOOVFeature, "lmodel-oov-feature", false);
+}
+void
+StaticData
+::ini_mbr_options()
+{
// minimum Bayes risk decoding
m_parameter->SetParameter(m_mbr, "minimum-bayes-risk", false );
m_parameter->SetParameter<size_t>(m_mbrSize, "mbr-size", 200);
m_parameter->SetParameter(m_mbrScale, "mbr-scale", 1.0f);
+}
+
+void
+StaticData
+::ini_lmbr_options()
+{
+ const PARAM_VEC *params;
//lattice mbr
m_parameter->SetParameter(m_useLatticeMBR, "lminimum-bayes-risk", false );
if (m_useLatticeMBR && m_mbr) {
cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl;
exit(1);
}
-
- //mira training
- m_parameter->SetParameter(m_mira, "mira", false );
-
// lattice MBR
if (m_useLatticeMBR) m_mbr = true;
@@ -391,6 +583,12 @@ bool StaticData::LoadData(Parameter *parameter)
m_lmbrThetas = Scan<float>(*params);
}
+}
+
+void
+StaticData
+::ini_consensus_decoding_options()
+{
//consensus decoding
m_parameter->SetParameter(m_useConsensusDecoding, "consensus-decoding", false );
if (m_useConsensusDecoding && m_mbr) {
@@ -398,115 +596,74 @@ bool StaticData::LoadData(Parameter *parameter)
exit(1);
}
if (m_useConsensusDecoding) m_mbr=true;
+}
- m_parameter->SetParameter(m_defaultNonTermOnlyForEmptyRange, "default-non-term-for-empty-range-only", false );
- m_parameter->SetParameter(m_printNBestTrees, "n-best-trees", false );
+void
+StaticData
+::ini_mira_options()
+{
+ //mira training
+ m_parameter->SetParameter(m_mira, "mira", false );
+}
- // S2T decoder
- m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm", RecursiveCYKPlus);
+bool StaticData::LoadData(Parameter *parameter)
+{
+ ResetUserTime();
+ m_parameter = parameter;
- // Compact phrase table and reordering model
- m_parameter->SetParameter(m_minphrMemory, "minphr-memory", false );
- m_parameter->SetParameter(m_minlexrMemory, "minlexr-memory", false );
+ const PARAM_VEC *params;
- m_parameter->SetParameter<size_t>(m_timeout_threshold, "time-out", -1);
- m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true;
+ // to cube or not to cube
+ m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal);
- m_parameter->SetParameter<size_t>(m_lmcache_cleanup_threshold, "clean-lm-cache", 1);
+ if (IsSyntax())
+ LoadChartDecodingParameters();
- m_threadCount = 1;
- params = m_parameter->GetParam("threads");
- if (params && params->size()) {
- if (params->at(0) == "all") {
-#ifdef WITH_THREADS
- m_threadCount = boost::thread::hardware_concurrency();
- if (!m_threadCount) {
- std::cerr << "-threads all specified but Boost doesn't know how many cores there are";
- return false;
- }
-#else
- std::cerr << "-threads all specified but moses not built with thread support";
- return false;
-#endif
- } else {
- m_threadCount = Scan<int>(params->at(0));
- if (m_threadCount < 1) {
- std::cerr << "Specify at least one thread.";
- return false;
- }
-#ifndef WITH_THREADS
- if (m_threadCount > 1) {
- std::cerr << "Error: Thread count of " << params->at(0) << " but moses not built with thread support";
- return false;
- }
-#endif
- }
- }
+ // ORDER HERE MATTERS, SO DON'T CHANGE IT UNLESS YOU KNOW WHAT YOU ARE DOING!
+ // input, output
+ ini_factor_maps();
+ ini_input_options();
+ if (!ini_output_options()) return false;
+ if (!ini_nbest_options()) return false;
- m_parameter->SetParameter<long>(m_startTranslationId, "start-translation-id", 0);
+ // threading etc.
+ if (!ini_performance_options()) return false;
- // use of xml in input
- m_parameter->SetParameter<XmlInputType>(m_xmlInputType, "xml-input", XmlPassThrough);
+ // model loading
+ ini_compact_table_options();
- // specify XML tags opening and closing brackets for XML option
- params = m_parameter->GetParam("xml-brackets");
- if (params && params->size()) {
- std::vector<std::string> brackets = Tokenize(params->at(0));
- if(brackets.size()!=2) {
- cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
- exit(1);
- }
- m_xmlBrackets.first= brackets[0];
- m_xmlBrackets.second=brackets[1];
- VERBOSE(1,"XML tags opening and closing brackets for XML input are: "
- << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl);
- }
+ // search
+ ini_distortion_options();
+ if (!ini_stack_decoding_options()) return false;
+ ini_phrase_lookup_options();
+ ini_cube_pruning_options();
- m_parameter->SetParameter(m_placeHolderFactor, "placeholder-factor", NOT_FOUND);
+ ini_oov_options();
+ ini_mbr_options();
+ ini_lmbr_options();
+ ini_consensus_decoding_options();
- std::map<std::string, std::string> featureNameOverride = OverrideFeatureNames();
+ ini_mira_options();
- // all features
- map<string, int> featureIndexMap;
+ // S2T decoder
+ m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm",
+ RecursiveCYKPlus);
- params = m_parameter->GetParam("feature");
- for (size_t i = 0; params && i < params->size(); ++i) {
- const string &line = Trim(params->at(i));
- VERBOSE(1,"line=" << line << endl);
- if (line.empty())
- continue;
- vector<string> toks = Tokenize(line);
+ ini_zombie_options(); // probably dead, or maybe not
- string &feature = toks[0];
- std::map<std::string, std::string>::const_iterator iter = featureNameOverride.find(feature);
- if (iter == featureNameOverride.end()) {
- // feature name not override
- m_registry.Construct(feature, line);
- } else {
- // replace feature name with new name
- string newName = iter->second;
- feature = newName;
- string newLine = Join(" ", toks);
- m_registry.Construct(newName, newLine);
- }
- }
+ m_parameter->SetParameter(m_placeHolderFactor, "placeholder-factor", NOT_FOUND);
- NoCache();
- OverrideFeatures();
+ // FEATURE FUNCTION INITIALIZATION HAPPENS HERE ===============================
+ initialize_features();
- if (m_parameter->GetParam("show-weights") == NULL) {
+ if (m_parameter->GetParam("show-weights") == NULL)
LoadFeatureFunctions();
- }
LoadDecodeGraphs();
-
- if (!CheckWeights()) {
- return false;
- }
-
- //Add any other features here.
+ // sanity check that there are no weights without an associated FF
+ if (!CheckWeights()) return false;
//Load extra feature weights
string weightFile;
@@ -523,13 +680,21 @@ bool StaticData::LoadData(Parameter *parameter)
//Load sparse features from config (overrules weight file)
LoadSparseWeightsFromConfig();
- // alternate weight settings
+ // load alternate weight settings
+ //
+ // When and where are these used??? [UG]
+ //
+ // Update: Just checked the manual. The config file is NOT the right
+ // place to do this. [UG]
+ //
+ // <TODO>
+ // * Eliminate alternate-weight-setting. Alternate weight settings should
+ // be provided with the input, not in the config file.
+ // </TODO>
params = m_parameter->GetParam("alternate-weight-setting");
- if (params && params->size()) {
- if (!LoadAlternateWeightSettings()) {
- return false;
- }
- }
+ if (params && params->size() && !LoadAlternateWeightSettings())
+ return false;
+
return true;
}
@@ -846,27 +1011,34 @@ float StaticData::GetWeightWordPenalty() const
return weightWP;
}
-void StaticData::InitializeForInput(const InputType& source) const
+void
+StaticData
+::InitializeForInput(ttasksptr const& ttask) const
{
- const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
+ const std::vector<FeatureFunction*> &producers
+ = FeatureFunction::GetFeatureFunctions();
for(size_t i=0; i<producers.size(); ++i) {
FeatureFunction &ff = *producers[i];
if (! IsFeatureFunctionIgnored(ff)) {
Timer iTime;
iTime.start();
- ff.InitializeForInput(source);
- VERBOSE(3,"InitializeForInput( " << ff.GetScoreProducerDescription() << " ) = " << iTime << endl);
+ ff.InitializeForInput(ttask);
+ VERBOSE(3,"InitializeForInput( " << ff.GetScoreProducerDescription() << " )"
+ << "= " << iTime << endl);
}
}
}
-void StaticData::CleanUpAfterSentenceProcessing(const InputType& source) const
+void
+StaticData
+::CleanUpAfterSentenceProcessing(ttasksptr const& ttask) const
{
- const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
+ const std::vector<FeatureFunction*> &producers
+ = FeatureFunction::GetFeatureFunctions();
for(size_t i=0; i<producers.size(); ++i) {
FeatureFunction &ff = *producers[i];
if (! IsFeatureFunctionIgnored(ff)) {
- ff.CleanUpAfterSentenceProcessing(source);
+ ff.CleanUpAfterSentenceProcessing(ttask);
}
}
}
@@ -938,7 +1110,8 @@ bool StaticData::CheckWeights() const
}
if (!weightNames.empty()) {
- cerr << "The following weights have no feature function. Maybe incorrectly spelt weights: ";
+ cerr << "The following weights have no feature function. "
+ << "Maybe incorrectly spelt weights: ";
set<string>::iterator iter;
for (iter = weightNames.begin(); iter != weightNames.end(); ++iter) {
cerr << *iter << ",";
@@ -1095,7 +1268,9 @@ void StaticData::NoCache()
}
}
-std::map<std::string, std::string> StaticData::OverrideFeatureNames()
+std::map<std::string, std::string>
+StaticData
+::OverrideFeatureNames()
{
std::map<std::string, std::string> ret;
diff --git a/moses/StaticData.h b/moses/StaticData.h
index e06a67b9d..438ac0633 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -236,6 +236,26 @@ protected:
// number of nonterminal labels
// size_t m_nonTerminalSize;
+
+ void ini_compact_table_options();
+ void ini_consensus_decoding_options();
+ void ini_cube_pruning_options();
+ void ini_distortion_options();
+ void ini_factor_maps();
+ void ini_input_options();
+ void ini_lm_options();
+ void ini_lmbr_options();
+ void ini_mbr_options();
+ void ini_mira_options();
+ bool ini_nbest_options();
+ void ini_oov_options();
+ bool ini_output_options();
+ bool ini_performance_options();
+ void ini_phrase_lookup_options();
+ bool ini_stack_decoding_options();
+ void ini_zombie_options();
+
+ void initialize_features();
public:
bool IsAlwaysCreateDirectTranslationOption() const {
@@ -446,19 +466,29 @@ public:
SearchAlgorithm GetSearchAlgorithm() const {
return m_searchAlgorithm;
}
- bool IsSyntax() const {
- return m_searchAlgorithm == CYKPlus ||
- m_searchAlgorithm == ChartIncremental ||
- m_searchAlgorithm == SyntaxS2T ||
- m_searchAlgorithm == SyntaxT2S ||
- m_searchAlgorithm == SyntaxT2S_SCFG ||
- m_searchAlgorithm == SyntaxF2S;
- }
- const ScoreComponentCollection& GetAllWeights() const {
- return m_allWeights;
+ // bool IsSyntax() const {
+ // return m_searchAlgorithm == CYKPlus ||
+ // m_searchAlgorithm == ChartIncremental ||
+ // m_searchAlgorithm == SyntaxS2T ||
+ // m_searchAlgorithm == SyntaxT2S ||
+ // m_searchAlgorithm == SyntaxT2S_SCFG ||
+ // m_searchAlgorithm == SyntaxF2S;
+ // }
+
+ bool IsSyntax(SearchAlgorithm algo = DefaultSearchAlgorithm) const
+ {
+ if (algo == DefaultSearchAlgorithm)
+ algo = m_searchAlgorithm;
+ return (algo == CYKPlus || algo == ChartIncremental ||
+ algo == SyntaxS2T || algo == SyntaxT2S ||
+ algo == SyntaxF2S || algo == SyntaxT2S_SCFG);
}
+ const ScoreComponentCollection&
+ GetAllWeights() const
+ { return m_allWeights; }
+
void SetAllWeights(const ScoreComponentCollection& weights) {
m_allWeights = weights;
}
@@ -742,8 +772,9 @@ public:
}
//sentence (and thread) specific initialisationn and cleanup
- void InitializeForInput(const InputType& source) const;
- void CleanUpAfterSentenceProcessing(const InputType& source) const;
+ // void InitializeForInput(const InputType& source, ttaskptr const& ttask) const;
+ void InitializeForInput(ttasksptr const& ttask) const;
+ void CleanUpAfterSentenceProcessing(ttasksptr const& ttask) const;
void LoadFeatureFunctions();
bool CheckWeights() const;
diff --git a/moses/Syntax/F2S/Manager-inl.h b/moses/Syntax/F2S/Manager-inl.h
index f7f8f0ae9..3aedc640e 100644
--- a/moses/Syntax/F2S/Manager-inl.h
+++ b/moses/Syntax/F2S/Manager-inl.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
#pragma once
#include "moses/DecodeGraph.h"
@@ -32,14 +33,14 @@ namespace F2S
{
template<typename RuleMatcher>
-Manager<RuleMatcher>::Manager(const InputType &source)
- : Syntax::Manager(source)
+Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
+ : Syntax::Manager(ttask)
{
- if (const ForestInput *p = dynamic_cast<const ForestInput*>(&source)) {
+ if (const ForestInput *p = dynamic_cast<const ForestInput*>(&m_source)) {
m_forest = p->GetForest();
m_rootVertex = p->GetRootVertex();
- m_sentenceLength = p->GetSize();
- } else if (const TreeInput *p = dynamic_cast<const TreeInput*>(&source)) {
+ m_sentenceLength = p->GetSize();
+ } else if (const TreeInput *p = dynamic_cast<const TreeInput*>(&m_source)) {
T2S::InputTreeBuilder builder;
T2S::InputTree tmpTree;
builder.Build(*p, "Q", tmpTree);
diff --git a/moses/Syntax/F2S/Manager.h b/moses/Syntax/F2S/Manager.h
index 90f34c04b..1dcab4f5e 100644
--- a/moses/Syntax/F2S/Manager.h
+++ b/moses/Syntax/F2S/Manager.h
@@ -30,17 +30,16 @@ template<typename RuleMatcher>
class Manager : public Syntax::Manager
{
public:
- Manager(const InputType &);
+ Manager(ttasksptr const& ttask);
void Decode();
// Get the SHyperedge for the 1-best derivation.
const SHyperedge *GetBestSHyperedge() const;
- void ExtractKBest(
- std::size_t k,
- std::vector<boost::shared_ptr<KBestExtractor::Derivation> > &kBestList,
- bool onlyDistinct=false) const;
+ typedef std::vector<boost::shared_ptr<KBestExtractor::Derivation> > kBestList_t;
+ void ExtractKBest(std::size_t k, kBestList_t& kBestList,
+ bool onlyDistinct=false) const;
void OutputDetailedTranslationReport(OutputCollector *collector) const;
diff --git a/moses/Syntax/Manager.cpp b/moses/Syntax/Manager.cpp
index a11a0042f..f84890927 100644
--- a/moses/Syntax/Manager.cpp
+++ b/moses/Syntax/Manager.cpp
@@ -12,10 +12,9 @@ namespace Moses
namespace Syntax
{
-Manager::Manager(const InputType &source)
- : Moses::BaseManager(source)
-{
-}
+Manager::Manager(ttasksptr const& ttask)
+ : Moses::BaseManager(ttask)
+{ }
void Manager::OutputBest(OutputCollector *collector) const
{
diff --git a/moses/Syntax/Manager.h b/moses/Syntax/Manager.h
index 8d814f604..ed36c7c1d 100644
--- a/moses/Syntax/Manager.h
+++ b/moses/Syntax/Manager.h
@@ -14,7 +14,7 @@ namespace Syntax
class Manager : public BaseManager
{
public:
- Manager(const InputType &);
+ Manager(ttasksptr const& ttask);
// Virtual functions from Moses::BaseManager that are implemented the same
// way for all Syntax managers.
diff --git a/moses/Syntax/RuleTableFF.cpp b/moses/Syntax/RuleTableFF.cpp
index 37063e048..dd24493f0 100644
--- a/moses/Syntax/RuleTableFF.cpp
+++ b/moses/Syntax/RuleTableFF.cpp
@@ -17,7 +17,7 @@ namespace Syntax
std::vector<RuleTableFF*> RuleTableFF::s_instances;
RuleTableFF::RuleTableFF(const std::string &line)
- : PhraseDictionary(line)
+ : PhraseDictionary(line, true)
{
ReadParameters();
// caching for memory pt is pointless
diff --git a/moses/Syntax/S2T/Manager-inl.h b/moses/Syntax/S2T/Manager-inl.h
index 15594d589..ef08752b6 100644
--- a/moses/Syntax/S2T/Manager-inl.h
+++ b/moses/Syntax/S2T/Manager-inl.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
#pragma once
#include <iostream>
@@ -30,12 +31,11 @@ namespace S2T
{
template<typename Parser>
-Manager<Parser>::Manager(const InputType &source)
- : Syntax::Manager(source)
- , m_pchart(source.GetSize(), Parser::RequiresCompressedChart())
- , m_schart(source.GetSize())
-{
-}
+Manager<Parser>::Manager(ttasksptr const& ttask)
+ : Syntax::Manager(ttask)
+ , m_pchart(m_source.GetSize(), Parser::RequiresCompressedChart())
+ , m_schart(m_source.GetSize())
+{ }
template<typename Parser>
void Manager<Parser>::InitializeCharts()
diff --git a/moses/Syntax/S2T/Manager.h b/moses/Syntax/S2T/Manager.h
index 0961c8e77..711d6f9d8 100644
--- a/moses/Syntax/S2T/Manager.h
+++ b/moses/Syntax/S2T/Manager.h
@@ -30,7 +30,7 @@ template<typename Parser>
class Manager : public Syntax::Manager
{
public:
- Manager(const InputType &);
+ Manager(ttasksptr const& ttask);
void Decode();
diff --git a/moses/Syntax/T2S/Manager-inl.h b/moses/Syntax/T2S/Manager-inl.h
index c0df884e1..90ecb35bf 100644
--- a/moses/Syntax/T2S/Manager-inl.h
+++ b/moses/Syntax/T2S/Manager-inl.h
@@ -27,10 +27,10 @@ namespace T2S
{
template<typename RuleMatcher>
-Manager<RuleMatcher>::Manager(const InputType &source)
- : Syntax::Manager(source)
+Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
+ : Syntax::Manager(ttask)
{
- if (const TreeInput *p = dynamic_cast<const TreeInput*>(&source)) {
+ if (const TreeInput *p = dynamic_cast<const TreeInput*>(&m_source)) {
// Construct the InputTree.
InputTreeBuilder builder;
builder.Build(*p, "Q", m_inputTree);
diff --git a/moses/Syntax/T2S/Manager.h b/moses/Syntax/T2S/Manager.h
index c8421477c..b2036aba0 100644
--- a/moses/Syntax/T2S/Manager.h
+++ b/moses/Syntax/T2S/Manager.h
@@ -30,7 +30,7 @@ template<typename RuleMatcher>
class Manager : public Syntax::Manager
{
public:
- Manager(const InputType &);
+ Manager(ttasksptr const& ttask);
void Decode();
diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index d6abd16ff..5a26e44cc 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -223,8 +223,8 @@ void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProduce
m_scoreBreakdown.Assign(translationScoreProducer, sparseString.as_string());
}
-boost::shared_ptr<Scores>
-mergescores(boost::shared_ptr<Scores> const& a,
+boost::shared_ptr<Scores>
+mergescores(boost::shared_ptr<Scores> const& a,
boost::shared_ptr<Scores> const& b)
{
boost::shared_ptr<Scores> ret;
@@ -243,7 +243,7 @@ mergescores(boost::shared_ptr<Scores> const& a,
return ret;
}
-void
+void
TargetPhrase::
Merge(const TargetPhrase &copy, const std::vector<FactorType>& factorVec)
{
@@ -256,7 +256,7 @@ Merge(const TargetPhrase &copy, const std::vector<FactorType>& factorVec)
BOOST_FOREACH(item const& s, copy.m_cached_scores)
{
pair<iter,bool> foo = m_cached_scores.insert(s);
- if (foo.second == false)
+ if (foo.second == false)
foo.first->second = mergescores(foo.first->second, s.second);
}
}
@@ -278,8 +278,8 @@ GetExtraScores(FeatureFunction const* ff) const
void
TargetPhrase::
-SetExtraScores(FeatureFunction const* ff,
- boost::shared_ptr<Scores> const& s)
+SetExtraScores(FeatureFunction const* ff,
+ boost::shared_ptr<Scores> const& s)
{ m_cached_scores[ff] = s; }
diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h
index 8230373db..1f5960121 100644
--- a/moses/TargetPhrase.h
+++ b/moses/TargetPhrase.h
@@ -52,16 +52,16 @@ class PhraseDictionary;
class TargetPhrase: public Phrase
{
public:
- typedef std::map<FeatureFunction const*, boost::shared_ptr<Scores> >
+ typedef std::map<FeatureFunction const*, boost::shared_ptr<Scores> >
ScoreCache_t;
ScoreCache_t const& GetExtraScores() const;
Scores const* GetExtraScores(FeatureFunction const* ff) const;
- void SetExtraScores(FeatureFunction const* ff,
+ void SetExtraScores(FeatureFunction const* ff,
boost::shared_ptr<Scores> const& scores);
-
+
private:
- ScoreCache_t m_cached_scores;
-
+ ScoreCache_t m_cached_scores;
+
private:
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
friend void swap(TargetPhrase &first, TargetPhrase &second);
@@ -198,7 +198,7 @@ public:
return found->second;
}
-
+
// To be set by the FF that needs it, by default the rule source = NULL
// make a copy of the source side of the rule
diff --git a/moses/ThreadPool.h b/moses/ThreadPool.h
index 024d1c54d..b7d459bb2 100644
--- a/moses/ThreadPool.h
+++ b/moses/ThreadPool.h
@@ -27,7 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <vector>
#include <boost/shared_ptr.hpp>
-
+
#ifdef WITH_THREADS
#include <boost/bind.hpp>
#include <boost/thread.hpp>
diff --git a/moses/TrainingTask.h b/moses/TrainingTask.h
index 885e8fd16..6166b4d42 100644
--- a/moses/TrainingTask.h
+++ b/moses/TrainingTask.h
@@ -1,9 +1,11 @@
+//-*- c++ -*-
#pragma once
#include <boost/smart_ptr/shared_ptr.hpp>
#include "moses/ThreadPool.h"
#include "moses/TranslationOptionCollection.h"
#include "moses/IOWrapper.h"
+#include "moses/TranslationTask.h"
namespace Moses
{
@@ -11,35 +13,57 @@ class InputType;
class OutputCollector;
-class TrainingTask : public Moses::Task
+class TrainingTask : public Moses::TranslationTask
{
+protected:
+ TrainingTask(boost::shared_ptr<Moses::InputType> const source,
+ boost::shared_ptr<Moses::IOWrapper> const ioWrapper)
+ : TranslationTask(source, ioWrapper)
+ { }
+
public:
- TrainingTask(Moses::InputType* source, Moses::IOWrapper &ioWrapper)
- : m_source(source)
- , m_ioWrapper(ioWrapper) {
+ // factory function
+ static boost::shared_ptr<TrainingTask>
+ create(boost::shared_ptr<InputType> const& source)
+ {
+ boost::shared_ptr<IOWrapper> nix;
+ boost::shared_ptr<TrainingTask> ret(new TrainingTask(source, nix));
+ ret->m_self = ret;
+ return ret;
}
- ~TrainingTask() {
+ // factory function
+ static boost::shared_ptr<TrainingTask>
+ create(boost::shared_ptr<InputType> const& source,
+ boost::shared_ptr<IOWrapper> const& ioWrapper)
+ {
+ boost::shared_ptr<TrainingTask> ret(new TrainingTask(source, ioWrapper));
+ ret->m_self = ret;
+ return ret;
}
+ ~TrainingTask()
+ { }
+
void Run() {
- StaticData::Instance().InitializeForInput(*m_source);
+ StaticData::Instance().InitializeForInput(this->self());
std::cerr << *m_source << std::endl;
- TranslationOptionCollection *transOptColl = m_source->CreateTranslationOptionCollection();
+ TranslationOptionCollection *transOptColl
+ = m_source->CreateTranslationOptionCollection(this->self());
transOptColl->CreateTranslationOptions();
delete transOptColl;
- StaticData::Instance().CleanUpAfterSentenceProcessing(*m_source);
+ StaticData::Instance().CleanUpAfterSentenceProcessing(this->self());
}
private:
- Moses::InputType* m_source;
- Moses::IOWrapper &m_ioWrapper;
+ // Moses::InputType* m_source;
+ // Moses::IOWrapper &m_ioWrapper;
};
diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.h b/moses/TranslationModel/CompactPT/BlockHashIndex.h
index b3f5e6f4b..130dd89fc 100644
--- a/moses/TranslationModel/CompactPT/BlockHashIndex.h
+++ b/moses/TranslationModel/CompactPT/BlockHashIndex.h
@@ -161,8 +161,8 @@ public:
}
#ifdef WITH_THREADS
-
- boost::shared_ptr<HashTask<Keys> >
+
+ boost::shared_ptr<HashTask<Keys> >
ht(new HashTask<Keys>(current, *this, keys));
m_threadPool.Submit(ht);
#else
diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
index fb6946bbc..dfde88708 100644
--- a/moses/TranslationModel/CompactPT/MurmurHash3.cpp
+++ b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
@@ -1,425 +1,425 @@
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
-
-#include "MurmurHash3.h"
-
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER)
-
-#define FORCE_INLINE __forceinline
-
-#include <cstdlib>
-
-#define ROTL32(x,y) _rotl(x,y)
-#define ROTL64(x,y) _rotl64(x,y)
-
-#define BIG_CONSTANT(x) (x)
-
-// Other compilers
-
-#else // defined(_MSC_VER)
-
-#define FORCE_INLINE inline __attribute__((always_inline))
-
-inline uint32_t rotl32 ( uint32_t x, int8_t r )
-{
- return (x << r) | (x >> (32 - r));
-}
-
-inline uint64_t rotl64 ( uint64_t x, int8_t r )
-{
- return (x << r) | (x >> (64 - r));
-}
-
-#define ROTL32(x,y) rotl32(x,y)
-#define ROTL64(x,y) rotl64(x,y)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-#endif // !defined(_MSC_VER)
-
-//-----------------------------------------------------------------------------
-// Block read - if your platform needs to do endian-swapping or can only
-// handle aligned reads, do the conversion here
-
-FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
-{
- return p[i];
-}
-
-FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
-{
- return p[i];
-}
-
-//-----------------------------------------------------------------------------
-// Finalization mix - force all bits of a hash block to avalanche
-
-FORCE_INLINE uint32_t fmix ( uint32_t h )
-{
- h ^= h >> 16;
- h *= 0x85ebca6b;
- h ^= h >> 13;
- h *= 0xc2b2ae35;
- h ^= h >> 16;
-
- return h;
-}
-
-//----------
-
-FORCE_INLINE uint64_t fmix ( uint64_t k )
-{
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xff51afd7ed558ccd);
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
- k ^= k >> 33;
-
- return k;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x86_32 ( const void * key, int len,
- uint32_t seed, void * out )
-{
- const uint8_t * data = (const uint8_t*)key;
- const int nblocks = len / 4;
-
- uint32_t h1 = seed;
-
- uint32_t c1 = 0xcc9e2d51;
- uint32_t c2 = 0x1b873593;
-
- //----------
- // body
-
- const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
-
- for(int i = -nblocks; i; i++) {
- uint32_t k1 = getblock(blocks,i);
-
- k1 *= c1;
- k1 = ROTL32(k1,15);
- k1 *= c2;
-
- h1 ^= k1;
- h1 = ROTL32(h1,13);
- h1 = h1*5+0xe6546b64;
- }
-
- //----------
- // tail
-
- const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
-
- uint32_t k1 = 0;
-
- switch(len & 3) {
- case 3:
- k1 ^= tail[2] << 16;
- case 2:
- k1 ^= tail[1] << 8;
- case 1:
- k1 ^= tail[0];
- k1 *= c1;
- k1 = ROTL32(k1,15);
- k1 *= c2;
- h1 ^= k1;
- };
-
- //----------
- // finalization
-
- h1 ^= len;
-
- h1 = fmix(h1);
-
- *(uint32_t*)out = h1;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x86_128 ( const void * key, const int len,
- uint32_t seed, void * out )
-{
- const uint8_t * data = (const uint8_t*)key;
- const int nblocks = len / 16;
-
- uint32_t h1 = seed;
- uint32_t h2 = seed;
- uint32_t h3 = seed;
- uint32_t h4 = seed;
-
- uint32_t c1 = 0x239b961b;
- uint32_t c2 = 0xab0e9789;
- uint32_t c3 = 0x38b34ae5;
- uint32_t c4 = 0xa1e38b93;
-
- //----------
- // body
-
- const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
-
- for(int i = -nblocks; i; i++) {
- uint32_t k1 = getblock(blocks,i*4+0);
- uint32_t k2 = getblock(blocks,i*4+1);
- uint32_t k3 = getblock(blocks,i*4+2);
- uint32_t k4 = getblock(blocks,i*4+3);
-
- k1 *= c1;
- k1 = ROTL32(k1,15);
- k1 *= c2;
- h1 ^= k1;
-
- h1 = ROTL32(h1,19);
- h1 += h2;
- h1 = h1*5+0x561ccd1b;
-
- k2 *= c2;
- k2 = ROTL32(k2,16);
- k2 *= c3;
- h2 ^= k2;
-
- h2 = ROTL32(h2,17);
- h2 += h3;
- h2 = h2*5+0x0bcaa747;
-
- k3 *= c3;
- k3 = ROTL32(k3,17);
- k3 *= c4;
- h3 ^= k3;
-
- h3 = ROTL32(h3,15);
- h3 += h4;
- h3 = h3*5+0x96cd1c35;
-
- k4 *= c4;
- k4 = ROTL32(k4,18);
- k4 *= c1;
- h4 ^= k4;
-
- h4 = ROTL32(h4,13);
- h4 += h1;
- h4 = h4*5+0x32ac3b17;
- }
-
- //----------
- // tail
-
- const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
-
- uint32_t k1 = 0;
- uint32_t k2 = 0;
- uint32_t k3 = 0;
- uint32_t k4 = 0;
-
- switch(len & 15) {
- case 15:
- k4 ^= tail[14] << 16;
- case 14:
- k4 ^= tail[13] << 8;
- case 13:
- k4 ^= tail[12] << 0;
- k4 *= c4;
- k4 = ROTL32(k4,18);
- k4 *= c1;
- h4 ^= k4;
-
- case 12:
- k3 ^= tail[11] << 24;
- case 11:
- k3 ^= tail[10] << 16;
- case 10:
- k3 ^= tail[ 9] << 8;
- case 9:
- k3 ^= tail[ 8] << 0;
- k3 *= c3;
- k3 = ROTL32(k3,17);
- k3 *= c4;
- h3 ^= k3;
-
- case 8:
- k2 ^= tail[ 7] << 24;
- case 7:
- k2 ^= tail[ 6] << 16;
- case 6:
- k2 ^= tail[ 5] << 8;
- case 5:
- k2 ^= tail[ 4] << 0;
- k2 *= c2;
- k2 = ROTL32(k2,16);
- k2 *= c3;
- h2 ^= k2;
-
- case 4:
- k1 ^= tail[ 3] << 24;
- case 3:
- k1 ^= tail[ 2] << 16;
- case 2:
- k1 ^= tail[ 1] << 8;
- case 1:
- k1 ^= tail[ 0] << 0;
- k1 *= c1;
- k1 = ROTL32(k1,15);
- k1 *= c2;
- h1 ^= k1;
- };
-
- //----------
- // finalization
-
- h1 ^= len;
- h2 ^= len;
- h3 ^= len;
- h4 ^= len;
-
- h1 += h2;
- h1 += h3;
- h1 += h4;
- h2 += h1;
- h3 += h1;
- h4 += h1;
-
- h1 = fmix(h1);
- h2 = fmix(h2);
- h3 = fmix(h3);
- h4 = fmix(h4);
-
- h1 += h2;
- h1 += h3;
- h1 += h4;
- h2 += h1;
- h3 += h1;
- h4 += h1;
-
- ((uint32_t*)out)[0] = h1;
- ((uint32_t*)out)[1] = h2;
- ((uint32_t*)out)[2] = h3;
- ((uint32_t*)out)[3] = h4;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x64_128 ( const void * key, const int len,
- const uint32_t seed, void * out )
-{
- const uint8_t * data = (const uint8_t*)key;
- const int nblocks = len / 16;
-
- uint64_t h1 = seed;
- uint64_t h2 = seed;
-
- uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
- uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
-
- //----------
- // body
-
- const uint64_t * blocks = (const uint64_t *)(data);
-
- for(int i = 0; i < nblocks; i++) {
- uint64_t k1 = getblock(blocks,i*2+0);
- uint64_t k2 = getblock(blocks,i*2+1);
-
- k1 *= c1;
- k1 = ROTL64(k1,31);
- k1 *= c2;
- h1 ^= k1;
-
- h1 = ROTL64(h1,27);
- h1 += h2;
- h1 = h1*5+0x52dce729;
-
- k2 *= c2;
- k2 = ROTL64(k2,33);
- k2 *= c1;
- h2 ^= k2;
-
- h2 = ROTL64(h2,31);
- h2 += h1;
- h2 = h2*5+0x38495ab5;
- }
-
- //----------
- // tail
-
- const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
-
- uint64_t k1 = 0;
- uint64_t k2 = 0;
-
- switch(len & 15) {
- case 15:
- k2 ^= uint64_t(tail[14]) << 48;
- case 14:
- k2 ^= uint64_t(tail[13]) << 40;
- case 13:
- k2 ^= uint64_t(tail[12]) << 32;
- case 12:
- k2 ^= uint64_t(tail[11]) << 24;
- case 11:
- k2 ^= uint64_t(tail[10]) << 16;
- case 10:
- k2 ^= uint64_t(tail[ 9]) << 8;
- case 9:
- k2 ^= uint64_t(tail[ 8]) << 0;
- k2 *= c2;
- k2 = ROTL64(k2,33);
- k2 *= c1;
- h2 ^= k2;
-
- case 8:
- k1 ^= uint64_t(tail[ 7]) << 56;
- case 7:
- k1 ^= uint64_t(tail[ 6]) << 48;
- case 6:
- k1 ^= uint64_t(tail[ 5]) << 40;
- case 5:
- k1 ^= uint64_t(tail[ 4]) << 32;
- case 4:
- k1 ^= uint64_t(tail[ 3]) << 24;
- case 3:
- k1 ^= uint64_t(tail[ 2]) << 16;
- case 2:
- k1 ^= uint64_t(tail[ 1]) << 8;
- case 1:
- k1 ^= uint64_t(tail[ 0]) << 0;
- k1 *= c1;
- k1 = ROTL64(k1,31);
- k1 *= c2;
- h1 ^= k1;
- };
-
- //----------
- // finalization
-
- h1 ^= len;
- h2 ^= len;
-
- h1 += h2;
- h2 += h1;
-
- h1 = fmix(h1);
- h2 = fmix(h2);
-
- h1 += h2;
- h2 += h1;
-
- ((uint64_t*)out)[0] = h1;
- ((uint64_t*)out)[1] = h2;
-}
-
-//-----------------------------------------------------------------------------
-
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "MurmurHash3.h"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+
+#include <cstdlib>
+
+#define ROTL32(x,y) _rotl(x,y)
+#define ROTL64(x,y) _rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline))
+
+inline uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+ return (x << r) | (x >> (32 - r));
+}
+
+inline uint64_t rotl64 ( uint64_t x, int8_t r )
+{
+ return (x << r) | (x >> (64 - r));
+}
+
+#define ROTL32(x,y) rotl32(x,y)
+#define ROTL64(x,y) rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
+{
+ return p[i];
+}
+
+FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
+{
+ return p[i];
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix ( uint32_t h )
+{
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix ( uint64_t k )
+{
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+
+ return k;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32 ( const void * key, int len,
+ uint32_t seed, void * out )
+{
+ const uint8_t * data = (const uint8_t*)key;
+ const int nblocks = len / 4;
+
+ uint32_t h1 = seed;
+
+ uint32_t c1 = 0xcc9e2d51;
+ uint32_t c2 = 0x1b873593;
+
+ //----------
+ // body
+
+ const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
+
+ for(int i = -nblocks; i; i++) {
+ uint32_t k1 = getblock(blocks,i);
+
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = ROTL32(h1,13);
+ h1 = h1*5+0xe6546b64;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+ uint32_t k1 = 0;
+
+ switch(len & 3) {
+ case 3:
+ k1 ^= tail[2] << 16;
+ case 2:
+ k1 ^= tail[1] << 8;
+ case 1:
+ k1 ^= tail[0];
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+
+ h1 = fmix(h1);
+
+ *(uint32_t*)out = h1;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_128 ( const void * key, const int len,
+ uint32_t seed, void * out )
+{
+ const uint8_t * data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint32_t h1 = seed;
+ uint32_t h2 = seed;
+ uint32_t h3 = seed;
+ uint32_t h4 = seed;
+
+ uint32_t c1 = 0x239b961b;
+ uint32_t c2 = 0xab0e9789;
+ uint32_t c3 = 0x38b34ae5;
+ uint32_t c4 = 0xa1e38b93;
+
+ //----------
+ // body
+
+ const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+
+ for(int i = -nblocks; i; i++) {
+ uint32_t k1 = getblock(blocks,i*4+0);
+ uint32_t k2 = getblock(blocks,i*4+1);
+ uint32_t k3 = getblock(blocks,i*4+2);
+ uint32_t k4 = getblock(blocks,i*4+3);
+
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = ROTL32(h1,19);
+ h1 += h2;
+ h1 = h1*5+0x561ccd1b;
+
+ k2 *= c2;
+ k2 = ROTL32(k2,16);
+ k2 *= c3;
+ h2 ^= k2;
+
+ h2 = ROTL32(h2,17);
+ h2 += h3;
+ h2 = h2*5+0x0bcaa747;
+
+ k3 *= c3;
+ k3 = ROTL32(k3,17);
+ k3 *= c4;
+ h3 ^= k3;
+
+ h3 = ROTL32(h3,15);
+ h3 += h4;
+ h3 = h3*5+0x96cd1c35;
+
+ k4 *= c4;
+ k4 = ROTL32(k4,18);
+ k4 *= c1;
+ h4 ^= k4;
+
+ h4 = ROTL32(h4,13);
+ h4 += h1;
+ h4 = h4*5+0x32ac3b17;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+ uint32_t k1 = 0;
+ uint32_t k2 = 0;
+ uint32_t k3 = 0;
+ uint32_t k4 = 0;
+
+ switch(len & 15) {
+ case 15:
+ k4 ^= tail[14] << 16;
+ case 14:
+ k4 ^= tail[13] << 8;
+ case 13:
+ k4 ^= tail[12] << 0;
+ k4 *= c4;
+ k4 = ROTL32(k4,18);
+ k4 *= c1;
+ h4 ^= k4;
+
+ case 12:
+ k3 ^= tail[11] << 24;
+ case 11:
+ k3 ^= tail[10] << 16;
+ case 10:
+ k3 ^= tail[ 9] << 8;
+ case 9:
+ k3 ^= tail[ 8] << 0;
+ k3 *= c3;
+ k3 = ROTL32(k3,17);
+ k3 *= c4;
+ h3 ^= k3;
+
+ case 8:
+ k2 ^= tail[ 7] << 24;
+ case 7:
+ k2 ^= tail[ 6] << 16;
+ case 6:
+ k2 ^= tail[ 5] << 8;
+ case 5:
+ k2 ^= tail[ 4] << 0;
+ k2 *= c2;
+ k2 = ROTL32(k2,16);
+ k2 *= c3;
+ h2 ^= k2;
+
+ case 4:
+ k1 ^= tail[ 3] << 24;
+ case 3:
+ k1 ^= tail[ 2] << 16;
+ case 2:
+ k1 ^= tail[ 1] << 8;
+ case 1:
+ k1 ^= tail[ 0] << 0;
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+ h3 ^= len;
+ h4 ^= len;
+
+ h1 += h2;
+ h1 += h3;
+ h1 += h4;
+ h2 += h1;
+ h3 += h1;
+ h4 += h1;
+
+ h1 = fmix(h1);
+ h2 = fmix(h2);
+ h3 = fmix(h3);
+ h4 = fmix(h4);
+
+ h1 += h2;
+ h1 += h3;
+ h1 += h4;
+ h2 += h1;
+ h3 += h1;
+ h4 += h1;
+
+ ((uint32_t*)out)[0] = h1;
+ ((uint32_t*)out)[1] = h2;
+ ((uint32_t*)out)[2] = h3;
+ ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x64_128 ( const void * key, const int len,
+ const uint32_t seed, void * out )
+{
+ const uint8_t * data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint64_t h1 = seed;
+ uint64_t h2 = seed;
+
+ uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+ //----------
+ // body
+
+ const uint64_t * blocks = (const uint64_t *)(data);
+
+ for(int i = 0; i < nblocks; i++) {
+ uint64_t k1 = getblock(blocks,i*2+0);
+ uint64_t k2 = getblock(blocks,i*2+1);
+
+ k1 *= c1;
+ k1 = ROTL64(k1,31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = ROTL64(h1,27);
+ h1 += h2;
+ h1 = h1*5+0x52dce729;
+
+ k2 *= c2;
+ k2 = ROTL64(k2,33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = ROTL64(h2,31);
+ h2 += h1;
+ h2 = h2*5+0x38495ab5;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+ uint64_t k1 = 0;
+ uint64_t k2 = 0;
+
+ switch(len & 15) {
+ case 15:
+ k2 ^= uint64_t(tail[14]) << 48;
+ case 14:
+ k2 ^= uint64_t(tail[13]) << 40;
+ case 13:
+ k2 ^= uint64_t(tail[12]) << 32;
+ case 12:
+ k2 ^= uint64_t(tail[11]) << 24;
+ case 11:
+ k2 ^= uint64_t(tail[10]) << 16;
+ case 10:
+ k2 ^= uint64_t(tail[ 9]) << 8;
+ case 9:
+ k2 ^= uint64_t(tail[ 8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2,33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= uint64_t(tail[ 7]) << 56;
+ case 7:
+ k1 ^= uint64_t(tail[ 6]) << 48;
+ case 6:
+ k1 ^= uint64_t(tail[ 5]) << 40;
+ case 5:
+ k1 ^= uint64_t(tail[ 4]) << 32;
+ case 4:
+ k1 ^= uint64_t(tail[ 3]) << 24;
+ case 3:
+ k1 ^= uint64_t(tail[ 2]) << 16;
+ case 2:
+ k1 ^= uint64_t(tail[ 1]) << 8;
+ case 1:
+ k1 ^= uint64_t(tail[ 0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1,31);
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix(h1);
+ h2 = fmix(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ ((uint64_t*)out)[0] = h1;
+ ((uint64_t*)out)[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
+
diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.h b/moses/TranslationModel/CompactPT/MurmurHash3.h
index 58e98204d..54e9d3f9e 100644
--- a/moses/TranslationModel/CompactPT/MurmurHash3.h
+++ b/moses/TranslationModel/CompactPT/MurmurHash3.h
@@ -1,37 +1,37 @@
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-#ifndef _MURMURHASH3_H_
-#define _MURMURHASH3_H_
-
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER)
-
-typedef unsigned char uint8_t;
-typedef unsigned long uint32_t;
-typedef unsigned __int64 uint64_t;
-
-// Other compilers
-
-#else // defined(_MSC_VER)
-
-#include <stdint.h>
-
-#endif // !defined(_MSC_VER)
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
-
-void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
-
-void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
-
-//-----------------------------------------------------------------------------
-
-#endif // _MURMURHASH3_H_
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+typedef unsigned char uint8_t;
+typedef unsigned long uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
index 9c3f6b513..afed99057 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
@@ -44,7 +44,7 @@ namespace Moses
{
PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
- :PhraseDictionary(line)
+ :PhraseDictionary(line, true)
,m_inMemory(true)
,m_useAlignmentInfo(true)
,m_hash(10, 16)
diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h
index 8536c46f5..4cf69bf2f 100644
--- a/moses/TranslationModel/DynSAInclude/hash.h
+++ b/moses/TranslationModel/DynSAInclude/hash.h
@@ -6,6 +6,7 @@
#include "utils.h"
#include "FileHandler.h"
#include "util/exception.hh"
+#include "util/random.hh"
using namespace Moses;
typedef uint64_t P; // largest input range is 2^64
@@ -162,7 +163,7 @@ void Hash_shiftAddXOR<T>::initSeeds()
{
v_ = new T[this->H_];
for(count_t i=0; i < this->H_; i++)
- v_[i] = Utils::rand<T>() + 1;
+ v_[i] = util::wide_rand<T>() + 1;
}
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
@@ -187,9 +188,8 @@ void UnivHash_tableXOR<T>::initSeeds()
// fill with random values
for(count_t j=0; j < this->H_; j++) {
table_[j] = new T[tblLen_];
- for(count_t i=0; i < tblLen_; i++) {
- table_[j][i] = Utils::rand<T>(this->m_-1);
- }
+ for(count_t i=0; i < tblLen_; i++)
+ table_[j][i] = util::wide_rand_excl(this->m_-1);
}
}
template <typename T>
@@ -218,7 +218,7 @@ void UnivHash_noPrimes<T>::initSeeds()
{
a_ = new P[this->H_];
for(T i=0; i < this->H_; i++) {
- a_[i] = Utils::rand<P>();
+ a_[i] = util::wide_rand<P>();
if(a_[i] % 2 == 0) a_[i]++; // a must be odd
}
}
@@ -284,8 +284,8 @@ void UnivHash_linear<T>::initSeeds()
a_[i] = new T[MAX_NGRAM_ORDER];
b_[i] = new T[MAX_NGRAM_ORDER];
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
- a_[i][j] = 1 + Utils::rand<T>();
- b_[i][j] = Utils::rand<T>();
+ a_[i][j] = 1 + util::wide_rand<T>();
+ b_[i][j] = util::wide_rand<T>();
}
}
}
diff --git a/moses/TranslationModel/DynSAInclude/utils.h b/moses/TranslationModel/DynSAInclude/utils.h
index e2f24fd4f..485e4a065 100644
--- a/moses/TranslationModel/DynSAInclude/utils.h
+++ b/moses/TranslationModel/DynSAInclude/utils.h
@@ -62,22 +62,6 @@ public:
str[i] = tolower(str[i]);
}
}
- // TODO: interface with decent PRG
- template<typename T>
- static T rand(T mod_bnd = 0) {
- T random = 0;
- if(sizeof(T) <= 4) {
- random = static_cast<T>(std::rand());
- } else if(sizeof(T) == 8) {
- random = static_cast<T>(std::rand());
- random <<= 31;
- random <<= 1;
- random |= static_cast<T>(std::rand());
- }
- if(mod_bnd != 0)
- return random % mod_bnd;
- else return random;
- }
};
#endif
diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp
index 3e8c79c0e..c1dc62f12 100644
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@@ -1,4 +1,6 @@
#include "DynSuffixArray.h"
+#include "util/random.hh"
+
#include <iostream>
#include <boost/foreach.hpp>
@@ -315,33 +317,31 @@ int DynSuffixArray::Compare(int pos1, int pos2, int max)
return 0;
}
+namespace
+{
+/// Helper: swap two entries in an int array.
+inline void swap_ints(int array[], int one, int other)
+{
+ const int tmp = array[one];
+ array[one] = array[other];
+ array[other] = tmp;
+}
+}
+
void DynSuffixArray::Qsort(int* array, int begin, int end)
{
if(end > begin) {
- int index;
+ int index = util::rand_incl(begin, end);
{
- index = begin + (rand() % (end - begin + 1));
- int pivot = array[index];
- {
- int tmp = array[index];
- array[index] = array[end];
- array[end] = tmp;
- }
+ const int pivot = array[index];
+ swap_ints(array, index, end);
for(int i=index=begin; i < end; ++i) {
if (Compare(array[i], pivot, 20) <= 0) {
- {
- int tmp = array[index];
- array[index] = array[i];
- array[i] = tmp;
- index++;
- }
+ swap_ints(array, index, i);
+ index++;
}
}
- {
- int tmp = array[index];
- array[index] = array[end];
- array[end] = tmp;
- }
+ swap_ints(array, index, end);
}
Qsort(array, begin, index - 1);
Qsort(array, index + 1, end);
diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp
index 7fdd61f97..643633be4 100644
--- a/moses/TranslationModel/PhraseDictionary.cpp
+++ b/moses/TranslationModel/PhraseDictionary.cpp
@@ -44,10 +44,10 @@ CacheColl::~CacheColl()
}
}
-PhraseDictionary::PhraseDictionary(const std::string &line)
- :DecodeFeature(line)
- ,m_tableLimit(20) // default
- ,m_maxCacheSize(DEFAULT_MAX_TRANS_OPT_CACHE_SIZE)
+PhraseDictionary::PhraseDictionary(const std::string &line, bool registerNow)
+ : DecodeFeature(line, registerNow)
+ , m_tableLimit(20) // default
+ , m_maxCacheSize(DEFAULT_MAX_TRANS_OPT_CACHE_SIZE)
{
m_id = s_staticColl.size();
s_staticColl.push_back(this);
@@ -151,7 +151,7 @@ Release(TargetPhraseCollection const* tpc) const
bool
PhraseDictionary::
-PrefixExists(Phrase const& phrase) const
+PrefixExists(ttasksptr const& ttask, Phrase const& phrase) const
{
return true;
}
diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h
index 4736f37d8..2c1f1f39e 100644
--- a/moses/TranslationModel/PhraseDictionary.h
+++ b/moses/TranslationModel/PhraseDictionary.h
@@ -70,6 +70,9 @@ public:
**/
class PhraseDictionary : public DecodeFeature
{
+ friend class PhraseDictionaryMultiModelCounts;
+ // why is this necessary? that's a derived class, so it should have
+ // access to the
public:
virtual bool ProvidesPrefixCheck() const;
@@ -77,7 +80,7 @@ public:
return s_staticColl;
}
- PhraseDictionary(const std::string &line);
+ PhraseDictionary(const std::string &line, bool registerNow);
virtual ~PhraseDictionary() {
}
@@ -100,21 +103,37 @@ public:
// exist in the table.
virtual
bool
- PrefixExists(Phrase const& phrase) const;
+ PrefixExists(ttasksptr const& ttask, Phrase const& phrase) const;
// LEGACY!
// The preferred method is to override GetTargetPhraseCollectionBatch().
// See class PhraseDictionaryMemory or PhraseDictionaryOnDisk for details
//! find list of translations that can translates src. Only for phrase input
+public:
virtual
TargetPhraseCollection const *
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
virtual
+ TargetPhraseCollection const *
+ GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src)
+ {
+ return GetTargetPhraseCollectionLEGACY(src);
+ }
+
+ virtual
void
GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const;
+ virtual
+ void
+ GetTargetPhraseCollectionBatch(ttasksptr const& ttask,
+ const InputPathList &inputPathQueue) const
+ {
+ GetTargetPhraseCollectionBatch(inputPathQueue);
+ }
+
//! Create entry for translation of source to targetPhrase
virtual void InitializeForInput(InputType const& source) {
}
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
index 83b78fe5b..0ae4d4fc8 100644
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
@@ -10,7 +10,7 @@ namespace Moses
{
PhraseDictionaryDynSuffixArray::
PhraseDictionaryDynSuffixArray(const std::string &line)
- : PhraseDictionary(line)
+ : PhraseDictionary(line, true)
,m_biSA(new BilingualDynSuffixArray())
{
ReadParameters();
diff --git a/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp b/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp
index a97e24486..04f3630ca 100644
--- a/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp
@@ -36,7 +36,7 @@ PhraseDictionaryDynamicCacheBased *PhraseDictionaryDynamicCacheBased::s_instance
//! contructor
PhraseDictionaryDynamicCacheBased::PhraseDictionaryDynamicCacheBased(const std::string &line)
- : PhraseDictionary(line)
+ : PhraseDictionary(line, true)
{
std::cerr << "Initializing PhraseDictionaryDynamicCacheBased feature..." << std::endl;
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
index f0b555e22..4b4df0a02 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
@@ -26,7 +26,7 @@ namespace Moses
{
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
- :PhraseDictionary(line)
+ :PhraseDictionary(line, true)
{
ReadParameters();
@@ -54,7 +54,7 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
}
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(int type, const std::string &line)
- :PhraseDictionary(line)
+ :PhraseDictionary(line, true)
{
if (type == 1) {
// PhraseDictionaryMultiModelCounts
diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
index 1d654f4b0..03b69d0ad 100644
--- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
@@ -12,7 +12,7 @@ using namespace std;
namespace Moses
{
PhraseDictionaryTransliteration::PhraseDictionaryTransliteration(const std::string &line)
- : PhraseDictionary(line)
+ : PhraseDictionary(line, true)
{
ReadParameters();
UTIL_THROW_IF2(m_mosesDir.empty() ||
diff --git a/moses/TranslationModel/PhraseDictionaryTreeAdaptor.cpp b/moses/TranslationModel/PhraseDictionaryTreeAdaptor.cpp
index a1105d3a7..9879bc3f8 100644
--- a/moses/TranslationModel/PhraseDictionaryTreeAdaptor.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTreeAdaptor.cpp
@@ -26,7 +26,7 @@ namespace Moses
PhraseDictionaryTreeAdaptor::
PhraseDictionaryTreeAdaptor(const std::string &line)
- : PhraseDictionary(line)
+ : PhraseDictionary(line, true)
{
ReadParameters();
}
diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh
index a4fcd6330..607238ae1 100644
--- a/moses/TranslationModel/ProbingPT/hash.hh
+++ b/moses/TranslationModel/ProbingPT/hash.hh
@@ -7,7 +7,7 @@
#include <vector>
//Gets the MurmurmurHash for give string
-uint64_t getHash(StringPiece text);
+uint64_t getHash(StringPiece text);
std::vector<uint64_t> getVocabIDs(StringPiece textin);
diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh
index eb3b1ea53..e1be3bc87 100644
--- a/moses/TranslationModel/ProbingPT/storing.hh
+++ b/moses/TranslationModel/ProbingPT/storing.hh
@@ -2,7 +2,7 @@
#include <cstdio>
#include <fstream>
-#include <iostream>
+#include <iostream>
#include "hash.hh" //Includes line_splitter
#include "probing_hash_utils.hh"
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
index dc25b805b..95bd70e27 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@@ -45,6 +45,7 @@
#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
#include "util/file.hh"
#include "util/exception.hh"
+#include "util/random.hh"
using namespace std;
@@ -62,8 +63,8 @@ char *mkdtemp(char *tempbuf)
return NULL;
}
- srand((unsigned)time(0));
- rand_value = (int)((rand() / ((double)RAND_MAX+1.0)) * 1e6);
+ util::rand_init();
+ rand_value = util::rand_excl(1e6);
tempbase = strrchr(tempbuf, '/');
tempbase = tempbase ? tempbase+1 : tempbuf;
strcpy(tempbasebuf, tempbase);
@@ -79,7 +80,7 @@ namespace Moses
{
PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
- :PhraseDictionary(line)
+ :PhraseDictionary(line, true)
,m_config(3)
,m_FuzzyMatchWrapper(NULL)
{
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
index c4433bc5a..795660c8d 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@@ -35,7 +35,7 @@ using namespace std;
namespace Moses
{
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
- : MyBase(line)
+ : MyBase(line, true)
, m_maxSpanDefault(NOT_FOUND)
, m_maxSpanLabelled(NOT_FOUND)
{
diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h
index c7cb7e991..51cc92e4a 100644
--- a/moses/TranslationModel/RuleTable/Trie.h
+++ b/moses/TranslationModel/RuleTable/Trie.h
@@ -41,7 +41,7 @@ class RuleTableTrie : public PhraseDictionary
{
public:
RuleTableTrie(const std::string &line)
- : PhraseDictionary(line) {
+ : PhraseDictionary(line, true) {
}
virtual ~RuleTableTrie();
diff --git a/moses/TranslationModel/SkeletonPT.cpp b/moses/TranslationModel/SkeletonPT.cpp
index c1df952c1..22d05f1a0 100644
--- a/moses/TranslationModel/SkeletonPT.cpp
+++ b/moses/TranslationModel/SkeletonPT.cpp
@@ -7,7 +7,7 @@ using namespace std;
namespace Moses
{
SkeletonPT::SkeletonPT(const std::string &line)
- : PhraseDictionary(line)
+ : PhraseDictionary(line, true)
{
ReadParameters();
}
diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile
index 3ac7910b2..954640b7a 100644
--- a/moses/TranslationModel/UG/Jamfile
+++ b/moses/TranslationModel/UG/Jamfile
@@ -95,6 +95,19 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
+exe bitext-find :
+bitext-find.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+# $(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+
exe spe-check-coverage3 :
spe-check-coverage3.cc
$(TOP)/moses//moses
@@ -108,4 +121,4 @@ $(TOP)/util//kenutil
;
install $(PREFIX)/bin : try-align try-align2 ;
-fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;
+fakelib mmsapt : [ glob *.cpp TargetPhrase*.cc mmsapt*.cc sapt*.cc ] ;
diff --git a/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc b/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc
new file mode 100644
index 000000000..1217b9711
--- /dev/null
+++ b/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc
@@ -0,0 +1,179 @@
+#include "TargetPhraseCollectionCache.h"
+
+namespace Moses
+{
+ using std::vector;
+
+#if defined(timespec)
+ bool operator<(timespec const& a, timespec const& b)
+ {
+ if (a.tv_sec != b.tv_sec) return a.tv_sec < b.tv_sec;
+ return (a.tv_nsec < b.tv_nsec);
+ }
+
+ bool operator>=(timespec const& a, timespec const& b)
+ {
+ if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec;
+ return (a.tv_nsec >= b.tv_nsec);
+ }
+#endif
+
+ bool operator<(timeval const& a, timeval const& b)
+ {
+ if (a.tv_sec != b.tv_sec) return a.tv_sec < b.tv_sec;
+ return (a.tv_usec < b.tv_usec);
+ }
+
+ bool operator>=(timeval const& a, timeval const& b)
+ {
+ if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec;
+ return (a.tv_usec >= b.tv_usec);
+ }
+
+ void
+ bubble_up(std::vector<TPCollWrapper*>& v, size_t k)
+ {
+ if (k >= v.size()) return;
+ for (;k && (v[k]->tstamp < v[k/2]->tstamp); k /=2)
+ {
+ std::swap(v[k],v[k/2]);
+ std::swap(v[k]->idx,v[k/2]->idx);
+ }
+ }
+
+ void
+ bubble_down(std::vector<TPCollWrapper*>& v, size_t k)
+ {
+ for (size_t j = 2*(k+1); j <= v.size(); j = 2*((k=j)+1))
+ {
+ if (j == v.size() || (v[j-1]->tstamp < v[j]->tstamp)) --j;
+ if (v[j]->tstamp >= v[k]->tstamp) break;
+ std::swap(v[k],v[j]);
+ v[k]->idx = k;
+ v[j]->idx = j;
+ }
+ }
+
+ TPCollCache
+ ::TPCollCache(size_t capacity)
+ {
+ m_history.reserve(capacity);
+ }
+
+ TPCollWrapper*
+ TPCollCache
+ ::encache(TPCollWrapper* const& ptr)
+ {
+ using namespace boost;
+ // update time stamp:
+#if defined(timespec)
+ clock_gettime(CLOCK_MONOTONIC, &ptr->tstamp);
+#else
+ gettimeofday(&ptr->tstamp, NULL);
+#endif
+ unique_lock<shared_mutex> lock(m_history_lock);
+ if (m_history.capacity() > 1)
+ {
+ vector<TPCollWrapper*>& v = m_history;
+ if (ptr->idx >= 0) // ptr is already in history
+ {
+ assert(ptr == v[ptr->idx]);
+ size_t k = 2 * (ptr->idx + 1);
+ if (k < v.size()) bubble_up(v,k--);
+ if (k < v.size()) bubble_up(v,k);
+ }
+ else if (v.size() < v.capacity())
+ {
+ size_t k = ptr->idx = v.size();
+ v.push_back(ptr);
+ bubble_up(v,k);
+ }
+ else // someone else needs to go
+ {
+ v[0]->idx = -1;
+ release(v[0]);
+ v[0] = ptr;
+ bubble_down(v,0);
+ }
+ }
+ return ptr;
+ } // TPCollCache::encache(...)
+
+ TPCollWrapper*
+ TPCollCache
+ ::get(uint64_t key, size_t revision)
+ {
+ using namespace boost;
+ cache_t::iterator m;
+ {
+ shared_lock<shared_mutex> lock(m_cache_lock);
+ m = m_cache.find(key);
+ if (m == m_cache.end() || m->second->revision != revision)
+ return NULL;
+ ++m->second->refCount;
+ }
+
+ encache(m->second);
+ return NULL;
+ } // TPCollCache::get(...)
+
+ void
+ TPCollCache
+ ::add(uint64_t key, TPCollWrapper* ptr)
+ {
+ {
+ boost::unique_lock<boost::shared_mutex> lock(m_cache_lock);
+ m_cache[key] = ptr;
+ ++ptr->refCount;
+ // ++m_tpc_ctr;
+ }
+ encache(ptr);
+ } // TPCollCache::add(...)
+
+ void
+ TPCollCache
+ ::release(TPCollWrapper*& ptr)
+ {
+ if (!ptr) return;
+
+ if (--ptr->refCount || ptr->idx >= 0) // tpc is still in use
+ {
+ ptr = NULL;
+ return;
+ }
+
+#if 0
+ timespec t; clock_gettime(CLOCK_MONOTONIC,&t);
+ timespec r; clock_getres(CLOCK_MONOTONIC,&r);
+ float delta = t.tv_sec - ptr->tstamp.tv_sec;
+ cerr << "deleting old cache entry after " << delta << " seconds."
+ << " clock resolution is " << r.tv_sec << ":" << r.tv_nsec
+ << " at " << __FILE__ << ":" << __LINE__ << endl;
+#endif
+
+ boost::upgrade_lock<boost::shared_mutex> lock(m_cache_lock);
+ cache_t::iterator m = m_cache.find(ptr->key);
+ if (m != m_cache.end() && m->second == ptr)
+ { // the cache could have been updated with a new pointer
+ // for the same phrase already, so we need to check
+ // if the pointer we cound is the one we want to get rid of,
+ // hence the second check
+ boost::upgrade_to_unique_lock<boost::shared_mutex> xlock(lock);
+ m_cache.erase(m);
+ }
+ delete ptr;
+ ptr = NULL;
+ } // TPCollCache::release(...)
+
+ TPCollWrapper::
+ TPCollWrapper(size_t r, uint64_t k)
+ : revision(r), key(k), refCount(0), idx(-1)
+ { }
+
+ TPCollWrapper::
+ ~TPCollWrapper()
+ {
+ assert(this->refCount == 0);
+ }
+
+} // namespace
diff --git a/moses/TranslationModel/UG/TargetPhraseCollectionCache.h b/moses/TranslationModel/UG/TargetPhraseCollectionCache.h
new file mode 100644
index 000000000..269200647
--- /dev/null
+++ b/moses/TranslationModel/UG/TargetPhraseCollectionCache.h
@@ -0,0 +1,62 @@
+// -*- c++ -*-
+#pragma once
+#include <time.h>
+#include "moses/TargetPhraseCollection.h"
+
+namespace Moses
+{
+ class TPCollWrapper
+ // wrapper around TargetPhraseCollection that includes reference counts
+ // and a time stamp for least-recently-used caching of TargetPhraseCollection-s
+ : public TargetPhraseCollection
+ {
+ public:
+ size_t const revision;
+ // revison; gets changed when the underlying corpus in Mmsapt is updated
+
+ uint64_t const key; // phrase key
+ uint32_t refCount; // reference count
+#if defined(timespec) // timespec is better, but not available everywhere
+ timespec tstamp; // last use
+#else
+ timeval tstamp; // last use
+#endif
+ int idx; // position in the history heap
+ TPCollWrapper(size_t r, uint64_t const k);
+ ~TPCollWrapper();
+ };
+
+ class TPCollCache
+ {
+ typedef boost::unordered_map<uint64_t, TPCollWrapper*> cache_t;
+ typedef std::vector<TPCollWrapper*> history_t;
+ cache_t m_cache; // maps from phrase ids to target phrase collections
+ mutable history_t m_history; // heap of live items, least recently used one on top
+
+ mutable boost::shared_mutex m_cache_lock; // locks m_cache
+ mutable boost::shared_mutex m_history_lock; // locks m_history
+
+#if 0
+ // mutable size_t m_tpc_ctr;
+ // counter of all live item, for debugging. probably obsolete; was used
+ // to track memory leaks
+#endif
+
+ TPCollWrapper* encache(TPCollWrapper* const& ptr);
+ // updates time stamp and position in least-recently-used heap m_history
+
+ public:
+ TPCollCache(size_t capacity=1000);
+
+ TPCollWrapper*
+ get(uint64_t key, size_t revision);
+
+ void
+ add(uint64_t key, TPCollWrapper* ptr);
+
+ void
+ release(TPCollWrapper*& tpc);
+ };
+
+
+}
diff --git a/moses/TranslationModel/UG/bitext-find.cc b/moses/TranslationModel/UG/bitext-find.cc
new file mode 100644
index 000000000..18cc6e0fa
--- /dev/null
+++ b/moses/TranslationModel/UG/bitext-find.cc
@@ -0,0 +1,149 @@
+#include <boost/program_options.hpp>
+#include "mm/ug_bitext.h"
+#include <string>
+
+using namespace std;
+using namespace Moses;
+using namespace Moses::bitext;
+namespace po=boost::program_options;
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmBitext<Token> mmbitext;
+typedef Bitext<Token>::tsa tsa;
+
+string bname, L1, L2, Q1, Q2;
+size_t maxhits;
+void interpret_args(int ac, char* av[]);
+
+
+void
+write_sentence
+(Ttrack<Token> const& T, uint32_t const sid, TokenIndex const& V, ostream& out)
+{
+ Token const* t = T.sntStart(sid);
+ Token const* e = T.sntEnd(sid);
+ // size_t i = 0;
+ while (t < e)
+ {
+ // out << i++ << ":";
+ out << V[t->id()];
+ if (++t < e) out << " ";
+ }
+}
+
+bool
+fill(string const& query, TSA<Token> const& tsa,
+ TokenIndex const& V, bitvector& v)
+{
+ v.resize(tsa.getCorpus()->size());
+ Bitext<Token>::iter m(&tsa);
+ istringstream buf(query); string w;
+ while (buf >> w)
+ if (!m.extend(V[w]))
+ return false;
+ m.markSentences(v);
+ return true;
+}
+
+
+
+
+int main(int argc, char* argv[])
+{
+ interpret_args(argc, argv);
+ if (Q1.empty() && Q2.empty()) exit(0);
+
+ mmbitext B; string w;
+ B.open(bname, L1, L2);
+
+ Bitext<Token>::iter m1(B.I1.get(), *B.V1, Q1);
+ if (Q1.size() && m1.size() == 0) exit(0);
+
+ Bitext<Token>::iter m2(B.I2.get(), *B.V2, Q2);
+ if (Q2.size() && m2.size() == 0) exit(0);
+
+ bitvector check(B.T1->size());
+ if (Q1.size() == 0 || Q2.size() == 0) check.set();
+ else (m2.markSentences(check));
+
+ Bitext<Token>::iter& m = m1.size() ? m1 : m2;
+ char const* x = m.lower_bound(-1);
+ char const* stop = m.upper_bound(-1);
+ uint64_t sid;
+ ushort off;
+ boost::taus88 rnd;
+ size_t N = m.approxOccurrenceCount();
+ maxhits = min(N, maxhits);
+ size_t k = 0; // selected
+ for (size_t i = 0; x < stop; ++i)
+ {
+ x = m.root->readSid(x,stop,sid);
+ x = m.root->readOffset(x,stop,off);
+
+ if (!check[sid]) continue;
+ size_t r = (N - i) * rnd()/(rnd.max()+1.) + k;
+ if (maxhits != N && r >= maxhits) continue;
+ ++k;
+
+ size_t s1,s2,e1,e2; int po_fwd=-1,po_bwd=-1;
+ vector<uchar> caln;
+ // cout << sid << " " << B.docname(sid) << endl;
+ if (!B.find_trg_phr_bounds(sid, off, off+m.size(),
+ s1,s2,e1,e2,po_fwd,po_bwd,
+ &caln, NULL, &m == &m2))
+ {
+ // cout << "alignment failure" << endl;
+ }
+
+ cout << sid << " " << B.docname(sid)
+ << " dfwd=" << po_fwd << " dbwd=" << po_bwd
+ << "\n";
+ write_sentence(*B.T1, sid, *B.V1, cout); cout << "\n";
+ write_sentence(*B.T2, sid, *B.V2, cout); cout << "\n";
+ B.write_yawat_alignment(sid,
+ m1.size() ? &m1 : NULL,
+ m2.size() ? &m2 : NULL, cout);
+ cout << endl;
+
+ }
+}
+
+void
+interpret_args(int ac, char* av[])
+{
+ po::variables_map vm;
+ po::options_description o("Options");
+ o.add_options()
+
+ ("help,h", "print this message")
+ ("maxhits,n", po::value<size_t>(&maxhits)->default_value(25),
+ "max. number of hits")
+ ("q1", po::value<string>(&Q1), "query in L1")
+ ("q2", po::value<string>(&Q2), "query in L2")
+ ;
+
+ po::options_description h("Hidden Options");
+ h.add_options()
+ ("bname", po::value<string>(&bname), "base name of corpus")
+ ("L1", po::value<string>(&L1), "L1 tag")
+ ("L2", po::value<string>(&L2), "L2 tag")
+ ;
+
+ h.add(o);
+ po::positional_options_description a;
+ a.add("bname",1);
+ a.add("L1",1);
+ a.add("L2",1);
+
+ po::store(po::command_line_parser(ac,av)
+ .options(h)
+ .positional(a)
+ .run(),vm);
+ po::notify(vm);
+ if (vm.count("help"))
+ {
+ cout << "\nusage:\n\t" << av[0]
+ << " [options] [--q1=<L1string>] [--q2=<L2string>]" << endl;
+ cout << o << endl;
+ exit(0);
+ }
+}
diff --git a/moses/TranslationModel/UG/count-ptable-features.cc b/moses/TranslationModel/UG/count-ptable-features.cc
index b4d2cb4dd..4c9022075 100644
--- a/moses/TranslationModel/UG/count-ptable-features.cc
+++ b/moses/TranslationModel/UG/count-ptable-features.cc
@@ -21,6 +21,6 @@ int main()
cout << PT.GetFeatureNames().size() << endl;
exit(0);
}
-
-
+
+
diff --git a/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp b/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp
index 073b64dfc..b87aa1d0c 100644
--- a/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp
+++ b/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp
@@ -11,7 +11,7 @@ namespace ugdiss
using namespace boost::algorithm;
using namespace boost::iostreams;
- filtering_istream*
+ filtering_istream*
open_input_stream(string fname)
{
filtering_istream* ret = new filtering_istream();
@@ -19,7 +19,7 @@ namespace ugdiss
return ret;
}
- filtering_ostream*
+ filtering_ostream*
open_output_stream(string fname)
{
filtering_ostream* ret = new filtering_ostream();
@@ -27,7 +27,7 @@ namespace ugdiss
return ret;
}
- void
+ void
open_input_stream(string fname, filtering_istream& in)
{
if (ends_with(fname, ".gz"))
@@ -41,7 +41,7 @@ namespace ugdiss
in.push(file_source(fname.c_str()));
}
- void
+ void
open_output_stream(string fname, filtering_ostream& out)
{
if (ends_with(fname, ".gz") || ends_with(fname, ".gz_"))
diff --git a/moses/TranslationModel/UG/generic/file_io/ug_stream.h b/moses/TranslationModel/UG/generic/file_io/ug_stream.h
index e2c9e4764..5555e36f8 100644
--- a/moses/TranslationModel/UG/generic/file_io/ug_stream.h
+++ b/moses/TranslationModel/UG/generic/file_io/ug_stream.h
@@ -23,7 +23,7 @@ using namespace boost::iostreams;
/** open input file that is possibly compressed
* decompression filters are automatically added based on the file name
- * gzip for .gz; bzip2 for bz2.
+ * gzip for .gz; bzip2 for bz2.
*/
filtering_istream* open_input_stream(string fname);
void open_input_stream(string fname, filtering_istream& in);
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp b/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp
index 31927ac84..6c1644837 100644
--- a/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp
+++ b/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp
@@ -11,7 +11,7 @@ namespace ugdiss
{
using namespace std;
- void
+ void
get_options(int ac, char* av[], progopts& o, posopts& a, optsmap& vm,
char const* cfgFileParam)
{
@@ -30,17 +30,17 @@ namespace ugdiss
}
else
{
- cerr << "Error: cannot find config file '"
+ cerr << "Error: cannot find config file '"
<< cfgFile << "'!" << endl;
exit(1);
}
}
}
-
+
// process positional args, ignoring those set in the config file
if (a.max_total_count())
po::store(po::command_line_parser(ac,av)
- .options(o).positional(a).run(),vm);
+ .options(o).positional(a).run(),vm);
po::notify(vm); // IMPORTANT
}
}
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_get_options.h b/moses/TranslationModel/UG/generic/program_options/ug_get_options.h
index 79b626ef5..636b11302 100644
--- a/moses/TranslationModel/UG/generic/program_options/ug_get_options.h
+++ b/moses/TranslationModel/UG/generic/program_options/ug_get_options.h
@@ -6,18 +6,18 @@
#include <boost/program_options.hpp>
-namespace ugdiss
+namespace ugdiss
{
namespace po=boost::program_options;
typedef po::options_description progopts;
typedef po::positional_options_description posopts;
typedef po::variables_map optsmap;
- void
- get_options(int ac, char* av[],
- progopts & o,
- posopts & a,
- optsmap & vm,
+ void
+ get_options(int ac, char* av[],
+ progopts & o,
+ posopts & a,
+ optsmap & vm,
char const* cfgFileParam=NULL);
}
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
index 7dc2cd18f..f30d91acc 100644
--- a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
@@ -5,15 +5,15 @@
#include <boost/foreach.hpp>
namespace Moses {
-
- void
+
+ void
filter_arguments(int const argc_in, char const* const* const argv_in,
- int & argc_moses, char*** argv_moses,
+ int & argc_moses, char*** argv_moses,
int & argc_other, char*** argv_other,
vector<pair<string,int> > const& filter)
{
*argv_moses = new char*[argc_in];
- *argv_other = new char*[argc_in];
+ *argv_other = new char*[argc_in];
(*argv_moses)[0] = new char[strlen(argv_in[0])+1];
strcpy((*argv_moses)[0], argv_in[0]);
argc_moses = 1;
@@ -30,7 +30,7 @@ namespace Moses {
strcpy((*argv_other)[argc_other++],argv_in[i]);
for (int k = 0; k < o.second; ++k)
{
- UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-',
+ UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-',
"[" << HERE << "] Missing argument for "
<< "parameter " << o.first << "!");
(*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
@@ -44,7 +44,7 @@ namespace Moses {
strcpy((*argv_moses)[argc_moses++], argv_in[i++]);
}
}
-
+
} // namespace Moses
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
index e56585e8a..605acee6c 100644
--- a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
@@ -5,12 +5,12 @@
namespace Moses {
using namespace std;
- // Function to splice the argument list (e.g. before handing it over to
+ // Function to splice the argument list (e.g. before handing it over to
// Moses LoadParam() function. /filter/ is a vector of argument names
- // and the number of arguments after each of them
- void
+ // and the number of arguments after each of them
+ void
filter_arguments(int const argc_in, char const* const* const argv_in,
- int & argc_moses, char*** argv_moses,
+ int & argc_moses, char*** argv_moses,
int & argc_other, char*** argv_other,
vector<pair<string,int> > const& filter);
diff --git a/moses/TranslationModel/UG/generic/sampling/Sampling.h b/moses/TranslationModel/UG/generic/sampling/Sampling.h
index c60953d5d..652e532bc 100644
--- a/moses/TranslationModel/UG/generic/sampling/Sampling.h
+++ b/moses/TranslationModel/UG/generic/sampling/Sampling.h
@@ -2,19 +2,16 @@
#define __sampling_h
#include <boost/dynamic_bitset.hpp>
#include <vector>
+
+#include "util/random.hh"
+
// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann
namespace Moses
{
- using namespace std;
-inline
-size_t
-randInt(size_t N)
-{
- return N*(rand()/(RAND_MAX+1.));
-}
+using namespace std;
// select a random sample of size /s/ without restitution from the range of
// integers [0,N);
@@ -35,15 +32,15 @@ randomSample(vector<idx_t>& v, size_t s, size_t N)
if (s*10<N) {
boost::dynamic_bitset<uint64_t> check(N,0);
for (size_t i = 0; i < v.size(); i++) {
- size_t x = randInt(N);
- while (check[x]) x = randInt(N);
+ size_t x = util::rand_excl(N);
+ while (check[x]) x = util::rand_excl(N);
check[x]=true;
v[i] = x;
}
} else {
size_t m=0;
for (size_t t = 0; m <= s && t < N; t++)
- if (s==N || randInt(N-t) < s-m) v[m++] = t;
+ if (s==N || util::rand_excl(N-t) < s-m) v[m++] = t;
}
}
diff --git a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h
index f26e28c52..31132c63c 100644
--- a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h
+++ b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h
@@ -17,40 +17,40 @@
namespace Moses
{
using namespace std;
- template<typename VAL,
+ template<typename VAL,
typename COMP = greater<VAL>,
typename IDX_T=size_t>
class
- VectorIndexSorter
+ VectorIndexSorter
: public binary_function<IDX_T const&, IDX_T const&, bool>
{
vector<VAL> const& m_vecref;
boost::shared_ptr<COMP> m_comp;
public:
-
+
COMP const& Compare;
VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
: m_vecref(v), Compare(comp) {
}
-
+
VectorIndexSorter(vector<VAL> const& v)
: m_vecref(v), m_comp(new COMP()), Compare(*m_comp) {
}
-
+
bool operator()(IDX_T const & a, IDX_T const & b) const {
bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b));
bool bwd = Compare(m_vecref[b], m_vecref[a]);
return (fwd == bwd ? a < b : fwd);
}
-
+
boost::shared_ptr<vector<IDX_T> >
GetOrder() const;
-
+
void
GetOrder(vector<IDX_T> & order) const;
-
+
};
-
+
template<typename VAL, typename COMP, typename IDX_T>
boost::shared_ptr<vector<IDX_T> >
VectorIndexSorter<VAL,COMP,IDX_T>::
@@ -60,7 +60,7 @@ namespace Moses
get_order(*ret);
return ret;
}
-
+
template<typename VAL, typename COMP, typename IDX_T>
void
VectorIndexSorter<VAL,COMP,IDX_T>::
@@ -70,6 +70,6 @@ namespace Moses
for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i;
sort(order.begin(), order.end(), *this);
}
-
+
}
#endif
diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc
index 4b61ecd60..877b7a816 100644
--- a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc
+++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc
@@ -6,14 +6,14 @@
// string distance measures
// Code by Ulrich Germann
-namespace stringdist
+namespace stringdist
{
- UErrorCode strip_accents(UnicodeString & trg)
+ UErrorCode strip_accents(UnicodeString & trg)
{
UErrorCode status = U_ZERO_ERROR;
- static Transliterator *stripper
- = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
+ static Transliterator *stripper
+ = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
UTRANS_FORWARD, status);
stripper->transliterate(trg);
return status;
@@ -22,9 +22,9 @@ namespace stringdist
char const*
StringDiff::
Segment::
- elabel[] = { "same", "cap", "flip", "permutation",
- "accent", "duplication",
- "insertion", "deletion",
+ elabel[] = { "same", "cap", "flip", "permutation",
+ "accent", "duplication",
+ "insertion", "deletion",
"mismatch", "noinit" };
StringDiff::
@@ -44,7 +44,7 @@ namespace stringdist
Segment()
: start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0)
{}
-
+
UnicodeString const&
StringDiff::
set_a(string const& a)
@@ -74,8 +74,8 @@ namespace stringdist
{
return this->b;
}
-
- size_t
+
+ size_t
StringDiff::
size()
{
@@ -94,7 +94,7 @@ namespace stringdist
// if (s.match == same) continue;
// else if (s.match == insertion) ret += s.end_b - s.start_b;
// else if (s.match == deletion) ret += s.end_a - s.start_a;
-
+
// }
// }
@@ -138,7 +138,7 @@ namespace stringdist
#endif
}
- float
+ float
fillAlignmentMatrix(UChar const* a, size_t const lenA,
UChar const* b, size_t const lenB,
vector<vector<float> > & M)
@@ -164,7 +164,7 @@ namespace stringdist
return M.back().back();
}
- float
+ float
levenshtein(UChar const* a, size_t const lenA,
UChar const* b, size_t const lenB)
{
@@ -180,7 +180,7 @@ namespace stringdist
cout << endl;
}
cout << string(25,'-') << endl;
-#endif
+#endif
int i = M.size() -1;
int j = M.back().size() -1;
@@ -207,29 +207,29 @@ namespace stringdist
return ret;
}
-
+
StringDiff::
Segment::
- Segment(size_t const as, size_t const ae,
+ Segment(size_t const as, size_t const ae,
size_t const bs, size_t const be,
- UnicodeString const& a,
- UnicodeString const& b)
+ UnicodeString const& a,
+ UnicodeString const& b)
{
dist = 0;
- start_a = as; end_a = ae;
+ start_a = as; end_a = ae;
start_b = bs; end_b = be;
if (as == ae)
match = bs == be ? same : insertion;
- else if (bs == be)
+ else if (bs == be)
match = deletion;
- else if (be-bs != ae-as)
+ else if (be-bs != ae-as)
{
match = mismatch;
dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
b.getBuffer() + bs, be - bs);
}
- else
+ else
{
match = same;
size_t stop = ae-as;
@@ -251,11 +251,11 @@ namespace stringdist
}
}
}
- if (match == insertion)
+ if (match == insertion)
{
dist = be-bs;
}
- else if (match == deletion)
+ else if (match == deletion)
{
dist = ae-as;
}
@@ -309,18 +309,18 @@ namespace stringdist
if (i) --i;
if (j) --j;
}
- for (size_t k = 0; k < A.size(); ++k)
+ for (size_t k = 0; k < A.size(); ++k)
A[k] = min(A[k],A2[k]);
- for (size_t k = 0; k < B.size(); ++k)
+ for (size_t k = 0; k < B.size(); ++k)
B[k] = min(B[k],B2[k]);
-
+
if (a[i] == b[j]) { A[i] = j; B[j] = i; }
i = 0;
j = 0;
size_t I, J;
while (i < a.length() and j < b.length())
{
- if (A[i] < 0)
+ if (A[i] < 0)
{
I = i + 1;
while (I < A.size() and A[I] < 0) ++I;
@@ -338,24 +338,24 @@ namespace stringdist
difflist.push_back(Segment(i,i,j,J,a,b));
j = J;
}
- else
+ else
{
- I = i;
+ I = i;
J = j;
- while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
+ while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
{ ++I; ++J; }
difflist.push_back(Segment(i,I,j,J,a,b));
i = I; j = J;
}
}
- if (i < a.length() || j < b.length())
+ if (i < a.length() || j < b.length())
difflist.push_back(Segment(i,a.length(),j,b.length(),a,b));
diffcnt.assign(noinit,0);
for (size_t i = 0; i < difflist.size(); ++i)
{
Segment & s = difflist[i];
- if (s.match == insertion and
+ if (s.match == insertion and
((s.start_a and a[s.start_a - 1] == b[s.start_b]) or
(s.end_a < a.length() and a[s.end_a] == b[s.start_b])))
{
@@ -364,7 +364,7 @@ namespace stringdist
sameletter = b[i] == b[i-1];
if (sameletter) s.match = duplication;
}
- else if (s.match == deletion and
+ else if (s.match == deletion and
((s.start_b and b[s.start_b - 1] == a[s.start_a]) or
(s.end_b < b.length() and b[s.end_b] == a[s.start_a])))
{
@@ -380,15 +380,15 @@ namespace stringdist
void
StringDiff::
- showDiff(std::ostream& out)
+ showDiff(std::ostream& out)
{
if (difflist.size() == 0) align();
vector<size_t> fromEnd(difflist.size(),0);
for (int d = difflist.size()-1; d-- > 0;)
{
fromEnd[d] = a.length() - difflist[d].end_a;
- // cout << d << " " << fromEnd[d] << " "
- // << difflist[d].start_a << "-"
+ // cout << d << " " << fromEnd[d] << " "
+ // << difflist[d].start_a << "-"
// << difflist[d].end_a << endl;
}
for (size_t d = 0; d < difflist.size(); ++d)
@@ -402,7 +402,7 @@ namespace stringdist
bseg.toUTF8String(bbuf);
out << abuf << " ";
out << bbuf << " ";
- out << s.label() << " "
+ out << s.label() << " "
<< s.dist << " "
<< fromEnd[d]
<< endl;
@@ -423,7 +423,7 @@ namespace stringdist
{
return difflist.at(i);
}
-
+
vector<int> const&
StringDiff::
getFeatures() const
diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h
index 43fb089f1..8dfcfb58a 100644
--- a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h
+++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h
@@ -21,15 +21,15 @@ using namespace std;
//using namespace boost;
using namespace ugdiss;
-namespace stringdist
+namespace stringdist
{
- float
+ float
levenshtein(UChar const* a, size_t const lenA,
UChar const* b, size_t const lenB);
UErrorCode strip_accents(UnicodeString & trg);
- float
+ float
fillAlignmentMatrix(UChar const* a, size_t const lenA,
UChar const* b, size_t const lenB,
vector<vector<float> > & M);
@@ -37,9 +37,9 @@ namespace stringdist
class StringDiff
{
public:
- enum MATCHTYPE
+ enum MATCHTYPE
{
- same, // a and b are identical
+ same, // a and b are identical
cap, // a and b differ only in capitalization
flip, // two-letter flip
permutation, // a and b have same letters but in different order
@@ -48,7 +48,7 @@ namespace stringdist
insertion, // a is empty
deletion, // b is empty
mismatch, // none of the above
- noinit // not initialized
+ noinit // not initialized
};
struct Segment
@@ -59,9 +59,9 @@ namespace stringdist
MATCHTYPE match;
float dist;
Segment();
- Segment(size_t const as, size_t const ae,
+ Segment(size_t const as, size_t const ae,
size_t const bs, size_t const be,
- UnicodeString const& a,
+ UnicodeString const& a,
UnicodeString const& b);
char const* label() const;
};
diff --git a/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc b/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc
index 662493e18..b4565f99d 100644
--- a/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc
+++ b/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc
@@ -3,10 +3,10 @@ namespace Moses
{
ThreadSafeCounter::
ThreadSafeCounter()
- : ctr(0)
+ : ctr(0)
{ }
- size_t
+ size_t
ThreadSafeCounter::
operator++()
{
@@ -14,21 +14,21 @@ namespace Moses
return ++ctr;
}
- size_t
+ size_t
ThreadSafeCounter::
operator++(int foo)
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ctr++;
}
-
+
ThreadSafeCounter::
operator size_t() const
{
return ctr;
}
- size_t
+ size_t
ThreadSafeCounter::
operator--()
{
@@ -36,13 +36,13 @@ namespace Moses
return --ctr;
}
- size_t
+ size_t
ThreadSafeCounter::
operator--(int foo)
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ctr--;
}
-
-
+
+
}
diff --git a/moses/TranslationModel/UG/mm/calc-coverage.cc b/moses/TranslationModel/UG/mm/calc-coverage.cc
index ef17656d9..83f67220d 100644
--- a/moses/TranslationModel/UG/mm/calc-coverage.cc
+++ b/moses/TranslationModel/UG/mm/calc-coverage.cc
@@ -16,7 +16,7 @@ using namespace ugdiss;
typedef L2R_Token<SimpleWordId> Token;
TokenIndex V;
sptr<vector<vector<Token> > > C(new vector<vector<Token> >());
-void
+void
add_file(string fname)
{
filtering_istream in;
diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc
index 93c8c0eb0..1a51aa8a4 100644
--- a/moses/TranslationModel/UG/mm/custom-pt.cc
+++ b/moses/TranslationModel/UG/mm/custom-pt.cc
@@ -31,7 +31,7 @@ using namespace Moses;
using namespace Moses::bitext;
#define CACHING_THRESHOLD 1000
-#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p
+#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p
size_t mctr=0,xctr=0;
typedef L2R_Token<SimpleWordId> Token;
@@ -49,15 +49,15 @@ PScoreWC<Token> apply_wp;
vector<float> fweights;
void
-nbest_phrasepairs(uint64_t const pid1,
- pstats const& ps,
+nbest_phrasepairs(uint64_t const pid1,
+ pstats const& ps,
vector<PhrasePair> & nbest)
{
pstats::trg_map_t::const_iterator m;
vector<size_t> idx(nbest.size());
size_t i=0;
- for (m = ps.trg.begin();
- m != ps.trg.end() && i < nbest.size();
+ for (m = ps.trg.begin();
+ m != ps.trg.end() && i < nbest.size();
++m)
{
// cout << m->second.rcnt() << " " << ps.good << endl;
@@ -74,17 +74,17 @@ nbest_phrasepairs(uint64_t const pid1,
++i;
}
// cout << i << " " << nbest.size() << endl;
- if (i < nbest.size())
+ if (i < nbest.size())
{
// cout << "Resizing from " << nbest.size() << " to " << i << endl;
nbest.resize(i);
idx.resize(i);
}
VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
- if (m != ps.trg.end())
+ if (m != ps.trg.end())
{
make_heap(idx.begin(),idx.end(),sorter);
- PhrasePair cand;
+ PhrasePair cand;
cand.init(pid1,ps,5);
for (; m != ps.trg.end(); ++m)
{
@@ -104,7 +104,7 @@ nbest_phrasepairs(uint64_t const pid1,
}
sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
}
-
+
int main(int argc, char* argv[])
{
// assert(argc == 4);
@@ -120,8 +120,8 @@ int main(int argc, char* argv[])
string L2 = "en";
size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
- char c = *base.rbegin();
- if (c != '/' && c != '.')
+ char c = *base.rbegin();
+ if (c != '/' && c != '.')
base += ".";
fweights.resize(5,.25);
@@ -138,7 +138,7 @@ int main(int argc, char* argv[])
string line;
while (getline(cin,line))
{
- vector<id_type> snt;
+ vector<id_type> snt;
bt.V1->fillIdSeq(line,snt);
for (size_t i = 0; i < snt.size(); ++i)
{
@@ -156,8 +156,8 @@ int main(int argc, char* argv[])
sptr<pstats> s = bt.lookup(m);
for (size_t j = i; j <= k; ++j)
cout << (*bt.V1)[snt[j]] << " ";
- cout << s->good << "/"
- << s->sample_cnt << "/"
+ cout << s->good << "/"
+ << s->sample_cnt << "/"
<< s->raw_cnt << endl;
// vector<PhrasePair> nbest(min(s->trg.size(),size_t(20)));
vector<PhrasePair> nbest(s->trg.size());
@@ -172,17 +172,17 @@ int main(int argc, char* argv[])
cout << " " << setw(6) << pp.score << " ";
for (uint32_t i = off; i < stop; ++i)
cout << (*bt.V2)[o[i].id()] << " ";
- cout << pp.joint << "/"
+ cout << pp.joint << "/"
<< pp.raw1 << "/"
<< pp.raw2 << " |";
- BOOST_FOREACH(float f, pp.fvals)
+ BOOST_FOREACH(float f, pp.fvals)
cout << " " << f;
cout << endl;
}
}
}
}
-#endif
+#endif
exit(0);
}
#endif
diff --git a/moses/TranslationModel/UG/mm/mam2symal.cc b/moses/TranslationModel/UG/mm/mam2symal.cc
index 9610e6f56..eb5034aab 100644
--- a/moses/TranslationModel/UG/mm/mam2symal.cc
+++ b/moses/TranslationModel/UG/mm/mam2symal.cc
@@ -22,7 +22,7 @@ typedef L2R_Token<Conll_Sform> Token;
mmTtrack<char> MAM;
bool with_sids;
-void
+void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
@@ -31,7 +31,7 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
;
-
+
po::options_description h("Hidden Options");
h.add_options()
("mamfile", po::value<string>(&mamfile), "mamfile")
@@ -40,7 +40,7 @@ interpret_args(int ac, char* av[])
po::positional_options_description a;
a.add("mamfile",1);
a.add("range",-1);
-
+
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
@@ -56,11 +56,11 @@ interpret_args(int ac, char* av[])
}
}
-void
+void
printRangeMAM(size_t start, size_t stop)
{
for (;start < stop; start++)
- {
+ {
// size_t i = 0;
char const* p = MAM.sntStart(start);
char const* q = MAM.sntEnd(start);
@@ -76,7 +76,7 @@ printRangeMAM(size_t start, size_t stop)
}
}
-int
+int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
@@ -91,7 +91,7 @@ main(int argc, char*argv[])
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
- if (last < MAM.size())
+ if (last < MAM.size())
printRangeMAM(first,last+1);
}
}
diff --git a/moses/TranslationModel/UG/mm/mam_verify.cc b/moses/TranslationModel/UG/mm/mam_verify.cc
index d43539742..798baa947 100644
--- a/moses/TranslationModel/UG/mm/mam_verify.cc
+++ b/moses/TranslationModel/UG/mm/mam_verify.cc
@@ -21,7 +21,7 @@ mmTtrack<char> MAM;
mmTtrack<Token> T1,T2;
bool inv;
vector<string> range;
-void
+void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
@@ -30,7 +30,7 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("inv,i", po::bool_switch(&inv), "inverse")
;
-
+
po::options_description h("Hidden Options");
h.add_options()
("bname", po::value<string>(&bname), "base name")
@@ -43,7 +43,7 @@ interpret_args(int ac, char* av[])
a.add("L1",1);
a.add("L2",1);
a.add("range",-1);
-
+
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
@@ -87,7 +87,7 @@ check_range(size_t start, size_t stop)
return noAln;
}
-int
+int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
@@ -100,7 +100,7 @@ main(int argc, char*argv[])
exit(1);
}
size_t noAln;
- if (!range.size())
+ if (!range.size())
noAln = check_range(0, MAM.size());
else
{
@@ -112,7 +112,7 @@ main(int argc, char*argv[])
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
- if (last < MAM.size())
+ if (last < MAM.size())
noAln += check_range(first,last+1);
}
}
diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc
index 4ef0842e4..1e7bee5cb 100644
--- a/moses/TranslationModel/UG/mm/mmlex-build.cc
+++ b/moses/TranslationModel/UG/mm/mmlex-build.cc
@@ -1,8 +1,8 @@
// -*- c++ -*-
// Program to extract word cooccurrence counts from a memory-mapped
// word-aligned bitext stores the counts lexicon in the format for
-// mm2dTable<uint32_t> (ug_mm_2d_table.h)
-//
+// mm2dTable<uint32_t> (ug_mm_2d_table.h)
+//
// (c) 2010-2012 Ulrich Germann
// to do: multi-threading
@@ -20,10 +20,11 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/math/distributions/binomial.hpp>
-#include <boost/unordered_map.hpp>
-#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+#include "moses/Util.h"
#include "ug_mm_2d_table.h"
#include "ug_mm_ttrack.h"
#include "ug_corpus_token.h"
@@ -35,7 +36,7 @@ using namespace boost::math;
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
typedef SimpleWordId Token;
-// DECLARATIONS
+// DECLARATIONS
void interpret_args(int ac, char* av[]);
mmTtrack<Token> T1,T2;
@@ -51,7 +52,7 @@ struct Count
Count(uint32_t ax, uint32_t cx) : a(ax), c(cx) {}
};
-bool
+bool
operator<(pair<id_type,Count> const& a,
pair<id_type,Count> const& b)
{
@@ -71,7 +72,7 @@ public:
countlist_t & LEX;
size_t offset;
size_t skip;
- Counter(countlist_t& lex, size_t o, size_t s)
+ Counter(countlist_t& lex, size_t o, size_t s)
: LEX(lex), offset(o), skip(s) {}
void processSentence(id_type sid);
void operator()();
@@ -82,7 +83,7 @@ int verbose;
size_t truncat;
size_t num_threads;
-void
+void
Counter::
operator()()
{
@@ -104,17 +105,17 @@ struct lexsorter
{
vector<countlist_t> const& v;
id_type wid;
- lexsorter(vector<countlist_t> const& vx, id_type widx)
+ lexsorter(vector<countlist_t> const& vx, id_type widx)
: v(vx),wid(widx) {}
bool operator()(pair<uint32_t,uint32_t> const& a,
pair<uint32_t,uint32_t> const& b) const
{
- return (v.at(a.first).at(wid).at(a.second).first >
+ return (v.at(a.first).at(wid).at(a.second).first >
v.at(b.first).at(wid).at(b.second).first);
}
};
-void
+void
writeTableHeader(ostream& out)
{
filepos_type idxOffset=0;
@@ -158,7 +159,7 @@ void writeTable(ostream* aln_out, ostream* coc_out)
H.pop_back();
else
push_heap(H.begin(),H.end(),sorter);
- while (H.size() &&
+ while (H.size() &&
XLEX[H[0].first][id1].at(H[0].second).first == id2)
{
aln += XLEX[H[0].first][id1][H[0].second].second.a;
@@ -177,7 +178,7 @@ void writeTable(ostream* aln_out, ostream* coc_out)
numwrite(*aln_out,aln);
m1a[id1] += aln;
m2a[id2] += aln;
- }
+ }
if (coc_out && coc)
{
++CellCountC;
@@ -190,7 +191,7 @@ void writeTable(ostream* aln_out, ostream* coc_out)
}
idxa.back() = CellCountA;
idxc.back() = CellCountC;
- if (aln_out)
+ if (aln_out)
{
filepos_type idxOffsetA = aln_out->tellp();
BOOST_FOREACH(id_type foo, idxa)
@@ -200,7 +201,7 @@ void writeTable(ostream* aln_out, ostream* coc_out)
aln_out->seekp(0);
numwrite(*aln_out,idxOffsetA);
}
- if (coc_out)
+ if (coc_out)
{
filepos_type idxOffsetC = coc_out->tellp();
BOOST_FOREACH(id_type foo, idxc)
@@ -222,9 +223,9 @@ processSentence(id_type sid)
Token const* e2 = T2.sntEnd(sid);
vector<ushort> cnt1(V1.ksize(),0);
vector<ushort> cnt2(V2.ksize(),0);
- for (Token const* x = s1; x < e1; ++x)
+ for (Token const* x = s1; x < e1; ++x)
++cnt1.at(x->id());
- for (Token const* x = s2; x < e2; ++x)
+ for (Token const* x = s2; x < e2; ++x)
++cnt2.at(x->id());
boost::unordered_set<wpair> seen;
@@ -241,10 +242,14 @@ processSentence(id_type sid)
p = binread(p,r);
p = binread(p,c);
// cout << sid << " " << r << "-" << c << endl;
- assert(r < check1.size());
- assert(c < check2.size());
- assert(s1+r < e1);
- assert(s2+c < e2);
+ UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid);
+ UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid);
+ // assert(r < check1.size());
+ // assert(c < check2.size());
+ UTIL_THROW_IF2(s1+r >= e1, "out of bounds at line " << sid);
+ UTIL_THROW_IF2(s2+c >= e2, "out of bounds at line " << sid);
+ // assert(s1+r < e1);
+ // assert(s2+c < e2);
check1.reset(r);
check2.reset(c);
id_type id1 = (s1+r)->id();
@@ -252,81 +257,21 @@ processSentence(id_type sid)
wpair k(id1,id2);
Count& cnt = CNT[k];
cnt.a++;
- if (seen.insert(k).second)
+ if (seen.insert(k).second)
cnt.c += cnt1[id1] * cnt2[id2];
}
// count unaliged words
- for (size_t i = check1.find_first();
- i < check1.size();
+ for (size_t i = check1.find_first();
+ i < check1.size();
i = check1.find_next(i))
CNT[wpair((s1+i)->id(),0)].a++;
- for (size_t i = check2.find_first();
- i < check2.size();
+ for (size_t i = check2.find_first();
+ i < check2.size();
i = check2.find_next(i))
CNT[wpair(0,(s2+i)->id())].a++;
}
-// void
-// writeTable(string ofname,
-// vector<vector<uint32_t> >& FREQ,
-// vector<map<id_type,uint32_t> >& RARE)
-// {
-// ofstream out(ofname.c_str());
-// filepos_type idxOffset=0;
-
-// vector<uint32_t> m1; // marginals L1
-// vector<uint32_t> m2; // marginals L2
-// m1.resize(max(first_rare_id,V1.getNumTokens()),0);
-// m2.resize(V2.getNumTokens(),0);
-// vector<id_type> index(V1.getNumTokens()+1,0);
-// numwrite(out,idxOffset); // blank for the time being
-// numwrite(out,id_type(m1.size()));
-// numwrite(out,id_type(m2.size()));
-
-// id_type cellCount=0;
-// id_type stop = min(first_rare_id,id_type(m1.size()));
-// for (id_type id1 = 0; id1 < stop; ++id1)
-// {
-// index[id1] = cellCount;
-// vector<uint32_t> const& v = FREQ[id1];
-// for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
-// {
-// if (!v[id2]) continue;
-// cellCount++;
-// numwrite(out,id2);
-// out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
-// m1[id1] += v[id2];
-// m2[id2] += v[id2];
-// }
-// }
-// for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
-// {
-// index[id1] = cellCount;
-// map<id_type,uint32_t> const& M = RARE[id1];
-// for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
-// {
-// if (m->second == 0) continue;
-// cellCount++;
-// numwrite(out,m->first);
-// out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
-// m1[id1] += m->second;
-// m2[m->first] += m->second;
-// }
-// }
-// index[m1.size()] = cellCount;
-// idxOffset = out.tellp();
-// for (size_t i = 0; i < index.size(); ++i)
-// numwrite(out,index[i]);
-// out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
-// out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
-
-// // re-write the file header
-// out.seekp(0);
-// numwrite(out,idxOffset);
-// out.close();
-// }
-
-int
+int
main(int argc, char* argv[])
{
interpret_args(argc,argv);
@@ -354,7 +299,7 @@ main(int argc, char* argv[])
if (cooc.size()) coc_out.close();
}
-void
+void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
@@ -376,7 +321,7 @@ interpret_args(int ac, char* av[])
("truncate,n", po::value<size_t>(&truncat)->default_value(0),
"truncate corpus to <N> sentences (for debugging)")
;
-
+
h.add_options()
("bname", po::value<string>(&bname), "base name")
("L1", po::value<string>(&L1),"L1 tag")
diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
index fbdceeaa0..3ba9ef492 100644
--- a/moses/TranslationModel/UG/mm/mmlex-lookup.cc
+++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
@@ -1,8 +1,8 @@
// -*- c++ -*-
// Program to extract word cooccurrence counts from a memory-mapped
// word-aligned bitext stores the counts lexicon in the format for
-// mm2dTable<uint32_t> (ug_mm_2d_table.h)
-//
+// mm2dTable<uint32_t> (ug_mm_2d_table.h)
+//
// (c) 2010-2012 Ulrich Germann
// to do: multi-threading
@@ -20,8 +20,8 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/math/distributions/binomial.hpp>
-#include <boost/unordered_map.hpp>
-#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "ug_mm_2d_table.h"
@@ -35,7 +35,7 @@ using namespace boost::math;
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
typedef SimpleWordId Token;
-// DECLARATIONS
+// DECLARATIONS
void interpret_args(int ac, char* av[]);
string swrd,twrd,L1,L2,bname;
@@ -43,7 +43,7 @@ TokenIndex V1,V2;
LEX_t LEX;
-void
+void
lookup_source(ostream& out, id_type r)
{
vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop);
@@ -57,7 +57,7 @@ lookup_source(ostream& out, id_type r)
}
}
-void
+void
lookup_target(ostream& out, id_type c)
{
vector<LEX_t::Cell> foo;
@@ -65,7 +65,7 @@ lookup_target(ostream& out, id_type c)
for (size_t r = 0; r < LEX.numRows; ++r)
{
size_t j = LEX[r][c];
- if (j)
+ if (j)
{
cell.id = r;
cell.val = j;
@@ -82,7 +82,7 @@ lookup_target(ostream& out, id_type c)
}
}
-void
+void
dump(ostream& out)
{
for (size_t r = 0; r < LEX.numRows; ++r)
@@ -91,7 +91,7 @@ dump(ostream& out)
}
-int
+int
main(int argc, char* argv[])
{
interpret_args(argc,argv);
@@ -100,14 +100,14 @@ main(int argc, char* argv[])
V1.open(bname+L1+".tdx");
V2.open(bname+L2+".tdx");
LEX.open(bname+L1+"-"+L2+".lex");
-
+
cout.precision(2);
id_type swid = V1[swrd];
id_type twid = V2[twrd];
if (swid != 1 && twid != 1)
{
- cout << swrd << " " << twrd << " "
- << LEX.m1(swid) << " / "
+ cout << swrd << " " << twrd << " "
+ << LEX.m1(swid) << " / "
<< LEX[swid][twid] << " / "
<< LEX.m2(twid) << endl;
}
@@ -119,7 +119,7 @@ main(int argc, char* argv[])
dump(cout);
}
-void
+void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
@@ -133,7 +133,7 @@ interpret_args(int ac, char* av[])
("source,s",po::value<string>(&swrd),"source word")
("target,t",po::value<string>(&twrd),"target word")
;
-
+
h.add_options()
("bname", po::value<string>(&bname), "base name")
("L1", po::value<string>(&L1),"L1 tag")
diff --git a/moses/TranslationModel/UG/mm/mtt-build.cc b/moses/TranslationModel/UG/mm/mtt-build.cc
index f49895ebf..a61cbac3f 100644
--- a/moses/TranslationModel/UG/mm/mtt-build.cc
+++ b/moses/TranslationModel/UG/mm/mtt-build.cc
@@ -46,8 +46,8 @@ bool quiet = false; // no progress reporting
string vocabBase; // base name for existing vocabs that should be used
string baseName; // base name for all files
-string tmpFile, mttFile; /* name of temporary / actual track file
- * (.mtt for Conll format, .mct for plain text)
+string tmpFile, mttFile; /* name of temporary / actual track file
+ * (.mtt for Conll format, .mct for plain text)
*/
string UNK;
@@ -60,7 +60,7 @@ void interpret_args(int ac, char* av[]);
inline uchar rangeCheck(int p, int limit) { return p < limit ? p : 1; }
-id_type
+id_type
get_id(TokenIndex const& T, string const& w)
{
id_type ret = T[w];
@@ -73,21 +73,21 @@ get_id(TokenIndex const& T, string const& w)
return ret;
}
-void
+void
open_vocab(TokenIndex& T, string fname)
{
- if (!access(fname.c_str(), F_OK))
- {
- T.open(fname,UNK);
- assert(T[UNK] == 1);
+ if (!access(fname.c_str(), F_OK))
+ {
+ T.open(fname,UNK);
+ assert(T[UNK] == 1);
}
else T.setUnkLabel(UNK);
if (incremental) T.setDynamic(true);
- assert(T["NULL"] == 0);
+ assert(T["NULL"] == 0);
assert(T[UNK] == 1);
}
-void
+void
ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
{
v.resize(T.totalVocabSize());
@@ -142,7 +142,7 @@ void fill_rec(Conll_Record& rec, vector<string> const& w)
else if (w.size() >= 8) // CONLL format
{
int id = atoi(w[0].c_str());
- int gov = atoi(w[6].c_str());
+ int gov = atoi(w[6].c_str());
rec.sform = get_id(SF, w[1]);
rec.lemma = get_id(LM, w[2]);
rec.majpos = rangeCheck(get_id(PS, w[3]), 256);
@@ -161,12 +161,12 @@ void log_progress(size_t ctr)
}
else if (ctr % 10000 == 0)
{
- cerr << ".";
+ cerr << ".";
}
}
-size_t
+size_t
process_plain_input(ostream& out, vector<id_type> & s_index)
{
id_type totalWords = 0;
@@ -176,7 +176,7 @@ process_plain_input(ostream& out, vector<id_type> & s_index)
istringstream buf(line);
if (!quiet) log_progress(s_index.size());
s_index.push_back(totalWords);
- while (buf>>w)
+ while (buf>>w)
{
numwrite(out,get_id(SF,w));
++totalWords;
@@ -186,9 +186,9 @@ process_plain_input(ostream& out, vector<id_type> & s_index)
return totalWords;
}
-size_t
-process_tagged_input(ostream& out,
- vector<id_type> & s_index,
+size_t
+process_tagged_input(ostream& out,
+ vector<id_type> & s_index,
vector<id_type> & p_index)
{
string line;
@@ -196,7 +196,7 @@ process_tagged_input(ostream& out,
bool new_sent = true;
bool new_par = true;
id_type totalWords = 0;
-
+
while (getline(cin,line))
{
vector<string> w; string f; istringstream buf(line);
@@ -205,7 +205,7 @@ process_tagged_input(ostream& out,
if (w.size() == 0 || starts_with(w[0], "SID="))
new_sent = true;
- else if (w.size() == 1 && w[0] == "<P>")
+ else if (w.size() == 1 && w[0] == "<P>")
new_par = new_sent = true;
if (w.size() < 3) continue;
@@ -244,7 +244,7 @@ numberize()
index = &p_index;
}
- if (!quiet)
+ if (!quiet)
cerr << endl << "Writing index ... (" << index->size() << " chunks) ";
startIdx = out.tellp();
@@ -261,7 +261,7 @@ numberize()
vector<id_type> smap,lmap,pmap,dmap;
-void
+void
invert(vector<id_type> const& from, vector<id_type> & to)
{
to.resize(from.size());
@@ -269,11 +269,11 @@ invert(vector<id_type> const& from, vector<id_type> & to)
to[from[i]] = i;
}
-// sorts new items based on occurrence counts but won't reassign
+// sorts new items based on occurrence counts but won't reassign
// existing token ids
-void
-conservative_sort(TokenIndex const & V,
- vector<size_t> const & cnt,
+void
+conservative_sort(TokenIndex const & V,
+ vector<size_t> const & cnt,
vector<id_type> & xmap)
{
xmap.resize(V.totalVocabSize());
@@ -344,21 +344,21 @@ void save_vocabs()
string vbase = baseName;
if (is_conll)
{
- if (SF.totalVocabSize() > SF.knownVocabSize())
+ if (SF.totalVocabSize() > SF.knownVocabSize())
write_tokenindex(vbase+".tdx.sfo",SF,smap);
- if (LM.totalVocabSize() > LM.knownVocabSize())
+ if (LM.totalVocabSize() > LM.knownVocabSize())
write_tokenindex(vbase+".tdx.lem",LM,lmap);
- if (PS.totalVocabSize() > PS.knownVocabSize())
+ if (PS.totalVocabSize() > PS.knownVocabSize())
write_tokenindex(vbase+".tdx.pos",PS,pmap);
- if (DT.totalVocabSize() > DT.knownVocabSize())
+ if (DT.totalVocabSize() > DT.knownVocabSize())
write_tokenindex(vbase+".tdx.drl",DT,dmap);
}
- else if (SF.totalVocabSize() > SF.knownVocabSize())
+ else if (SF.totalVocabSize() > SF.knownVocabSize())
write_tokenindex(vbase+".tdx",SF,smap);
}
template<typename Token>
-size_t
+size_t
build_mmTSA(string infile, string outfile)
{
size_t mypid = fork();
@@ -371,14 +371,14 @@ build_mmTSA(string infile, string outfile)
exit(0);
}
-bool
+bool
build_plaintext_tsas()
{
typedef L2R_Token<SimpleWordId> L2R;
typedef R2L_Token<SimpleWordId> R2L;
size_t c = with_sfas + with_pfas;
- if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa");
- if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa");
+ if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa");
+ if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa");
while (c--) wait(NULL);
return true;
}
@@ -388,27 +388,27 @@ void build_conll_tsas()
string bn = baseName;
string mtt = tmpFile;
size_t c = 3 * (with_sfas + with_pfas + with_dcas);
- if (with_sfas)
+ if (with_sfas)
{
build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform");
build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma");
build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos");
}
- if (with_pfas)
+ if (with_pfas)
{
build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform");
build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma");
build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos");
}
- if (with_dcas)
+ if (with_dcas)
{
- build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
- build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
+ build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
+ build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos");
}
- while (c--) wait(NULL);
+ while (c--) wait(NULL);
}
@@ -430,7 +430,7 @@ int main(int argc, char* argv[])
rename(tmpFile.c_str(),mttFile.c_str());
}
-void
+void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
@@ -439,10 +439,10 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
- ("quiet,q", po::bool_switch(&quiet),
+ ("quiet,q", po::bool_switch(&quiet),
"don't print progress information")
- ("incremental,i", po::bool_switch(&incremental),
+ ("incremental,i", po::bool_switch(&incremental),
"incremental mode; rewrites vocab files!")
("vocab-base,v", po::value<string>(&vocabBase),
@@ -451,15 +451,15 @@ interpret_args(int ac, char* av[])
("output,o", po::value<string>(&baseName),
"base file name of the resulting file(s)")
- ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
+ ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
"also build suffix arrays")
("pfa,p", po::value<int>(&with_pfas)
- ->default_value(0)->implicit_value(1),
+ ->default_value(0)->implicit_value(1),
"also build prefix arrays")
("dca,d", po::value<int>(&with_dcas)
- ->default_value(0)->implicit_value(1),
+ ->default_value(0)->implicit_value(1),
"also build dependency chain arrays")
("conll,c", po::bool_switch(&is_conll),
@@ -468,18 +468,18 @@ interpret_args(int ac, char* av[])
("unk,u", po::value<string>(&UNK)->default_value("UNK"),
"label for unknown tokens")
- // ("map,m", po::value<string>(&vmap),
+ // ("map,m", po::value<string>(&vmap),
// "map words to word classes for indexing")
-
+
;
-
+
po::options_description h("Hidden Options");
h.add_options()
;
h.add(o);
po::positional_options_description a;
a.add("output",1);
-
+
po::store(po::command_line_parser(ac,av)
.options(h)
.positional(a)
@@ -487,7 +487,7 @@ interpret_args(int ac, char* av[])
po::notify(vm);
if (vm.count("help") || !vm.count("output"))
{
- cout << "\nusage:\n\t cat <corpus> | " << av[0]
+ cout << "\nusage:\n\t cat <corpus> | " << av[0]
<< " [options] <output .mtt file>" << endl;
cout << o << endl;
exit(0);
diff --git a/moses/TranslationModel/UG/mm/mtt-count-words.cc b/moses/TranslationModel/UG/mm/mtt-count-words.cc
index c9b435477..223ba2090 100644
--- a/moses/TranslationModel/UG/mm/mtt-count-words.cc
+++ b/moses/TranslationModel/UG/mm/mtt-count-words.cc
@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
{
interpret_args(argc,argv);
T.open(bname+".mct");
- V.open(bname+".tdx");
+ V.open(bname+".tdx");
vector<size_t> cnt(V.ksize(),0);
for (size_t sid = 0; sid < T.size(); ++sid)
{
@@ -48,7 +48,7 @@ int main(int argc, char* argv[])
exit(0);
}
-void
+void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
@@ -60,7 +60,7 @@ interpret_args(int ac, char* av[])
o.add_options()
("help,h", "print this message")
;
-
+
h.add_options()
("bname", po::value<string>(&bname), "base name")
;
diff --git a/moses/TranslationModel/UG/mm/mtt-demo1.cc b/moses/TranslationModel/UG/mm/mtt-demo1.cc
index a253e9ed3..d3506fa0f 100644
--- a/moses/TranslationModel/UG/mm/mtt-demo1.cc
+++ b/moses/TranslationModel/UG/mm/mtt-demo1.cc
@@ -21,17 +21,17 @@ int main(int argc, char* argv[])
using namespace std;
if (argc < 3)
{
- cerr << "usage: " << argv[0] << " <track base name> lookup word sequence"
+ cerr << "usage: " << argv[0] << " <track base name> lookup word sequence"
<< endl;
}
string base = argv[1];
- TokenIndex V;
+ TokenIndex V;
V.open(base+".tdx");
- boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>());
+ boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>());
T->open(base+".mct");
mmTSA<Token> I; I.open(base+".sfa",T);
mmTSA<Token>::tree_iterator m(&I);
-
+
// look up the search string m.extend() returns true upon success
for (int i = 2; i < argc && m.extend(V[argv[i]]); ++i);
if (int(m.size() + 2) < argc)
@@ -39,7 +39,7 @@ int main(int argc, char* argv[])
cerr << "NOT FOUND" << endl;
exit(1);
}
-
+
tsa::ArrayEntry e(m.lower_bound(-1));
char const* stop = m.upper_bound(-1);
do
diff --git a/moses/TranslationModel/UG/mm/mtt-dump.cc b/moses/TranslationModel/UG/mm/mtt-dump.cc
index b7d85d623..eea1bb400 100644
--- a/moses/TranslationModel/UG/mm/mtt-dump.cc
+++ b/moses/TranslationModel/UG/mm/mtt-dump.cc
@@ -25,7 +25,7 @@ bool sform;
bool have_mtt, have_mct;
bool with_sids;
bool with_positions;
-void
+void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
@@ -36,7 +36,7 @@ interpret_args(int ac, char* av[])
("sform,s", po::bool_switch(&sform), "sform only")
("with-positions,p", po::bool_switch(&with_positions), "show word positions")
;
-
+
po::options_description h("Hidden Options");
h.add_options()
("bname", po::value<string>(&bname), "base name")
@@ -45,7 +45,7 @@ interpret_args(int ac, char* av[])
po::positional_options_description a;
a.add("bname",1);
a.add("range",-1);
-
+
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
@@ -63,11 +63,11 @@ interpret_args(int ac, char* av[])
mct = bname+".mct";
}
-void
+void
printRangeMTT(size_t start, size_t stop)
{
for (;start < stop; start++)
- {
+ {
size_t i = 0;
Token const* s = MTT.sntStart(start);
Token const* e = MTT.sntEnd(start);
@@ -92,7 +92,7 @@ printRangeMTT(size_t start, size_t stop)
cout << i+t->parent << " ";
cout << DT[t->dtype] << endl;
}
- else
+ else
{
if (with_positions) cout << t-s << ":";
cout << SF[t->id()] << " ";
@@ -102,16 +102,16 @@ printRangeMTT(size_t start, size_t stop)
}
}
-void
+void
printRangeMCT(size_t start, size_t stop)
{
for (;start < stop; start++)
- {
+ {
SimpleWordId const* s = MCT.sntStart(start);
SimpleWordId const* t = s;
SimpleWordId const* e = MCT.sntEnd(start);
if (with_sids) cout << start << " ";
- while (t < e)
+ while (t < e)
{
if (with_positions) cout << t-s << ":";
cout << SF[(t++)->id()] << " ";
@@ -120,7 +120,7 @@ printRangeMCT(size_t start, size_t stop)
}
}
-int
+int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
@@ -139,14 +139,14 @@ main(int argc, char*argv[])
DT.open(bname+".tdx.drl"); DT.iniReverseIndex();
MTT.open(mtt);
}
- else
+ else
{
sform = true;
SF.open(bname+".tdx"); SF.iniReverseIndex();
MCT.open(mct);
}
-
- if (!range.size())
+
+ if (!range.size())
have_mtt ? printRangeMTT(0, MTT.size()) : printRangeMCT(0, MCT.size());
else
{
@@ -157,9 +157,9 @@ main(int argc, char*argv[])
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
- if (have_mtt && last < MTT.size())
+ if (have_mtt && last < MTT.size())
printRangeMTT(first,last+1);
- else if (last < MCT.size())
+ else if (last < MCT.size())
printRangeMCT(first,last+1);
}
}
diff --git a/moses/TranslationModel/UG/mm/mtt.count.cc b/moses/TranslationModel/UG/mm/mtt.count.cc
index 423c12ec7..1e2382f67 100644
--- a/moses/TranslationModel/UG/mm/mtt.count.cc
+++ b/moses/TranslationModel/UG/mm/mtt.count.cc
@@ -36,14 +36,14 @@ bool echo;
int main(int argc, char* argv[])
{
interpret_args(argc,argv);
-
+
T.open(bname+".mct");
V.open(bname+".tdx"); V.iniReverseIndex();
I.open(bname+".sfa",&T);
string line;
while (getline(cin,line))
{
- vector<id_type> phr;
+ vector<id_type> phr;
V.fillIdSeq(line,phr);
TSA<Token>::tree_iterator m(&I);
size_t i = 0;
@@ -55,7 +55,7 @@ int main(int argc, char* argv[])
exit(0);
}
-void
+void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
@@ -68,7 +68,7 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("echo,e", po::bool_switch(&echo), "repeat lookup phrases")
;
-
+
h.add_options()
("bname", po::value<string>(&bname), "base name")
;
diff --git a/moses/TranslationModel/UG/mm/num_read_write.cc b/moses/TranslationModel/UG/mm/num_read_write.cc
index 403f7d300..5c281d9dd 100644
--- a/moses/TranslationModel/UG/mm/num_read_write.cc
+++ b/moses/TranslationModel/UG/mm/num_read_write.cc
@@ -2,7 +2,7 @@
namespace ugdiss {
typedef unsigned char uchar;
- void
+ void
numwrite(std::ostream& out, uint16_t const& x)
{
char buf[2];
@@ -11,7 +11,7 @@ namespace ugdiss {
out.write(buf,2);
}
- void
+ void
numwrite(std::ostream& out, uint32_t const& x)
{
char buf[4];
@@ -22,7 +22,7 @@ namespace ugdiss {
out.write(buf,4);
}
- void
+ void
numwrite(std::ostream& out, uint64_t const& x)
{
char buf[8];
@@ -37,7 +37,7 @@ namespace ugdiss {
out.write(buf,8);
}
- char const*
+ char const*
numread(char const* src, uint16_t & x)
{
uchar const* d = reinterpret_cast<uchar const*>(src);
@@ -45,28 +45,28 @@ namespace ugdiss {
return src+2;
}
- char const*
+ char const*
numread(char const* src, uint32_t & x)
{
uchar const* d = reinterpret_cast<uchar const*>(src);
- x = ((uint32_t(d[0])<<0) |
- (uint32_t(d[1])<<8) |
- (uint32_t(d[2])<<16)|
+ x = ((uint32_t(d[0])<<0) |
+ (uint32_t(d[1])<<8) |
+ (uint32_t(d[2])<<16)|
(uint32_t(d[3])<<24));
return src+4;
}
- char const*
+ char const*
numread(char const* src, uint64_t & x)
{
uchar const* d = reinterpret_cast<uchar const*>(src);
- x = ((uint64_t(d[0])<<0) |
- (uint64_t(d[1])<<8) |
- (uint64_t(d[2])<<16) |
+ x = ((uint64_t(d[0])<<0) |
+ (uint64_t(d[1])<<8) |
+ (uint64_t(d[2])<<16) |
(uint64_t(d[3])<<24) |
- (uint64_t(d[4])<<32) |
- (uint64_t(d[5])<<40) |
- (uint64_t(d[6])<<48) |
+ (uint64_t(d[4])<<32) |
+ (uint64_t(d[5])<<40) |
+ (uint64_t(d[6])<<48) |
(uint64_t(d[7])<<56));
return src+8;
}
diff --git a/moses/TranslationModel/UG/mm/num_read_write.h b/moses/TranslationModel/UG/mm/num_read_write.h
index 6fdcecc81..f83e1c982 100644
--- a/moses/TranslationModel/UG/mm/num_read_write.h
+++ b/moses/TranslationModel/UG/mm/num_read_write.h
@@ -14,11 +14,11 @@ namespace ugdiss {
void numwrite(std::ostream& out, uint16_t const& x);
void numwrite(std::ostream& out, uint32_t const& x);
void numwrite(std::ostream& out, uint64_t const& x);
-
+
char const* numread(char const* src, uint16_t & x);
char const* numread(char const* src, uint32_t & x);
char const* numread(char const* src, uint64_t & x);
-
+
// template<typename uintNumber>
// void
// numwrite(std::ostream& out, uintNumber const& x)
@@ -54,7 +54,7 @@ namespace ugdiss {
// case 8: x = bswap_64(x); break;
// default: break;
// }
-// #endif
+// #endif
// }
// template<typename uintNumber>
@@ -71,7 +71,7 @@ namespace ugdiss {
// case 8: x = bswap_64(x); break;
// default: break;
// }
-// #endif
+// #endif
// return src+sizeof(uintNumber);
// }
} // end of namespace ugdiss
diff --git a/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h b/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h
index 1810027af..e5e9ca88c 100644
--- a/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h
+++ b/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h
@@ -39,8 +39,8 @@ namespace Moses {
class jstats; // phrase pair ("joint") statistics
class agenda
{
- boost::mutex lock;
- boost::condition_variable ready;
+ boost::mutex lock;
+ boost::condition_variable ready;
class job;
class worker;
list<job> joblist;
@@ -52,9 +52,9 @@ namespace Moses {
agenda(bitext_base const& bitext);
~agenda();
void add_workers(int n);
- sptr<pstats> add_job(mmbitext::iter const& phrase,
+ sptr<pstats> add_job(mmbitext::iter const& phrase,
size_t const max_samples);
- bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
+ bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
bool & fwd, sptr<bitext_base::pstats> & stats);
};
@@ -65,22 +65,22 @@ namespace Moses {
mmTtrack<char> Tx; // word alignments
mmTtrack<Token> T1,T2; // token tracks
TokenIndex V1,V2; // vocabs
- mmTSA<Token> I1,I2; // suffix arrays
+ mmTSA<Token> I1,I2; // suffix arrays
/// given the source phrase sid[start:stop]
- // find the possible start (s1 .. s2) and end (e1 .. e2)
+ // find the possible start (s1 .. s2) and end (e1 .. e2)
// points of the target phrase; if non-NULL, store word
- // alignments in *core_alignment. If /flip/, source phrase is
+ // alignments in *core_alignment. If /flip/, source phrase is
// L2.
- bool
+ bool
find_trg_phr_bounds
- (size_t const sid, size_t const start, size_t const stop,
- size_t & s1, size_t & s2, size_t & e1, size_t & e2,
+ (size_t const sid, size_t const start, size_t const stop,
+ size_t & s1, size_t & s2, size_t & e1, size_t & e2,
vector<uchar> * core_alignment, bool const flip) const;
boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
private:
- sptr<pstats>
+ sptr<pstats>
prep2(iter const& phrase);
public:
mmbitext();
@@ -98,8 +98,8 @@ namespace Moses {
jstats
{
uint32_t my_rcnt; // unweighted count
- float my_wcnt; // weighted count
- vector<pair<size_t, vector<uchar> > > my_aln;
+ float my_wcnt; // weighted count
+ vector<pair<size_t, vector<uchar> > > my_aln;
boost::mutex lock;
public:
jstats();
@@ -110,22 +110,22 @@ namespace Moses {
void add(float w, vector<uchar> const& a);
};
- struct
+ struct
mmbitext::
pstats
{
boost::mutex lock; // for parallel gathering of stats
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
- size_t raw_cnt; // (approximate) raw occurrence count
+ size_t raw_cnt; // (approximate) raw occurrence count
size_t sample_cnt; // number of instances selected during sampling
size_t good; // number of selected instances with valid word alignments
size_t sum_pairs;
- // size_t snt_cnt;
+ // size_t snt_cnt;
// size_t sample_snt;
size_t in_progress; // keeps track of how many threads are currently working on this
boost::unordered_map<uint64_t, jstats> trg;
- pstats();
+ pstats();
// vector<phrase> nbest;
// void select_nbest(size_t const N=10);
void release();
@@ -142,7 +142,7 @@ namespace Moses {
public:
worker(agenda& a);
void operator()();
-
+
};
class
diff --git a/moses/TranslationModel/UG/mm/symal2mam.cc b/moses/TranslationModel/UG/mm/symal2mam.cc
index 631d4ae07..6d0af57b0 100644
--- a/moses/TranslationModel/UG/mm/symal2mam.cc
+++ b/moses/TranslationModel/UG/mm/symal2mam.cc
@@ -2,9 +2,9 @@
// program to convert GIZA-style alignments into memory-mapped format
// (c) 2010 Ulrich Germann
-// Reads from stdin a file with alternating lines: sentence lengths and symal output.
-// We need the sentence lenghts for sanity checks, because GIZA alignment might skip
-// sentences. If --skip, we skip such sentence pairs, otherwise, we leave the word
+// Reads from stdin a file with alternating lines: sentence lengths and symal output.
+// We need the sentence lenghts for sanity checks, because GIZA alignment might skip
+// sentences. If --skip, we skip such sentence pairs, otherwise, we leave the word
// alignment matrix blank.
#include "ug_mm_ttrack.h"
@@ -24,7 +24,7 @@
#include "util/exception.hh"
// #include "headers-base/util/check.hh"
-// NOTE TO SELF:
+// NOTE TO SELF:
/* Program to filter out sentences that GIZA will skip or truncate,
* i.e. sentences longer than 100 words or sentence pairs with a length
*/
@@ -42,7 +42,7 @@ TokenIndex V1;
string mtt1name,mtt2name,o1name,o2name,mamname,cfgFile;
string dataFormat,A3filename;
-void
+void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
@@ -63,7 +63,7 @@ interpret_args(int ac, char* av[])
("t2", po::value<string>(&mtt2name), "file name of L2 mapped token track")
("format,F", po::value<string>(&dataFormat)->default_value("plain"), "data format (plain or conll)")
;
-
+
h.add_options()
("mamname", po::value<string>(&mamname), "name of output file for mam")
;
@@ -76,8 +76,8 @@ interpret_args(int ac, char* av[])
if (vm.count("help") || mamname.empty())
{
cout << "usage:\n"
- << "\t\n"
- << "\t ... | " << av[0]
+ << "\t\n"
+ << "\t ... | " << av[0]
<< " <.mam file> \n" << endl;
cout << o << endl;
cout << "If an A3 file is given (as produced by (m)giza), symal2mam performs\n"
@@ -117,8 +117,8 @@ procSymalLine(string const& line, ostream& out)
{
cerr << a << "-" << b << " " << len1 << "/" << len2 << endl;
}
- assert(len1 == 0 || a<len1);
- assert(len2 == 0 || b<len2);
+ assert(len1 == 0 || a<len1);
+ assert(len2 == 0 || b<len2);
binwrite(out,a);
binwrite(out,b);
}
@@ -138,7 +138,7 @@ void finiMAM(ofstream& out, vector<id_type>& idx, id_type numTok)
out.close();
}
-void
+void
finalize(ofstream& out, vector<id_type> const& idx, id_type tokenCount)
{
id_type idxSize = idx.size();
@@ -184,7 +184,7 @@ go()
while(getline(cin,line))
{
idxm.push_back(procSymalLine(line,mam));
- if (debug && ++ctr%100000==0)
+ if (debug && ++ctr%100000==0)
cerr << ctr/1000 << "K lines processed" << endl;
}
finiMAM(mam,idxm,0);
@@ -208,20 +208,20 @@ go(string t1name, string t2name, string A3filename)
for (sid = 0; sid < T1.size(); ++sid)
{
- len1 = T1.sntLen(sid);
+ len1 = T1.sntLen(sid);
len2 = T2.sntLen(sid);
- if (debug)
- cerr << "[" << lineCtr << "] "
- << len1 << " (" << check1 << ") / "
+ if (debug)
+ cerr << "[" << lineCtr << "] "
+ << len1 << " (" << check1 << ") / "
<< len2 << " (" << check2 << ")" << endl;
- if ((check1 >=0 && check1!=len1) ||
+ if ((check1 >=0 && check1!=len1) ||
(check2 >=0 && check2!=len2))
{
if (skip)
{
- cerr << "[" << ++skipCtr << "] skipping "
- << check1 << "/" << check2 << " vs. "
- << len1 << "/" << len2
+ cerr << "[" << ++skipCtr << "] skipping "
+ << check1 << "/" << check2 << " vs. "
+ << len1 << "/" << len2
<< " at line " << lineCtr << endl;
}
else
@@ -238,9 +238,9 @@ go(string t1name, string t2name, string A3filename)
}
if (skip)
{
- idx1.push_back(tokenCount1 += len1);
+ idx1.push_back(tokenCount1 += len1);
copySentence(T1,sid,t1out);
- idx2.push_back(tokenCount2 += len2);
+ idx2.push_back(tokenCount2 += len2);
copySentence(T2,sid,t2out);
}
@@ -250,7 +250,7 @@ go(string t1name, string t2name, string A3filename)
lineCtr++;
idxm.push_back(procSymalLine(line,mam));
if (debug) cerr << "[" << lineCtr << "] "
- << check1 << " (" << len1 <<") "
+ << check1 << " (" << len1 <<") "
<< check2 << " (" << len2 <<") "
<< line << endl;
getCheckValues(A3file,check1,check2);
@@ -264,7 +264,7 @@ go(string t1name, string t2name, string A3filename)
cout << idxm.size() << endl;
}
-void
+void
initialize(ofstream& out, string const& fname)
{
out.open(fname.c_str());
diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.cc b/moses/TranslationModel/UG/mm/tpt_pickler.cc
index c23913fc2..353e5b901 100644
--- a/moses/TranslationModel/UG/mm/tpt_pickler.cc
+++ b/moses/TranslationModel/UG/mm/tpt_pickler.cc
@@ -73,45 +73,45 @@ namespace ugdiss
data += T(c&mask) << 63;
}
- void
- binwrite(std::ostream& out, unsigned char data)
- {
+ void
+ binwrite(std::ostream& out, unsigned char data)
+ {
binwrite_unsigned_integer(out, data);
}
- void
+ void
binwrite(std::ostream& out, unsigned short data)
- {
+ {
binwrite_unsigned_integer(out, data);
}
- void
+ void
binwrite(std::ostream& out, unsigned long data)
- {
+ {
binwrite_unsigned_integer(out, data);
}
- void
+ void
binwrite(std::ostream& out, unsigned long long data)
- {
+ {
binwrite_unsigned_integer(out, data);
}
#if __WORDSIZE == 64
- void
+ void
binwrite(std::ostream& out, unsigned int data)
- {
+ {
binwrite_unsigned_integer(out, data);
}
-#else
- void
+#else
+ void
binwrite(std::ostream& out, size_t data)
- {
+ {
binwrite_unsigned_integer(out, data);
}
#endif
- void
+ void
binread(std::istream& in, unsigned short& data)
{
assert(sizeof(data)==2);
@@ -127,7 +127,7 @@ namespace ugdiss
data += uint16_t(c&mask) << 14;
}
- void
+ void
binread(std::istream& in, unsigned int& data)
{
assert(sizeof(data) == 4);
@@ -149,7 +149,7 @@ namespace ugdiss
data += uint32_t(c&mask) << 28;
}
- void
+ void
binread(std::istream& in, unsigned long& data)
{
#if __WORDSIZE == 32
@@ -185,16 +185,16 @@ namespace ugdiss
data += static_cast<unsigned long long>(c&mask) << 49;
if (c < 0) return;
in.get(c);
-
+
data += static_cast<unsigned long long>(c&mask) << 56;
if (c < 0) return;
in.get(c);
-
+
data += static_cast<unsigned long long>(c&mask) << 63;
#endif
}
- void
+ void
binread(std::istream& in, unsigned long long& data)
{
assert(sizeof(unsigned long long)==8);
@@ -231,14 +231,14 @@ namespace ugdiss
}
// writing and reading strings ...
- void
+ void
binwrite(std::ostream& out, std::string const& s)
{
size_t len = s.size();
ugdiss::binwrite(out,len);
out.write(s.c_str(),len);
}
-
+
void
binread(std::istream& in, std::string& s)
{
@@ -250,28 +250,28 @@ namespace ugdiss
buf[len] = 0;
s = buf;
}
-
+
void
binwrite(std::ostream& out, float x)
- {
- // IMPORTANT: this is not robust against the big/little endian
- // issue.
- out.write(reinterpret_cast<char*>(&x),sizeof(float));
+ {
+ // IMPORTANT: this is not robust against the big/little endian
+ // issue.
+ out.write(reinterpret_cast<char*>(&x),sizeof(float));
}
-
+
void
binread(std::istream& in, float& x)
- {
- // IMPORTANT: this is not robust against the big/little endian
- // issue.
- in.read(reinterpret_cast<char*>(&x),sizeof(x));
+ {
+ // IMPORTANT: this is not robust against the big/little endian
+ // issue.
+ in.read(reinterpret_cast<char*>(&x),sizeof(x));
}
-
+
char const *binread(char const* p, uint16_t& buf)
{
static char mask = 127;
- buf = (*p)&mask;
+ buf = (*p)&mask;
if (*p++ < 0) return p;
buf += uint16_t((*p)&mask)<<7;
if (*p++ < 0) return p;
@@ -294,26 +294,26 @@ namespace ugdiss
char const *binread(char const* p, uint32_t& buf)
{
static char mask = 127;
-
- if (*p < 0)
- {
- buf = (*p)&mask;
- return ++p;
+
+ if (*p < 0)
+ {
+ buf = (*p)&mask;
+ return ++p;
}
buf = *p;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += uint32_t((*p)&mask)<<7;
return ++p;
}
buf += uint32_t(*p)<<7;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += uint32_t((*p)&mask)<<14;
return ++p;
}
buf += uint32_t(*p)<<14;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += uint32_t((*p)&mask)<<21;
return ++p;
@@ -331,56 +331,56 @@ namespace ugdiss
char const *binread(char const* p, filepos_type& buf)
{
static char mask = 127;
-
- if (*p < 0)
- {
- buf = (*p)&mask;
- return ++p;
+
+ if (*p < 0)
+ {
+ buf = (*p)&mask;
+ return ++p;
}
buf = *p;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<7;
return ++p;
}
buf += filepos_type(*p)<<7;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<14;
return ++p;
}
buf += filepos_type(*p)<<14;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<21;
return ++p;
}
buf += filepos_type(*p)<<21;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<28;
return ++p;
}
buf += filepos_type(*p)<<28;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<35;
return ++p;
}
buf += filepos_type(*p)<<35;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<42;
return ++p;
}
buf += filepos_type(*p)<<42;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<49;
return ++p;
}
buf += filepos_type(*p)<<49;
- if (*(++p) < 0)
+ if (*(++p) < 0)
{
buf += filepos_type((*p)&mask)<<56;
return ++p;
diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.h b/moses/TranslationModel/UG/mm/tpt_pickler.h
index 7305a858e..5ac71c16d 100644
--- a/moses/TranslationModel/UG/mm/tpt_pickler.h
+++ b/moses/TranslationModel/UG/mm/tpt_pickler.h
@@ -17,30 +17,30 @@ namespace ugdiss
/// @return the size of file fname.
::uint64_t getFileSize(const std::string& fname);
- /**
- * The following functions write and read data in a compact binary
+ /**
+ * The following functions write and read data in a compact binary
* representation. Write and read errors can be checked directly
* on the ostream object after the function call, so no return value is
* necessary.*/
- void binwrite(std::ostream& out, char data);
- void binwrite(std::ostream& out, unsigned char data);
+ void binwrite(std::ostream& out, char data);
+ void binwrite(std::ostream& out, unsigned char data);
void binwrite(std::ostream& out, unsigned short data);
void binwrite(std::ostream& out, unsigned int data);
void binwrite(std::ostream& out, unsigned long data);
void binwrite(std::ostream& out, size_t data);
void binwrite(std::ostream& out, unsigned long long data);
void binwrite(std::ostream& out, std::string const& data);
- void binwrite(std::ostream& out, float data);
+ void binwrite(std::ostream& out, float data);
- void binread(std::istream& in, char &data);
- void binread(std::istream& in, unsigned char &data);
+ void binread(std::istream& in, char &data);
+ void binread(std::istream& in, unsigned char &data);
void binread(std::istream& in, unsigned short &data);
void binread(std::istream& in, unsigned int &data);
void binread(std::istream& in, unsigned long &data);
void binread(std::istream& in, size_t &data);
void binread(std::istream& in, unsigned long long &data);
void binread(std::istream& in, std::string &data);
- void binread(std::istream& in, float &data);
+ void binread(std::istream& in, float &data);
char const *binread(char const* p, uint16_t& buf);
char const *binread(char const* p, uint32_t& buf);
@@ -68,11 +68,11 @@ namespace ugdiss
/*
template<typename WHATEVER>
- char const*
+ char const*
binread(char const* p, WHATEVER* buf);
template<typename numtype>
- char const*
+ char const*
binread(char const* p, numtype& buf);
*/
@@ -113,7 +113,7 @@ namespace ugdiss
p = binread(p,v[i]);
return p;
}
-
+
template<typename T>
T read(std::istream& in)
{
@@ -132,7 +132,7 @@ namespace ugdiss
template<typename T>
- void
+ void
binwrite(std::ostream& out, std::vector<T> const& data)
{
binwrite(out,data.size());
@@ -141,7 +141,7 @@ namespace ugdiss
}
template<typename T>
- void
+ void
binread(std::istream& in, std::vector<T>& data)
{
size_t s;
@@ -157,8 +157,8 @@ namespace ugdiss
{
size_t s; K k; V v;
binread(in,s);
- data.clear();
- // I have no idea why this is necessary, but it is, even when
+ data.clear();
+ // I have no idea why this is necessary, but it is, even when
// /data/ is supposed to be empty
for (size_t i = 0; i < s; i++)
{
@@ -174,7 +174,7 @@ namespace ugdiss
binwrite(std::ostream& out, std::map<K,V> const& data)
{
binwrite(out,data.size());
- for (typename std::map<K,V>::const_iterator m = data.begin();
+ for (typename std::map<K,V>::const_iterator m = data.begin();
m != data.end(); m++)
{
binwrite(out,m->first);
@@ -200,7 +200,7 @@ namespace ugdiss
template<typename WHATEVER>
- char const*
+ char const*
binread(char const* p, WHATEVER* buf)
{
#ifdef VERIFY_TIGHT_PACKING
@@ -209,6 +209,6 @@ namespace ugdiss
return binread(p,*buf);
}
-
+
} // end namespace ugdiss
#endif
diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.cc b/moses/TranslationModel/UG/mm/tpt_tightindex.cc
index da28c6d93..72cf0c183 100644
--- a/moses/TranslationModel/UG/mm/tpt_tightindex.cc
+++ b/moses/TranslationModel/UG/mm/tpt_tightindex.cc
@@ -8,10 +8,10 @@
*/
//
// ugTightIndex.cc
-//
+//
// Made by Ulrich Germann
// Login <germann@germann-laptop>
-//
+//
// Started on Tue Jul 17 15:09:33 2007 Ulrich Germann
// Started on Tue Jul 17 15:09:33 2007 Ulrich Germann
//
@@ -63,7 +63,7 @@ namespace ugdiss
// }
// #define LOG_WRITE_ACTIVITY
-
+
// write a key or value into a tight index
// flag indicates wheter it's a key or a value
void tightwrite(std::ostream& out, uint64_t data, bool flag)
@@ -80,10 +80,10 @@ namespace ugdiss
std::cerr << " with flag 1 ";
#endif
while (data >= 128)
- {
+ {
char c = char(data%128)|char(-128);
- out.put(c);
- data >>= 7;
+ out.put(c);
+ data >>= 7;
#ifdef LOG_WRITE_ACTIVITY
bytes_written++;
#endif
@@ -99,7 +99,7 @@ namespace ugdiss
while (data >= 128)
{
char c = data&127;
- out.put(c);
+ out.put(c);
data >>= 7;
#ifdef LOG_WRITE_ACTIVITY
bytes_written++;
@@ -112,16 +112,16 @@ namespace ugdiss
std::cerr << " in " << bytes_written << " bytes" << std::endl;
#endif
}
-
-// For the code below: does it make a difference if I hard-code the
+
+// For the code below: does it make a difference if I hard-code the
// unraveled loop or does code optimization by the compiler take care
// of that?
#define DEBUG_TIGHTREAD 0
- // read a key value from a tight index; filepos_type must be at least as
+ // read a key value from a tight index; filepos_type must be at least as
// large as count_type
- filepos_type
+ filepos_type
tightread(std::istream& in, std::ios::pos_type stop)
{
// debug=true;
@@ -131,8 +131,8 @@ namespace ugdiss
short int bitshift = 7;
int pos = in.tellg();
#if DEBUG_TIGHTREAD
- if (debug)
- cerr << bitpattern(uint(in.peek())) << " " << in.peek()
+ if (debug)
+ cerr << bitpattern(uint(in.peek())) << " " << in.peek()
<< " pos=" << in.tellg() << "\n";
#endif
int buf = in.get();
@@ -141,24 +141,24 @@ namespace ugdiss
else
stop = std::min(size_t(stop),size_t(in.tellg())+in.rdbuf()->in_avail());
if (buf < 0)
- std::cerr << "number read: " << buf << " " << pos << " "
+ std::cerr << "number read: " << buf << " " << pos << " "
<< in.tellg() << std::endl;
assert (buf>=0);
-
+
if (buf >= 128) // continuation bit is 1
{
data = buf-128; // unset the bit
while (in.tellg() < stop && in.peek() >= 128)
{
#if DEBUG_TIGHTREAD
- if (debug)
+ if (debug)
cerr << bitpattern(uint(in.peek())) << " " << in.peek();
#endif
// cerr << bitpattern(size_t(in.peek())) << std::endl;
data += size_t(in.get()-128)<<bitshift;
bitshift += 7;
#if DEBUG_TIGHTREAD
- if (debug)
+ if (debug)
cerr << " " << data << " pos=" << in.tellg() << std::endl;
#endif
}
@@ -170,14 +170,14 @@ namespace ugdiss
{
// cerr << bitpattern(size_t(in.peek())) << std::endl;
#if DEBUG_TIGHTREAD
- if (debug)
+ if (debug)
cerr << bitpattern(uint(in.peek())) << " " << in.peek();
-
+
#endif
data += size_t(in.get())<<bitshift;
bitshift += 7;
#if DEBUG_TIGHTREAD
- if (debug)
+ if (debug)
cerr << " " << data << " pos=" << in.tellg() << "\n";
#endif
}
@@ -189,16 +189,16 @@ namespace ugdiss
#if DEBUG_TIGHTFIND
bool debug=true;
#endif
- bool
+ bool
tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop)
{
- in.seekg((start+stop)/2);
- // Jump approximately to the middle. Since we might land in the
- // middle of a number, we need to find the start of the next
+ in.seekg((start+stop)/2);
+ // Jump approximately to the middle. Since we might land in the
+ // middle of a number, we need to find the start of the next
// [index key/file offset] pair first. Bytes belonging to an index
- // key have the leftmost bit set to 0, bytes belonging to a file
+ // key have the leftmost bit set to 0, bytes belonging to a file
// offset have it set to 1
-
+
// if we landed in the middle of an index key, skip to the end of it
while (static_cast<filepos_type>(in.tellg()) < stop && in.get() < 128)
{
@@ -216,9 +216,9 @@ bool debug=true;
while (static_cast<filepos_type>(in.tellg()) < stop && in.peek() >= 128)
{
#if DEBUG_TIGHTFIND
- int r = in.get();
+ int r = in.get();
if (debug)
- std::cerr << in.tellg() << " skipped value byte " << r
+ std::cerr << in.tellg() << " skipped value byte " << r
<< " next is " << in.peek()
<< std::endl;
#else
@@ -227,9 +227,9 @@ bool debug=true;
}
return true;
}
-
- char const*
- tightfind_midpoint(char const* const start,
+
+ char const*
+ tightfind_midpoint(char const* const start,
char const* const stop)
{
char const* mp = start + (stop - start)/2;
@@ -238,46 +238,46 @@ bool debug=true;
return (*mp < 0) ? ++mp : mp;
}
- bool
- linear_search(std::istream& in, filepos_type start, filepos_type stop,
+ bool
+ linear_search(std::istream& in, filepos_type start, filepos_type stop,
id_type key, unsigned char& flags)
{ // performs a linear search in the range
in.seekg(start);
-
+
#if DEBUG_TIGHTFIND
if (debug) std::cerr << in.tellg() << " ";
#endif
-
- // ATTENTION! The bitshift operations below are important:
- // We use some of the bits in the key value to store additional
+
+ // ATTENTION! The bitshift operations below are important:
+ // We use some of the bits in the key value to store additional
// information about what and where node iformation is stored.
-
+
id_type foo;
- for(foo = tightread(in,stop);
- (foo>>FLAGBITS) < key;
- foo = tightread(in,stop))
+ for(foo = tightread(in,stop);
+ (foo>>FLAGBITS) < key;
+ foo = tightread(in,stop))
{
// skip the value associated with key /foo/
- while (static_cast<filepos_type>(in.tellg()) < stop
- && in.peek() >= 128) in.get();
-
+ while (static_cast<filepos_type>(in.tellg()) < stop
+ && in.peek() >= 128) in.get();
+
#if DEBUG_TIGHTFIND
- if (debug)
- std::cerr << (foo>>FLAGBITS) << " [" << key << "] "
+ if (debug)
+ std::cerr << (foo>>FLAGBITS) << " [" << key << "] "
<< in.tellg() << std::endl;
#endif
-
+
if (in.tellg() == std::ios::pos_type(stop))
return false; // not found
}
-
+
#if DEBUG_TIGHTFIND
- if (debug && (foo>>FLAGBITS)==key)
+ if (debug && (foo>>FLAGBITS)==key)
std::cerr << "found entry for " << key << std::endl;
- std::cerr << "current file position is " << in.tellg()
+ std::cerr << "current file position is " << in.tellg()
<< " (value read: " << key << std::endl;
#endif
-
+
assert(static_cast<filepos_type>(in.tellg()) < stop);
if ((foo>>FLAGBITS)==key)
{
@@ -288,51 +288,51 @@ bool debug=true;
else
return false;
}
-
+
bool
- tightfind(std::istream& in, filepos_type start, filepos_type stop,
+ tightfind(std::istream& in, filepos_type start, filepos_type stop,
id_type key, unsigned char& flags)
{
- // returns true if the value is found
+ // returns true if the value is found
#if DEBUG_TIGHTFIND
if (debug)
- std::cerr << "looking for " << key
+ std::cerr << "looking for " << key
<< " in range [" << start << ":" << stop << "]" << std::endl;
#endif
if (start==stop) return false;
assert(stop>start);
if ((start+1)==stop) return false; // list is empty
-
- unsigned int const granularity = sizeof(filepos_type)*5;
+
+ unsigned int const granularity = sizeof(filepos_type)*5;
// granularity: point where we should switch to linear search,
// because otherwise we might skip over the entry we are looking for
// because we land right in the middle of it.
-
+
if (stop > start + granularity)
- if (!tightfind_midpoint(in,start,stop))
+ if (!tightfind_midpoint(in,start,stop))
return false; // something went wrong (empty index)
-
+
if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop))
{ // If the search range is very short, tightfind_midpoint might skip the
// entry we are loking for. In this case, we can afford a linear
// search
return linear_search(in,start,stop,key,flags);
}
-
+
// perform binary search
filepos_type curpos = in.tellg();
id_type foo = tightread(in,stop);
id_type tmpid = foo>>FLAGBITS;
- if (tmpid == key)
+ if (tmpid == key)
{
- flags = foo%256;
+ flags = foo%256;
flags &= FLAGMASK;
#if DEBUG_TIGHTFIND
if (debug) std::cerr << "found entry for " << key << std::endl;
#endif
- return true; // done, found
+ return true; // done, found
}
- else if (tmpid > key)
+ else if (tmpid > key)
{ // look in the lower half
#if DEBUG_TIGHTFIND
if (debug) std::cerr << foo << " > " << key << std::endl;
@@ -343,7 +343,7 @@ bool debug=true;
{ // look in the upper half
while (static_cast<filepos_type>(in.tellg()) < stop
&& in.rdbuf()->in_avail() > 0 // is that still necessary???
- && in.peek() >= 128)
+ && in.peek() >= 128)
in.get(); // skip associated value
if (in.rdbuf()->in_avail() == 0 || in.tellg() == std::ios::pos_type(stop))
return false;
@@ -353,16 +353,16 @@ bool debug=true;
return tightfind(in,in.tellg(),stop,key,flags);
}
}
-
+
char const*
- tightfind(char const* const start,
+ tightfind(char const* const start,
char const* const stop,
- id_type key,
+ id_type key,
unsigned char& flags)
{
- // returns true if the value is found
-
+ // returns true if the value is found
+
if (start==stop) return NULL;
assert(stop>start);
if ((start+1)==stop) return NULL; // list is empty
@@ -374,11 +374,11 @@ bool debug=true;
id_type tmpId = foo>>FLAGBITS;
if (tmpId == key)
{
- flags = foo%256;
+ flags = foo%256;
flags &= FLAGMASK;
return after;
}
- else if (tmpId > key)
+ else if (tmpId > key)
{ // look in the lower half
return tightfind(start,p,key,flags);
}
@@ -389,14 +389,14 @@ bool debug=true;
return tightfind(after,stop,key,flags);
}
}
-
+
char const*
- tightfind_noflags(char const* const start,
+ tightfind_noflags(char const* const start,
char const* const stop,
id_type key)
{
- // returns true if the value is found
-
+ // returns true if the value is found
+
if (start==stop) return NULL;
assert(stop>start);
if ((start+1)==stop) return NULL; // list is empty
@@ -407,7 +407,7 @@ bool debug=true;
char const* after = tightread(p,stop,foo);
if (foo == key)
return after;
- else if (foo > key)
+ else if (foo > key)
{ // look in the lower half
return tightfind_noflags(start,p,key);
}
@@ -419,19 +419,19 @@ bool debug=true;
}
}
- bool
- linear_search_noflags(std::istream& in, filepos_type start,
+ bool
+ linear_search_noflags(std::istream& in, filepos_type start,
filepos_type stop, id_type key)
{ // performs a linear search in the range
- std::ios::pos_type mystop = stop;
+ std::ios::pos_type mystop = stop;
in.seekg(start);
id_type foo;
- for(foo = tightread(in,stop); foo < key; foo = tightread(in,stop))
+ for(foo = tightread(in,stop); foo < key; foo = tightread(in,stop))
{
// skip the value associated with key /foo/
- while (in.tellg() < mystop && in.peek() >= 128)
- in.get();
+ while (in.tellg() < mystop && in.peek() >= 128)
+ in.get();
if (in.tellg() == mystop)
return false; // not found
}
@@ -441,45 +441,45 @@ bool debug=true;
bool
- tightfind_noflags(std::istream& in, filepos_type start,
+ tightfind_noflags(std::istream& in, filepos_type start,
filepos_type stop, id_type key)
{
- // returns true if the value is found
+ // returns true if the value is found
if (start==stop) return false;
assert(stop>start);
if ((start+1)==stop) return false; // list is empty
-
+
// granularity: point where we should switch to linear search,
// because otherwise we might skip over the entry we are looking for
// because we land right in the middle of it.
- unsigned int const granularity = sizeof(filepos_type)*5;
+ unsigned int const granularity = sizeof(filepos_type)*5;
// UG: why 5? we should be able to get away with less!
-
+
if (stop > start + granularity)
- if (!tightfind_midpoint(in,start,stop))
+ if (!tightfind_midpoint(in,start,stop))
return false; // something went wrong (empty index)
-
+
// If the search range is very short, tightfind_midpoint might skip the
// entry we are loking for. In this case, we can afford a linear
// search
if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop))
return linear_search_noflags(in,start,stop,key);
-
+
// Otherwise, perform binary search
filepos_type curpos = in.tellg();
id_type foo = tightread(in,stop);
- if (foo == key)
- return true; // done, found
+ if (foo == key)
+ return true; // done, found
else if (foo > key) // search first half
return tightfind_noflags(in,start,curpos,key);
else // search second half
- {
- std::ios::pos_type mystop = stop;
+ {
+ std::ios::pos_type mystop = stop;
while (in.tellg() < mystop
&& in.rdbuf()->in_avail() > 0 // is that still necessary???
- && in.peek() >= 128)
+ && in.peek() >= 128)
in.get(); // skip associated value
if (in.rdbuf()->in_avail() == 0 || in.tellg() == mystop)
return false;
@@ -496,9 +496,9 @@ bool debug=true;
{
foo += 32768; // set first bit
while (data >= 32768) // = 2^15
- {
+ {
out.write(reinterpret_cast<char*>(&foo),2);
- data >>= 15;
+ data >>= 15;
foo = (data%32768)+32768;
}
}
@@ -507,7 +507,7 @@ bool debug=true;
while (data >= 32768) // = 2^15
{
out.write(reinterpret_cast<char*>(&foo),2);
- data >>= 15;
+ data >>= 15;
foo = data%32768;
}
}
@@ -515,8 +515,8 @@ bool debug=true;
}
char const*
- tightread8(char const* start,
- char const* stop,
+ tightread8(char const* start,
+ char const* stop,
uint64_t& dest)
{
static char bitmask=127;
@@ -570,8 +570,8 @@ bool debug=true;
}
char const*
- tightread4(char const* start,
- char const* stop,
+ tightread4(char const* start,
+ char const* stop,
uint32_t& dest)
{
static char bitmask=127;
@@ -605,8 +605,8 @@ bool debug=true;
}
char const*
- tightread2(char const* start,
- char const* stop,
+ tightread2(char const* start,
+ char const* stop,
uint16_t& dest)
{
static char bitmask=127;
diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.h b/moses/TranslationModel/UG/mm/tpt_tightindex.h
index 66594bc0a..967215aeb 100644
--- a/moses/TranslationModel/UG/mm/tpt_tightindex.h
+++ b/moses/TranslationModel/UG/mm/tpt_tightindex.h
@@ -28,46 +28,46 @@ extern bool debug;
namespace ugdiss
{
// void tightwritex(iostream& out, size_t data, bool flag);
- void
+ void
tightwrite(std::ostream& out, ::uint64_t data, bool flag);
- filepos_type
+ filepos_type
tightread(std::istream& in, std::ios::pos_type stop);
bool
- tightfind(std::istream& in,
- filepos_type start,
- filepos_type stop,
+ tightfind(std::istream& in,
+ filepos_type start,
+ filepos_type stop,
id_type key,
unsigned char& flags);
bool
- tightfind_noflags(std::istream& in,
- filepos_type start,
- filepos_type stop,
+ tightfind_noflags(std::istream& in,
+ filepos_type start,
+ filepos_type stop,
id_type key);
char const*
- tightfind(char const* const start,
+ tightfind(char const* const start,
char const* const stop,
- id_type key,
+ id_type key,
unsigned char& flags);
char const*
- tightfind_noflags(char const* const start,
+ tightfind_noflags(char const* const start,
char const* const stop,
id_type key);
- /** move read header in istream /in/ to the first entry after the midpoint of
- * file position range [start,stop) in in a 'tight' index
+ /** move read header in istream /in/ to the first entry after the midpoint of
+ * file position range [start,stop) in in a 'tight' index
* @param in the data input stream
* @param start start of the search range
* @param stop end of the search range
- * @return true if no errors occurred
- */
- bool
+ * @return true if no errors occurred
+ */
+ bool
tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop);
// the bitpattern functions below are for debugging
@@ -115,8 +115,8 @@ namespace ugdiss
#if 0
template<typename dtype>
- char const*
- tightread(char const* start,
+ char const*
+ tightread(char const* start,
char const* stop,
dtype& dest)
{
diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc
index c6704beac..5fc6a6acc 100644
--- a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc
+++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc
@@ -15,15 +15,15 @@ namespace ugdiss
{
TokenIndex::
- TokenIndex(string unkToken)
+ TokenIndex(string unkToken)
: ridx(0),unkLabel(unkToken),unkId(1),numTokens(0)
- {
+ {
lock.reset(new boost::mutex());
};
-
+
#if 0
TokenIndex::
- TokenIndex(string fname, string unkToken,bool dyna)
+ TokenIndex(string fname, string unkToken,bool dyna)
: ridx(0),unkLabel(unkToken)
{
this->open(fname,unkToken,dyna);
@@ -58,8 +58,8 @@ namespace ugdiss
if (!unkToken.empty())
{
Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
- unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
- ? bla->id
+ unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
+ ? bla->id
: numTokens);
}
this->dynamic=dyna;
@@ -69,7 +69,7 @@ namespace ugdiss
this->newWords.reset(new vector<string>());
}
}
-
+
void
TokenIndex::
close()
@@ -79,9 +79,9 @@ namespace ugdiss
TokenIndex::
CompFunc::
- CompFunc()
+ CompFunc()
{};
-
+
bool
TokenIndex::
CompFunc::
@@ -90,7 +90,7 @@ namespace ugdiss
return strcmp(base+A.offset,w) < 0;
};
- id_type
+ id_type
TokenIndex::
operator[](char const* p) const
{
@@ -101,7 +101,7 @@ namespace ugdiss
if (!dynamic) return unkId;
boost::lock_guard<boost::mutex> lk(*this->lock);
// stuff below is new as of 2011-01-30, for dynamic adding of unknown items
- // IMPORTANT: numTokens is not currently not changed, it is the number of
+ // IMPORTANT: numTokens is not currently not changed, it is the number of
// PRE-EXISING TOKENS, not including dynamically added Items
map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens);
pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem);
@@ -110,14 +110,14 @@ namespace ugdiss
return foo.first->second;
}
- id_type
+ id_type
TokenIndex::
operator[](string const& w) const
{
return (*this)[w.c_str()];
}
- vector<char const*>
+ vector<char const*>
TokenIndex::
reverseIndex() const
{
@@ -125,11 +125,11 @@ namespace ugdiss
// cout << "tokenindex has " << numToks << " tokens" << endl;
- vector<char const*> v(numToks,NULL);
+ vector<char const*> v(numToks,NULL);
// v.reserve(endIdx-startIdx);
for (Entry const* x = startIdx; x != endIdx; x++)
{
- if (x->id >= v.size())
+ if (x->id >= v.size())
v.resize(x->id+1);
v[x->id] = comp.base+x->offset;
}
@@ -141,12 +141,12 @@ namespace ugdiss
TokenIndex::
operator[](id_type id) const
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
- if (id < ridx.size())
+ if (id < ridx.size())
return ridx[id];
boost::lock_guard<boost::mutex> lk(*this->lock);
if (dynamic && id < ridx.size()+newWords->size())
@@ -156,26 +156,26 @@ namespace ugdiss
void
TokenIndex::
- iniReverseIndex()
+ iniReverseIndex()
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
}
-
+
char const* const
TokenIndex::
- operator[](id_type id)
+ operator[](id_type id)
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
- if (id < ridx.size())
+ if (id < ridx.size())
return ridx[id];
boost::lock_guard<boost::mutex> lk(*this->lock);
if (dynamic && id < ridx.size()+newWords->size())
@@ -183,11 +183,11 @@ namespace ugdiss
return unkLabel.c_str();
}
- string
+ string
TokenIndex::
- toString(vector<id_type> const& v)
+ toString(vector<id_type> const& v)
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
@@ -198,11 +198,11 @@ namespace ugdiss
return buf.str();
}
- string
+ string
TokenIndex::
toString(vector<id_type> const& v) const
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
@@ -213,11 +213,11 @@ namespace ugdiss
return buf.str();
}
- string
+ string
TokenIndex::
- toString(id_type const* start, id_type const* const stop)
+ toString(id_type const* start, id_type const* const stop)
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
@@ -230,11 +230,11 @@ namespace ugdiss
return buf.str();
}
- string
+ string
TokenIndex::
toString(id_type const* start, id_type const* const stop) const
{
- if (!ridx.size())
+ if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
@@ -266,7 +266,7 @@ namespace ugdiss
{
bool allgood = true; string w;
v.clear();
- for (istringstream buf(line); buf>>w;)
+ for (istringstream buf(line); buf>>w;)
{
v.push_back((*this)[w]);
allgood = allgood && v.back() > 1;
@@ -325,15 +325,15 @@ namespace ugdiss
}
void
- write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
+ write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
string const& ofile, string const& unkToken)
{
typedef pair<uint32_t,id_type> IndexEntry; // offset and id
// Write token strings to a buffer, keep track of offsets
- vector<IndexEntry> index(tok.size());
+ vector<IndexEntry> index(tok.size());
ostringstream data;
- id_type unkId = tok.size();
+ id_type unkId = tok.size();
for (size_t i = 0; i < tok.size(); i++)
{
if (tok[i].first == unkToken)
@@ -342,7 +342,7 @@ namespace ugdiss
index[i].second = tok[i].second; // respective ID
data<<tok[i].first<<char(0); // write string to buffer
}
-
+
// Now write the actual file
ofstream out(ofile.c_str());
uint32_t vsize = index.size(); // how many vocab items?
@@ -356,26 +356,26 @@ namespace ugdiss
out<<data.str();
}
- void
+ void
TokenIndex::
write(string fname)
{
typedef pair<string,uint32_t> Token; // token and id
- vector<Token> tok(totalVocabSize());
+ vector<Token> tok(totalVocabSize());
for (id_type i = 0; i < tok.size(); ++i)
tok[i] = Token((*this)[i],i);
sort(tok.begin(),tok.end());
write_tokenindex_to_disk(tok,fname,unkLabel);
}
-
- bool
+
+ bool
TokenIndex::
- isDynamic() const
+ isDynamic() const
{
return dynamic;
}
- bool
+ bool
TokenIndex::
setDynamic(bool on)
{
@@ -393,7 +393,7 @@ namespace ugdiss
}
return ret;
}
-
+
void
TokenIndex::
setUnkLabel(string unk)
diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.h b/moses/TranslationModel/UG/mm/tpt_tokenindex.h
index 3051f07a5..9f7c69b3e 100644
--- a/moses/TranslationModel/UG/mm/tpt_tokenindex.h
+++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.h
@@ -3,7 +3,7 @@
//
// - Vocab items should be stored in order of ids, so that we can determine their length
// by taking computing V[id+1] - V[id] instead of using strlen.
-//
+//
// (c) 2007,2008 Ulrich Germann
#ifndef __ugTokenIndex_hh
@@ -30,7 +30,7 @@ namespace ugdiss
/** Reverse index: maps from ID to char const* */
mutable vector<char const*> ridx;
/** Label for the UNK token */
- string unkLabel;
+ string unkLabel;
id_type unkId,numTokens;
/// New 2013-09-02: thread-safe
@@ -42,9 +42,9 @@ namespace ugdiss
boost::shared_ptr<vector<string> > newWords;
// The use of pointers to external items is a bit of a bad hack
// in terms of the semantic of TokenIndex const: since external items
- // are changed, the TokenIndex instance remains unchanged and const works,
- // even though in reality the underlying object on the coceptual level
- // *IS* changed. This means that dynamic TokenIndex instances are not
+ // are changed, the TokenIndex instance remains unchanged and const works,
+ // even though in reality the underlying object on the coceptual level
+ // *IS* changed. This means that dynamic TokenIndex instances are not
// thread-safe!
public:
@@ -53,7 +53,7 @@ namespace ugdiss
{
public:
uint32_t offset;
- id_type id;
+ id_type id;
};
/** Comparison function object used for Entry instances */
@@ -111,19 +111,19 @@ namespace ugdiss
void setUnkLabel(string unk);
};
- void
- write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
+ void
+ write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
string const& ofile, string const& unkToken);
/** for sorting words by frequency */
class compWords
{
string unk;
- public:
+ public:
compWords(string _unk) : unk(_unk) {};
-
+
bool
- operator()(pair<string,size_t> const& A,
+ operator()(pair<string,size_t> const& A,
pair<string,size_t> const& B) const
{
if (A.first == unk) return false;// do we still need this special treatment?
@@ -142,7 +142,7 @@ namespace ugdiss
typedef pair<string,uint32_t> Token; // token and id
- // first, sort the word list in decreasing order of frequency, so that we
+ // first, sort the word list in decreasing order of frequency, so that we
// can assign IDs in an encoding-efficient manner (high frequency. low ID)
vector<pair<string,size_t> > wcounts(M.size()); // for sorting by frequency
typedef typename MYMAP::const_iterator myIter;
@@ -156,16 +156,16 @@ namespace ugdiss
sort(wcounts.begin(),wcounts.end(),compFunc);
// Assign IDs ...
- vector<Token> tok(wcounts.size());
+ vector<Token> tok(wcounts.size());
for (size_t i = 0; i < wcounts.size(); i++)
tok[i] = Token(wcounts[i].first,i);
// and re-sort in alphabetical order
- sort(tok.begin(),tok.end());
+ sort(tok.begin(),tok.end());
write_tokenindex_to_disk(tok,ofile,unkToken);
}
template<typename Token>
- void
+ void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
{
istringstream buf(line); string w;
diff --git a/moses/TranslationModel/UG/mm/tpt_typedefs.h b/moses/TranslationModel/UG/mm/tpt_typedefs.h
index fea221d61..d2d2932de 100644
--- a/moses/TranslationModel/UG/mm/tpt_typedefs.h
+++ b/moses/TranslationModel/UG/mm/tpt_typedefs.h
@@ -12,4 +12,4 @@ namespace ugdiss
typedef uint64_t filepos_type;
typedef unsigned char uchar;
}
-#endif
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 7d1e4e901..809476aa9 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -8,265 +8,18 @@ using namespace ugdiss;
using namespace std;
namespace Moses
{
- namespace bitext
+ namespace bitext
{
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- ThreadSafeCounter pstats::active;
-#endif
-
- pstats::
- pstats()
- : raw_cnt (0)
- , sample_cnt (0)
- , good (0)
- , sum_pairs (0)
- , in_progress (0)
- {
- for (int i = 0; i <= Moses::LRModel::NONE; ++i)
- ofwd[i] = obwd[i] = 0;
- }
-
- pstats::
- ~pstats()
- {
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- // counter may not exist any more at destruction time, so try ... catch
- try { --active; } catch (...) {}
-#endif
- }
-
- void
- pstats::
- register_worker()
- {
- this->lock.lock();
- ++this->in_progress;
- this->lock.unlock();
- }
-
- void
- pstats::
- release()
- {
- this->lock.lock();
- if (this->in_progress-- == 1) // last one - >we're done
- this->ready.notify_all();
- this->lock.unlock();
- }
-
- bool
- pstats::
- add(::uint64_t pid, float const w,
- vector<uchar> const& a,
- uint32_t const cnt2,
- uint32_t fwd_o,
- uint32_t bwd_o)
- {
- boost::lock_guard<boost::mutex> guard(this->lock);
- jstats& entry = this->trg[pid];
- entry.add(w,a,cnt2,fwd_o,bwd_o);
- if (this->good < entry.rcnt())
- {
- UTIL_THROW(util::Exception, "more joint counts than good counts:"
- << entry.rcnt() << "/" << this->good << "!");
- }
- return true;
- }
-
- jstats::
- jstats()
- : my_rcnt(0), my_wcnt(0), my_cnt2(0)
- {
- for (int i = 0; i <= Moses::LRModel::NONE; ++i)
- ofwd[i] = obwd[i] = 0;
- my_aln.reserve(1);
- }
-
- jstats::
- jstats(jstats const& other)
- {
- my_rcnt = other.rcnt();
- my_wcnt = other.wcnt();
- my_aln = other.aln();
- for (int i = 0; i <= Moses::LRModel::NONE; i++)
- {
- ofwd[i] = other.ofwd[i];
- obwd[i] = other.obwd[i];
- }
- }
-
- uint32_t
- jstats::
- dcnt_fwd(PhraseOrientation const idx) const
- {
- assert(idx <= Moses::LRModel::NONE);
- return ofwd[idx];
- }
-
- uint32_t
- jstats::
- dcnt_bwd(PhraseOrientation const idx) const
- {
- assert(idx <= Moses::LRModel::NONE);
- return obwd[idx];
- }
-
- void
- jstats::
- add(float w, vector<uchar> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient)
- {
- boost::lock_guard<boost::mutex> lk(this->lock);
- my_rcnt += 1;
- my_wcnt += w;
- // my_cnt2 += cnt2; // could I really be that stupid? [UG]
- my_cnt2 = cnt2;
- if (a.size())
- {
- size_t i = 0;
- while (i < my_aln.size() && my_aln[i].second != a) ++i;
- if (i == my_aln.size())
- my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
- else
- my_aln[i].first++;
- if (my_aln[i].first > my_aln[i/2].first)
- push_heap(my_aln.begin(),my_aln.begin()+i+1);
- }
- ++ofwd[fwd_orient];
- ++obwd[bwd_orient];
- }
-
- uint32_t
- jstats::
- rcnt() const
- { return my_rcnt; }
-
float
- jstats::
- wcnt() const
- { return my_wcnt; }
-
- uint32_t
- jstats::
- cnt2() const
- { return my_cnt2; }
-
- vector<pair<size_t, vector<uchar> > > const&
- jstats::
- aln() const
- { return my_aln; }
-
- void
- jstats::
- invalidate()
- {
- if (my_wcnt > 0)
- my_wcnt *= -1;
- }
-
- void
- jstats::
- validate()
- {
- if (my_wcnt < 0)
- my_wcnt *= -1;
- }
-
- bool
- jstats::
- valid()
- {
- return my_wcnt >= 0;
- }
-
-
- float
lbop(size_t const tries, size_t const succ, float const confidence)
{
- return (confidence == 0
- ? float(succ)/tries
+ return (confidence == 0
+ ? float(succ)/tries
: (boost::math::binomial_distribution<>::
find_lower_bound_on_p(tries, succ, confidence)));
}
-
- template<>
- sptr<imBitext<L2R_Token<SimpleWordId> > >
- imBitext<L2R_Token<SimpleWordId> >::
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& aln) const
- {
- typedef L2R_Token<SimpleWordId> TKN;
- assert(s1.size() == s2.size() && s1.size() == aln.size());
-
-#ifndef NDEBUG
- size_t first_new_snt = this->T1 ? this->T1->size() : 0;
-#endif
-
- sptr<imBitext<TKN> > ret;
- {
- boost::lock_guard<boost::mutex> guard(this->lock);
- ret.reset(new imBitext<TKN>(*this));
- }
-
- // we add the sentences in separate threads (so it's faster)
- boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
- // thread1.join(); // for debugging
- boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
- BOOST_FOREACH(string const& a, aln)
- {
- istringstream ibuf(a);
- ostringstream obuf;
- uint32_t row,col; char c;
- while (ibuf >> row >> c >> col)
- {
- UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
- << "Error in alignment information:\n" << a);
- binwrite(obuf,row);
- binwrite(obuf,col);
- }
- // important: DO NOT replace the two lines below this comment by
- // char const* x = obuf.str().c_str(), as the memory x is pointing
- // to is freed immediately upon deconstruction of the string object.
- string foo = obuf.str();
- char const* x = foo.c_str();
- vector<char> v(x,x+foo.size());
- ret->myTx = append(ret->myTx, v);
- }
- thread1.join();
- thread2.join();
-
- ret->Tx = ret->myTx;
- ret->T1 = ret->myT1;
- ret->T2 = ret->myT2;
- ret->I1 = ret->myI1;
- ret->I2 = ret->myI2;
-
-#ifndef NDEBUG
- // sanity check
- for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
- {
- size_t slen1 = ret->T1->sntLen(i);
- size_t slen2 = ret->T2->sntLen(i);
- char const* p = ret->Tx->sntStart(i);
- char const* q = ret->Tx->sntEnd(i);
- size_t k;
- while (p < q)
- {
- p = binread(p,k);
- assert(p);
- assert(p < q);
- assert(k < slen1);
- p = binread(p,k);
- assert(p);
- assert(k < slen2);
- }
- }
-#endif
- return ret;
- }
// template<>
void
@@ -289,37 +42,37 @@ namespace Moses
else
index.reset(new imTSA<tkn>(track,NULL,NULL));
}
-
+
snt_adder<L2R_Token<SimpleWordId> >::
- snt_adder(vector<string> const& s, TokenIndex& v,
- sptr<imTtrack<L2R_Token<SimpleWordId> > >& t,
+ snt_adder(vector<string> const& s, TokenIndex& v,
+ sptr<imTtrack<L2R_Token<SimpleWordId> > >& t,
sptr<imTSA<L2R_Token<SimpleWordId> > >& i)
- : snt(s), V(v), track(t), index(i)
+ : snt(s), V(v), track(t), index(i)
{ }
- bool
+ bool
expand_phrase_pair
- (vector<vector<ushort> >& a1,
+ (vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
ushort const s2, // next word on in target side
ushort const L1, ushort const R1, // limits of previous phrase
ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
{
- if (a2[s2].size() == 0)
+ if (a2[s2].size() == 0)
{
cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
bitvector done1(a1.size());
bitvector done2(a2.size());
- vector <pair<ushort,ushort> > agenda;
+ vector <pair<ushort,ushort> > agenda;
// x.first: side (1 or 2)
// x.second: word position
agenda.reserve(a1.size() + a2.size());
agenda.push_back(pair<ushort,ushort>(2,s2));
e2 = s2;
s1 = e1 = a2[s2].front();
- if (s1 >= L1 && s1 < R1)
+ if (s1 >= L1 && s1 < R1)
{
cout << __FILE__ << ":" << __LINE__ << endl;
return false;
@@ -335,14 +88,14 @@ namespace Moses
done1.set(p);
BOOST_FOREACH(ushort i, a1[p])
{
- if (i < s2)
+ if (i < s2)
{
// cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
if (done2[i]) continue;
for (;e2 <= i;++e2)
- if (!done2[e2])
+ if (!done2[e2])
agenda.push_back(pair<ushort,ushort>(2,e2));
}
}
@@ -351,16 +104,16 @@ namespace Moses
done2.set(p);
BOOST_FOREACH(ushort i, a2[p])
{
- if ((e1 < L1 && i >= L1) ||
- (s1 >= R1 && i < R1) ||
+ if ((e1 < L1 && i >= L1) ||
+ (s1 >= R1 && i < R1) ||
(i >= L1 && i < R1))
{
- // cout << __FILE__ << ":" << __LINE__ << " "
- // << L1 << "-" << R1 << " " << i << " "
+ // cout << __FILE__ << ":" << __LINE__ << " "
+ // << L1 << "-" << R1 << " " << i << " "
// << s1 << "-" << e1<< endl;
return false;
}
-
+
if (e1 < i)
{
for (; e1 <= i; ++e1)
@@ -381,7 +134,7 @@ namespace Moses
return true;
}
- void
+ void
print_amatrix(vector<vector<ushort> > a1, uint32_t len2,
ushort b1, ushort e1, ushort b2, ushort e2)
{
@@ -410,5 +163,15 @@ namespace Moses
cout << string(90,'-') << endl;
}
+ void
+ write_bitvector(bitvector const& v, ostream& out)
+ {
+ for (size_t i = v.find_first(); i < v.size();)
+ {
+ out << i;
+ if ((i = v.find_next(i)) < v.size()) out << ",";
+ }
+ }
+
}
}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 20319eed4..ab5f2a24f 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -1,21 +1,19 @@
//-*- c++ -*-
-
-#ifndef __ug_bitext_h
-#define __ug_bitext_h
+#pragma once
// Implementations of word-aligned bitext.
// Written by Ulrich Germann
-//
+//
// mmBitext: static, memory-mapped bitext
// imBitext: dynamic, in-memory bitext
//
// things we can do to speed up things:
-// - set up threads at startup time that force the
+// - set up threads at startup time that force the
// data in to memory sequentially
//
-// - use multiple agendas for better load balancing and to avoid
+// - use multiple agendas for better load balancing and to avoid
// competition for locks
-//
+//
#define UG_BITEXT_TRACK_ACTIVE_THREADS 0
@@ -26,11 +24,12 @@
#include <iomanip>
#include <algorithm>
-#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
-#include <boost/thread.hpp>
#include <boost/random.hpp>
#include <boost/format.hpp>
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/math/distributions/binomial.hpp>
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
@@ -38,7 +37,10 @@
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
#include "moses/FF/LexicalReordering/LexicalReorderingState.h"
#include "moses/Util.h"
-#include "moses/StaticData.h"
+// #include "moses/StaticData.h"
+#include "moses/thread_safe_container.h"
+#include "moses/ContextScope.h"
+#include "moses/TranslationTask.h"
#include "util/exception.hh"
// #include "util/check.hh"
@@ -52,434 +54,66 @@
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
#include "ug_lexical_phrase_scorer2.h"
-#include "ug_phrasepair.h"
#include "ug_lru_cache.h"
#include "ug_lexical_reordering.h"
+#include "ug_sampling_bias.h"
+#include "ug_phrasepair.h"
#define PSTATS_CACHE_THRESHOLD 50
-using namespace ugdiss;
-using namespace std;
namespace Moses {
class Mmsapt;
namespace bitext
{
-
- template<typename TKN> class Bitext;
- template<typename TKN> class PhrasePair;
using namespace ugdiss;
- template<typename TKN> class Bitext;
-
- template<typename sid_t, typename off_t, typename len_t>
- void
- parse_pid(::uint64_t const pid, sid_t & sid,
- off_t & off, len_t& len)
- {
- static ::uint64_t two32 = ::uint64_t(1)<<32;
- static ::uint64_t two16 = ::uint64_t(1)<<16;
- len = pid%two16;
- off = (pid%two32)>>16;
- sid = pid>>32;
- }
-
- float
- lbop(size_t const tries, size_t const succ,
- float const confidence);
-
- // "joint" (i.e., phrase pair) statistics
- class
- jstats
- {
- boost::mutex lock;
- uint32_t my_rcnt; // unweighted count
- float my_wcnt; // weighted count
- uint32_t my_cnt2;
- vector<pair<size_t, vector<uchar> > > my_aln;
- uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1];
- public:
- jstats();
- jstats(jstats const& other);
- uint32_t rcnt() const;
- uint32_t cnt2() const; // raw target phrase occurrence count
- float wcnt() const;
-
- vector<pair<size_t, vector<uchar> > > const & aln() const;
- void add(float w, vector<uchar> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient);
- void invalidate();
- void validate();
- bool valid();
- uint32_t dcnt_fwd(PhraseOrientation const idx) const;
- uint32_t dcnt_bwd(PhraseOrientation const idx) const;
- };
-
- struct
- pstats
- {
-
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- static ThreadSafeCounter active;
-#endif
- boost::mutex lock; // for parallel gathering of stats
- boost::condition_variable ready; /* consumers can wait for this
- * data structure to be ready. */
-
- size_t raw_cnt; // (approximate) raw occurrence count
- size_t sample_cnt; // number of instances selected during sampling
- size_t good; // number of selected instances with valid word alignments
- size_t sum_pairs;
- size_t in_progress; // keeps track of how many threads are currently working on this
-
- // size_t Moses::LRModel::ReorderingType
- uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1];
-
- // typedef typename boost::unordered_map<typename ::uint64_t, jstats> trg_map_t;
- typedef std::map<typename ::uint64_t, jstats> trg_map_t;
- trg_map_t trg;
- pstats();
- ~pstats();
- void release();
- void register_worker();
- size_t count_workers() { return in_progress; }
-
- bool
- add(::uint64_t const pid,
- float const w,
- vector<uchar> const& a,
- uint32_t const cnt2,
- uint32_t fwd_o, uint32_t bwd_o);
- };
-
-
- template<typename Token>
- string
- toString(TokenIndex const& V, Token const* x, size_t const len)
- {
- if (!len) return "";
- UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
- ostringstream buf;
- buf << V[x->id()];
- size_t i = 1;
- for (x = x->next(); x && i < len; ++i, x = x->next())
- buf << " " << V[x->id()];
- UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
- return buf.str();
- }
+ float lbop(size_t const tries, size_t const succ, float const confidence);
+ void write_bitvector(bitvector const& v, ostream& out);
- template<typename Token>
- class
- PhrasePair
+ struct
+ ContextForQuery
{
- public:
- class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
- Token const* start1;
- Token const* start2;
- uint32_t len1;
- uint32_t len2;
- ::uint64_t p1, p2;
- uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
- vector<float> fvals;
- float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs?
- float dbwd[Moses::LRModel::NONE+1]; // distortion counts
- vector<uchar> aln;
- float score;
- bool inverse;
- PhrasePair() { };
- PhrasePair(PhrasePair const& o);
-
- PhrasePair const& operator+=(PhrasePair const& other);
-
- bool operator<(PhrasePair const& other) const;
- bool operator>(PhrasePair const& other) const;
- bool operator<=(PhrasePair const& other) const;
- bool operator>=(PhrasePair const& other) const;
-
- void init();
- void init(::uint64_t const pid1, bool is_inverse,
- Token const* x, uint32_t const len,
- pstats const* ps = NULL, size_t const numfeats=0);
-
- // void init(::uint64_t const pid1, pstats const& ps, size_t const numfeats);
- // void init(::uint64_t const pid1, pstats const& ps1, pstats const& ps2,
- // size_t const numfeats);
-
- // PhrasePair const&
- // update(::uint64_t const pid2, size_t r2 = 0);
-
- PhrasePair const&
- update(::uint64_t const pid2, Token const* x,
- uint32_t const len, jstats const& js);
-
- // PhrasePair const&
- // update(::uint64_t const pid2, jstats const& js1, jstats const& js2);
-
- // PhrasePair const&
- // update(::uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
- // float
- // eval(vector<float> const& w);
-
- class SortByTargetIdSeq
- {
- public:
- int cmp(PhrasePair const& a, PhrasePair const& b) const;
- bool operator()(PhrasePair const& a, PhrasePair const& b) const;
- };
+ // needs to be made thread-safe
+ // ttasksptr const m_ttask;
+ // size_t max_samples;
+ boost::shared_mutex lock;
+ sptr<SamplingBias> bias;
+ sptr<pstats::cache_t> cache1, cache2;
+ ostream* bias_log;
+ ContextForQuery() : bias_log(NULL) { }
};
- template<typename Token>
- void
- PhrasePair<Token>::
- init(::uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len,
- pstats const* ps, size_t const numfeats)
- {
- inverse = is_inverse;
- start1 = x; len1 = len;
- p1 = pid1;
- p2 = 0;
- if (ps)
- {
- raw1 = ps->raw_cnt;
- sample1 = ps->sample_cnt;
- good1 = ps->good;
- }
- else raw1 = sample1 = good1 = 0;
- joint = 0;
- good2 = 0;
- sample2 = 0;
- raw2 = 0;
- fvals.resize(numfeats);
- }
-
- template<typename Token>
- PhrasePair<Token> const&
- PhrasePair<Token>::
- update(::uint64_t const pid2,
- Token const* x, uint32_t const len, jstats const& js)
- {
- p2 = pid2;
- start2 = x; len2 = len;
- raw2 = js.cnt2();
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- float total_fwd = 0, total_bwd = 0;
- for (int i = 0; i <= Moses::LRModel::NONE; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- total_fwd += js.dcnt_fwd(po)+1;
- total_bwd += js.dcnt_bwd(po)+1;
- }
-
- // should we do that here or leave the raw counts?
- for (int i = 0; i <= Moses::LRModel::NONE; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
- dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
- }
-
- return *this;
- }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator<(PhrasePair const& other) const
- { return this->score < other.score; }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator>(PhrasePair const& other) const
- { return this->score > other.score; }
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator<=(PhrasePair const& other) const
- { return this->score <= other.score; }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator>=(PhrasePair const& other) const
- { return this->score >= other.score; }
-
- template<typename Token>
- PhrasePair<Token> const&
- PhrasePair<Token>::
- operator+=(PhrasePair const& o)
- {
- raw1 += o.raw1;
- raw2 += o.raw2;
- sample1 += o.sample1;
- sample2 += o.sample2;
- good1 += o.good1;
- good2 += o.good2;
- joint += o.joint;
- return *this;
- }
-
- template<typename Token>
- PhrasePair<Token>::
- PhrasePair(PhrasePair<Token> const& o)
- : start1(o.start1)
- , start2(o.start2)
- , len1(o.len1)
- , len2(o.len2)
- , p1(o.p1)
- , p2(o.p2)
- , raw1(o.raw1)
- , raw2(o.raw2)
- , sample1(o.sample1)
- , sample2(o.sample2)
- , good1(o.good1)
- , good2(o.good2)
- , joint(o.joint)
- , fvals(o.fvals)
- , aln(o.aln)
- , score(o.score)
- , inverse(o.inverse)
- {
- for (int i = 0; i <= Moses::LRModel::NONE; ++i)
- {
- dfwd[i] = o.dfwd[i];
- dbwd[i] = o.dbwd[i];
- }
- }
-
- template<typename Token>
- int
- PhrasePair<Token>::
- SortByTargetIdSeq::
- cmp(PhrasePair const& a, PhrasePair const& b) const
- {
- size_t i = 0;
- Token const* x = a.start2;
- Token const* y = b.start2;
- while (i < a.len2 && i < b.len2 && x->id() == y->id())
- {
- x = x->next();
- y = y->next();
- ++i;
- }
- if (i == a.len2 && i == b.len2) return 0;
- if (i == a.len2) return -1;
- if (i == b.len2) return 1;
- return x->id() < y->id() ? -1 : 1;
- }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- SortByTargetIdSeq::
- operator()(PhrasePair const& a, PhrasePair const& b) const
- {
- return this->cmp(a,b) < 0;
- }
-
- template<typename Token>
- void
- PhrasePair<Token>::
- init()
- {
- inverse = false;
- len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
- start1 = start2 = NULL;
- p1 = p2 = 0;
- }
-
- class
- SamplingBias
- {
- public:
- virtual float operator[](size_t const ID) const = 0;
- virtual size_t size() const = 0;
- };
-
- class
- DocumentBias : public SamplingBias
+ template<typename TKN>
+ class Bitext
{
- sptr<vector<id_type> const> const m_sid2docid;
- vector<float> m_bias;
-
public:
+ typedef TKN Token;
+ typedef typename TSA<Token>::tree_iterator iter;
+ typedef typename std::vector<PhrasePair<Token> > vec_ppair;
+ typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
+ typedef TSA<Token> tsa;
+ friend class Moses::Mmsapt;
+ protected:
+ mutable boost::shared_mutex m_lock; // for thread-safe operation
- DocumentBias(sptr<vector<id_type> const> const& sid2doc,
- map<string,id_type> const& docname2docid,
- map<string,float> const& biasmap)
- : m_sid2docid(sid2doc)
- {
- m_bias.assign(docname2docid.size(),0);
- map<string, id_type>::const_iterator m;
- typedef pair<string,float> item;
- BOOST_FOREACH(item const& x, biasmap)
- {
- m = docname2docid.find(x.first);
- UTIL_THROW_IF2(m == docname2docid.end(),
- "Do not know document '" << x.first << "'");
- m_bias[m->second] = x.second;
- }
-
- }
-
- float
- operator[](size_t const idx) const
- {
- UTIL_THROW_IF2(idx >= m_sid2docid->size(), "Out of bounds");
- return m_bias[(*m_sid2docid)[idx]];
- }
- size_t size() const { return m_sid2docid->size(); }
- };
+ class agenda; // for parallel sampling see ug_bitext_agenda.h
+ mutable sptr<agenda> ag;
+ size_t m_num_workers; // number of workers available to the agenda
- class
- SentenceBias : public SamplingBias
- {
- vector<float> m_bias;
- public:
- SentenceBias(vector<float> const& bias) : m_bias(bias) { }
- SentenceBias(size_t const s) : m_bias(s) { }
+ size_t m_default_sample_size;
+ size_t m_pstats_cache_threshold; // threshold for caching sampling results
+ sptr<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results
- float& operator[](size_t const idx)
- {
- UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds");
- return m_bias[idx];
- }
+ vector<string> m_docname;
+ map<string,id_type> m_docname2docid; // maps from doc names to ids
+ sptr<std::vector<id_type> > m_sid2docid; // maps from sentences to docs (ids)
- float operator[](size_t const idx) const
- {
- UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds");
- return m_bias[idx];
- }
- size_t size() const { return m_bias.size(); }
-
- };
-
+ mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
+ // caches for unbiased sampling; biased sampling uses the caches that
+ // are stored locally on the translation task
- template<typename TKN>
- class Bitext
- {
- friend class Moses::Mmsapt;
- protected:
- mutable boost::mutex lock;
- mutable boost::mutex cache_lock;
public:
- typedef TKN Token;
- typedef typename TSA<Token>::tree_iterator iter;
-
- class agenda;
- // stores the list of unfinished jobs;
- // maintains a pool of workers and assigns the jobs to them
-
- // to be done: work with multiple agendas for faster lookup
- // (multiplex jobs); not sure if an agenda having more than
- // four or so workers is efficient, because workers get into
- // each other's way.
- mutable sptr<agenda> ag;
-
sptr<Ttrack<char> > Tx; // word alignments
sptr<Ttrack<Token> > T1; // token track
sptr<Ttrack<Token> > T2; // token track
@@ -488,87 +122,92 @@ namespace Moses {
sptr<TSA<Token> > I1; // indices
sptr<TSA<Token> > I2; // indices
- map<string,id_type> m_docname2docid;
- sptr<vector<id_type> > m_sid2docid;
-
/// given the source phrase sid[start:stop]
- // find the possible start (s1 .. s2) and end (e1 .. e2)
+ // find the possible start (s1 .. s2) and end (e1 .. e2)
// points of the target phrase; if non-NULL, store word
- // alignments in *core_alignment. If /flip/, source phrase is
+ // alignments in *core_alignment. If /flip/, source phrase is
// L2.
- bool
- find_trg_phr_bounds
- (size_t const sid, size_t const start, size_t const stop,
- size_t & s1, size_t & s2, size_t & e1, size_t & e2,
- int& po_fwd, int& po_bwd,
- vector<uchar> * core_alignment,
- bitvector* full_alignment,
- bool const flip) const;
-
-#if 1
- typedef boost::unordered_map<typename ::uint64_t,sptr<pstats> > pcache_t;
-#else
- typedef map<typename ::uint64_t,sptr<pstats> > pcache_t;
-#endif
- mutable pcache_t cache1,cache2;
- protected:
- typedef typename
- lru_cache::LRU_Cache<typename ::uint64_t, vector<PhrasePair<Token> > >
- pplist_cache_t;
+ bool find_trg_phr_bounds
+ ( size_t const sid, // sentence to investigate
+ size_t const start, // start of source phrase
+ size_t const stop, // last position of source phrase
+ size_t & s1, size_t & s2, // beginning and end of target start
+ size_t & e1, size_t & e2, // beginning and end of target end
+ int& po_fwd, int& po_bwd, // phrase orientations
+ std::vector<uchar> * core_alignment, // stores the core alignment
+ bitvector* full_alignment, // stores full word alignment for this sent.
+ bool const flip) const; // flip source and target (reverse lookup)
+
+ // prep2 launches sampling and returns immediately.
+ // lookup (below) waits for the job to finish before it returns
+ sptr<pstats>
+ prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
- size_t default_sample_size;
- size_t num_workers;
- size_t m_pstats_cache_threshold;
- mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
- private:
- sptr<pstats>
- prep2(iter const& phrase, size_t const max_sample,
- SamplingBias const* const bias) const;
public:
- Bitext(size_t const max_sample =1000,
- size_t const xnum_workers =16);
+ Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);
- Bitext(Ttrack<Token>* const t1,
- Ttrack<Token>* const t2,
+ Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
Ttrack<char>* const tx,
- TokenIndex* const v1,
- TokenIndex* const v2,
- TSA<Token>* const i1,
- TSA<Token>* const i2,
+ TokenIndex* const v1, TokenIndex* const v2,
+ TSA<Token>* const i1, TSA<Token>* const i2,
size_t const max_sample=1000,
size_t const xnum_workers=16);
-
- virtual void open(string const base, string const L1, string const L2) = 0;
-
- // sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
- sptr<pstats> lookup(iter const& phrase, SamplingBias const* const bias=NULL) const;
- sptr<pstats> lookup(iter const& phrase, size_t const max_sample,
- SamplingBias const* const bias) const;
- void
- lookup(vector<Token> const& snt, TSA<Token>& idx,
- vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
- vector<vector<typename ::uint64_t> >* pidmap = NULL,
- typename PhrasePair<Token>::Scorer* scorer=NULL,
- SamplingBias const* const bias=NULL,
- bool multithread=true) const;
+ virtual void
+ open(string const base, string const L1, string const L2) = 0;
+
+ sptr<pstats>
+ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
- void prep(iter const& phrase, SamplingBias const* const bias) const;
+ void prep(ttasksptr const& ttask, iter const& phrase) const;
void setDefaultSampleSize(size_t const max_samples);
size_t getDefaultSampleSize() const;
- string toString(::uint64_t pid, int isL2) const;
+ string toString(uint64_t pid, int isL2) const;
virtual size_t revision() const { return 0; }
-
- sptr<DocumentBias>
- setupDocumentBias(map<string,float> const& bias) const;
-
+
sptr<SentenceBias>
loadSentenceBias(string const& fname) const;
+
+ sptr<DocumentBias>
+ SetupDocumentBias(string const& bserver, string const& text, ostream* log) const;
+
+
+ void
+ mark_match(Token const* start, Token const* end, iter const& m,
+ bitvector& check) const;
+ void
+ write_yawat_alignment
+ ( id_type const sid, iter const* m1, iter const* m2, ostream& out ) const;
+#if 0
+ // needs to be adapted to the new API
+ void
+ lookup(std::vector<Token> const& snt, TSA<Token>& idx,
+ std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest,
+ std::vector<std::vector<uint64_t> >* pidmap = NULL,
+ typename PhrasePair<Token>::Scorer* scorer=NULL,
+ sptr<SamplingBias const> const bias,
+ bool multithread=true) const;
+#endif
+ string docname(id_type const sid) const;
+
};
+#include "ug_bitext_agenda.h"
+
+ template<typename Token>
+ string
+ Bitext<Token>::
+ docname(id_type const sid) const
+ {
+ if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
+ return m_docname[(*m_sid2docid)[sid]];
+ else
+ return "";
+ }
+
template<typename Token>
sptr<SentenceBias>
Bitext<Token>::
@@ -585,820 +224,106 @@ namespace Moses {
}
template<typename Token>
- sptr<DocumentBias>
- Bitext<Token>::
- setupDocumentBias(map<string,float> const& bias) const
- {
- sptr<DocumentBias> ret(new DocumentBias(m_sid2docid, m_docname2docid,bias));
- return ret;
- }
-
-
-
- template<typename Token>
string
Bitext<Token>::
- toString(::uint64_t pid, int isL2) const
+ toString(uint64_t pid, int isL2) const
{
ostringstream buf;
uint32_t sid,off,len; parse_pid(pid,sid,off,len);
Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off;
Token const* x = t + len;
TokenIndex const& V = isL2 ? *V2 : *V1;
- while (t < x)
+ while (t < x)
{
buf << V[t->id()];
if (++t < x) buf << " ";
}
return buf.str();
}
-
-
template<typename Token>
- size_t
+ size_t
Bitext<Token>::
- getDefaultSampleSize() const
- {
- return default_sample_size;
+ getDefaultSampleSize() const
+ {
+ return m_default_sample_size;
}
template<typename Token>
- void
+ void
Bitext<Token>::
setDefaultSampleSize(size_t const max_samples)
- {
- boost::lock_guard<boost::mutex> guard(this->lock);
- if (max_samples != default_sample_size)
+ {
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
+ if (max_samples != m_default_sample_size)
{
- cache1.clear();
- cache2.clear();
- default_sample_size = max_samples;
+ m_cache1.reset(new pstats::cache_t);
+ m_cache2.reset(new pstats::cache_t);
+ m_default_sample_size = max_samples;
}
}
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample, size_t const xnum_workers)
- : default_sample_size(max_sample)
- , num_workers(xnum_workers)
+ : m_num_workers(xnum_workers)
+ , m_default_sample_size(max_sample)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
+ , m_cache1(new pstats::cache_t)
+ , m_cache2(new pstats::cache_t)
{ }
template<typename Token>
Bitext<Token>::
- Bitext(Ttrack<Token>* const t1,
- Ttrack<Token>* const t2,
+ Bitext(Ttrack<Token>* const t1,
+ Ttrack<Token>* const t2,
Ttrack<char>* const tx,
- TokenIndex* const v1,
+ TokenIndex* const v1,
TokenIndex* const v2,
- TSA<Token>* const i1,
+ TSA<Token>* const i1,
TSA<Token>* const i2,
size_t const max_sample,
size_t const xnum_workers)
- : Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
- , default_sample_size(max_sample)
- , num_workers(xnum_workers)
+ : m_num_workers(xnum_workers)
+ , m_default_sample_size(max_sample)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
+ , m_cache1(new pstats::cache_t)
+ , m_cache2(new pstats::cache_t)
+ , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
{ }
- // agenda is a pool of jobs
- template<typename Token>
- class
- Bitext<Token>::
- agenda
- {
- boost::mutex lock;
- class job
- {
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- static ThreadSafeCounter active;
-#endif
- boost::mutex lock;
- friend class agenda;
- boost::taus88 rnd; // every job has its own pseudo random generator
- double rnddenom; // denominator for scaling random sampling
- size_t min_diverse; // minimum number of distinct translations
- public:
- size_t workers; // how many workers are working on this job?
- sptr<TSA<Token> const> root; // root of the underlying suffix array
- char const* next; // next position to read from
- char const* stop; // end of index range
- size_t max_samples; // how many samples to extract at most
- size_t ctr; /* # of phrase occurrences considered so far
- * # of samples chosen is stored in stats->good
- */
- size_t len; // phrase length
- bool fwd; // if true, source phrase is L1
- sptr<pstats> stats; // stores statistics collected during sampling
- SamplingBias const* m_bias; // sentence-level bias for sampling
- float bias_total;
- bool step(::uint64_t & sid, ::uint64_t & offset); // select another occurrence
- bool done() const;
- job(typename TSA<Token>::tree_iterator const& m,
- sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
- SamplingBias const* const bias);
- ~job();
- };
- public:
- class
- worker
- {
- agenda& ag;
- public:
- worker(agenda& a) : ag(a) {}
- void operator()();
- };
- private:
- list<sptr<job> > joblist;
- vector<sptr<boost::thread> > workers;
- bool shutdown;
- size_t doomed;
- public:
- Bitext<Token> const& bt;
- agenda(Bitext<Token> const& bitext);
- ~agenda();
- void add_workers(int n);
-
- sptr<pstats>
- add_job(typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples,
- SamplingBias const* const bias);
-
- sptr<job> get_job();
- };
-
- template<typename Token>
- bool
- Bitext<Token>::
- agenda::
- job::
- step(::uint64_t & sid, ::uint64_t & offset)
- {
- boost::lock_guard<boost::mutex> jguard(lock);
- bool ret = (max_samples == 0) && (next < stop);
- if (ret)
- {
- next = root->readSid(next,stop,sid);
- next = root->readOffset(next,stop,offset);
- boost::lock_guard<boost::mutex> sguard(stats->lock);
- if (stats->raw_cnt == ctr) ++stats->raw_cnt;
- if (m_bias && (*m_bias)[sid] == 0)
- return false;
- stats->sample_cnt++;
- }
- else
- {
- while (next < stop && (stats->good < max_samples ||
- stats->trg.size() < min_diverse))
- {
- next = root->readSid(next,stop,sid);
- next = root->readOffset(next,stop,offset);
- { // brackets required for lock scoping;
- // see sguard immediately below
- boost::lock_guard<boost::mutex> sguard(stats->lock);
- if (stats->raw_cnt == ctr) ++stats->raw_cnt;
- size_t scalefac = (stats->raw_cnt - ctr++);
- size_t rnum = scalefac * (rnd()/(rnd.max()+1.));
- size_t th = (bias_total
- ? ((*m_bias)[sid]/bias_total * m_bias->size()
- * max_samples)
- : max_samples);
-#if 0
- cerr << rnum << "/" << scalefac << " vs. "
- << max_samples - stats->good << " ("
- << max_samples << " - " << stats->good << ")"
- << " th=" << th;
- if (m_bias)
- cerr << " with bias " << (*m_bias)[sid]
- << " => " << (*m_bias)[sid] * m_bias->size();
- else cerr << " without bias";
- cerr << endl;
-#endif
- if (rnum + stats->good < th)
- {
- stats->sample_cnt++;
- ret = true;
- break;
- }
- }
- }
- }
-
- // boost::lock_guard<boost::mutex> sguard(stats->lock);
- // abuse of lock for clean output to cerr
- // cerr << stats->sample_cnt++;
- return ret;
- }
-
- template<typename Token>
- void
- Bitext<Token>::
- agenda::
- add_workers(int n)
- {
- static boost::posix_time::time_duration nodelay(0,0,0,0);
- boost::lock_guard<boost::mutex> guard(this->lock);
-
- int target = max(1, int(n + workers.size() - this->doomed));
- // house keeping: remove all workers that have finished
- for (size_t i = 0; i < workers.size(); )
- {
- if (workers[i]->timed_join(nodelay))
- {
- if (i + 1 < workers.size())
- workers[i].swap(workers.back());
- workers.pop_back();
- }
- else ++i;
- }
- // cerr << workers.size() << "/" << target << " active" << endl;
- if (int(workers.size()) > target)
- this->doomed = workers.size() - target;
- else
- while (int(workers.size()) < target)
- {
- sptr<boost::thread> w(new boost::thread(worker(*this)));
- workers.push_back(w);
- }
- }
-
- template<typename Token>
- void
- Bitext<Token>::
- agenda::
- worker::
- operator()()
- {
- // things to do:
- // - have each worker maintain their own pstats object and merge results at the end;
- // - ensure the minimum size of samples considered by a non-locked counter that is only
- // ever incremented -- who cares if we look at more samples than required, as long
- // as we look at at least the minimum required
- // This way, we can reduce the number of lock / unlock operations we need to do during
- // sampling.
- size_t s1=0, s2=0, e1=0, e2=0;
- ::uint64_t sid=0, offset=0; // of the source phrase
- while(sptr<job> j = ag.get_job())
- {
- j->stats->register_worker();
- vector<uchar> aln;
- bitvector full_alignment(100*100);
- while (j->step(sid,offset))
- {
- aln.clear();
- int po_fwd=Moses::LRModel::NONE,po_bwd=Moses::LRModel::NONE;
- if (j->fwd)
- {
- if (!ag.bt.find_trg_phr_bounds
- (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
- &aln,&full_alignment,false))
- continue;
- }
- else if (!ag.bt.find_trg_phr_bounds
- (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
- &aln,NULL,true)) // NULL,NULL,true))
- continue;
- j->stats->lock.lock();
- j->stats->good += 1;
- j->stats->sum_pairs += (s2-s1+1)*(e2-e1+1);
- ++j->stats->ofwd[po_fwd];
- ++j->stats->obwd[po_bwd];
- j->stats->lock.unlock();
- // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
- for (size_t k = 1; k < aln.size(); k += 2)
- aln[k] += s2 - s1;
- Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
- float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
-
- vector<typename ::uint64_t> seen;
- seen.reserve(100);
- // It is possible that the phrase extraction extracts the same
- // phrase twice, e.g., when word a co-occurs with sequence b b b
- // but is aligned only to the middle word. We can only count
- // each phrase pair once per source phrase occurrence, or else
- // run the risk of having more joint counts than marginal
- // counts.
-
- for (size_t s = s1; s <= s2; ++s)
- {
- sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s);
- if (!b || b->size() < e1 -s)
- UTIL_THROW(util::Exception, "target phrase not found");
- // assert(b);
- for (size_t i = e1; i <= e2; ++i)
- {
- ::uint64_t tpid = b->getPid();
- size_t s = 0;
- while (s < seen.size() && seen[s] != tpid) ++s;
- if (s < seen.size())
- {
-#if 0
- size_t sid, off, len;
- parse_pid(tpid,sid,off,len);
- cerr << "HA, gotcha! " << sid << ":" << off << " at " << HERE << endl;
- for (size_t z = 0; z < len; ++z)
- {
- id_type tid = ag.bt.T2->sntStart(sid)[off+z].id();
- cerr << (*ag.bt.V2)[tid] << " ";
- }
- cerr << endl;
-#endif
- continue;
- }
- seen.push_back(tpid);
- if (! j->stats->add(tpid,sample_weight,aln,
- b->approxOccurrenceCount(),
- po_fwd,po_bwd))
- {
- cerr << "FATAL ERROR AT " << __FILE__
- << ":" << __LINE__ << endl;
- assert(0);
- ostringstream msg;
- for (size_t z = 0; z < j->len; ++z)
- {
- id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
- cerr << (*ag.bt.V1)[tid] << " ";
- }
- cerr << endl;
- for (size_t z = s; z <= i; ++z)
- cerr << (*ag.bt.V2)[(o+z)->id()] << " ";
- cerr << endl;
- assert(0);
- UTIL_THROW(util::Exception,"Error in sampling.");
- }
- if (i < e2)
- {
-#ifndef NDEBUG
- bool ok = b->extend(o[i].id());
- assert(ok);
-#else
- b->extend(o[i].id());
- // cerr << "boo" << endl;
-#endif
- }
- }
- // if (j->fwd && s < s2)
- // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
- if (s < s2)
- for (size_t k = 1; k < aln.size(); k += 2)
- --aln[k];
- }
- // j->stats->lock.unlock();
- }
- j->stats->release();
- }
- }
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- job::
- ~job()
- {
- if (stats) stats.reset();
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- try { --active; } catch (...) {}
-#endif
- // counter may not exist any more at destruction time
- }
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- job::
- job(typename TSA<Token>::tree_iterator const& m,
- sptr<TSA<Token> > const& r, size_t maxsmpl,
- bool isfwd, SamplingBias const* const sntbias)
- : rnd(0)
- , rnddenom(rnd.max() + 1.)
- , min_diverse(10)
- , workers(0)
- , root(r)
- , next(m.lower_bound(-1))
- , stop(m.upper_bound(-1))
- , max_samples(maxsmpl)
- , ctr(0)
- , len(m.size())
- , fwd(isfwd)
- , m_bias(sntbias)
- {
- stats.reset(new pstats());
- stats->raw_cnt = m.approxOccurrenceCount();
- bias_total = 0;
- // we need to renormalize on the fly, as the summ of all sentence probs over
- // all candidates (not all sentences in the corpus) needs to add to 1.
- // Profiling question: how much does that cost us?
- if (m_bias)
- {
- for (char const* x = m.lower_bound(-1); x < stop;)
- {
- uint32_t sid; ushort offset;
- next = root->readSid(next,stop,sid);
- next = root->readOffset(next,stop,offset);
- bias_total += (*m_bias)[sid];
- }
- }
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- ++active;
- // if (active%5 == 0)
- // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
-#endif
- }
-
- template<typename Token>
- sptr<pstats>
- Bitext<Token>::
- agenda::
- add_job(typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, SamplingBias const* const bias)
- {
- boost::unique_lock<boost::mutex> lk(this->lock);
- static boost::posix_time::time_duration nodelay(0,0,0,0);
- bool fwd = phrase.root == bt.I1.get();
- sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias));
- j->stats->register_worker();
-
- joblist.push_back(j);
- if (joblist.size() == 1)
- {
- size_t i = 0;
- while (i < workers.size())
- {
- if (workers[i]->timed_join(nodelay))
- {
- if (doomed)
- {
- if (i+1 < workers.size())
- workers[i].swap(workers.back());
- workers.pop_back();
- --doomed;
- }
- else
- workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this)));
- }
- else ++i;
- }
- }
- return j->stats;
- }
-
- template<typename Token>
- sptr<typename Bitext<Token>::agenda::job>
- Bitext<Token>::
- agenda::
- get_job()
- {
- // cerr << workers.size() << " workers on record" << endl;
- sptr<job> ret;
- if (this->shutdown) return ret;
- boost::unique_lock<boost::mutex> lock(this->lock);
- if (this->doomed)
- {
- --this->doomed;
- return ret;
- }
- typename list<sptr<job> >::iterator j = joblist.begin();
- while (j != joblist.end())
- {
- if ((*j)->done())
- {
- (*j)->stats->release();
- joblist.erase(j++);
- }
- else if ((*j)->workers >= 4)
- {
- ++j;
- }
- else break;
- }
- if (joblist.size())
- {
- ret = j == joblist.end() ? joblist.front() : *j;
- boost::lock_guard<boost::mutex> jguard(ret->lock);
- ++ret->workers;
- }
- return ret;
- }
-
-
- template<typename TKN>
- class mmBitext : public Bitext<TKN>
- {
- void load_document_map(string const& fname);
- public:
- void open(string const base, string const L1, string L2);
- mmBitext();
- };
-
- template<typename TKN>
- mmBitext<TKN>::
- mmBitext()
- : Bitext<TKN>(new mmTtrack<TKN>(),
- new mmTtrack<TKN>(),
- new mmTtrack<char>(),
- new TokenIndex(),
- new TokenIndex(),
- new mmTSA<TKN>(),
- new mmTSA<TKN>())
- {};
-
- template<typename TKN>
- void
- mmBitext<TKN>::
- load_document_map(string const& fname)
- {
- ifstream docmap(fname.c_str());
- // the docmap file should list the documents in the corpus
- // in the order in which they appear with one line per document:
- // <docname> <number of lines / sentences>
- //
- // in the future, we might also allow listing documents with
- // sentence ranges.
- string buffer,docname; size_t a=0,b;
- this->m_sid2docid.reset(new vector<id_type>(this->T1->size()));
- while(getline(docmap,buffer))
- {
- istringstream line(buffer);
- if (!(line>>docname)) continue; // empty line
- if (docname.size() && docname[0] == '#') continue; // comment
- size_t docid = this->m_docname2docid.size();
- this->m_docname2docid[docname] = docid;
- line >> b;
- for (b += a; a < b; ++a)
- (*this->m_sid2docid)[a] = docid;
- }
- UTIL_THROW_IF2(b != this->T1->size(),
- "Document map doesn't match corpus!");
- }
-
- template<typename TKN>
- void
- mmBitext<TKN>::
- open(string const base, string const L1, string L2)
- {
- mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
- mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
- mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
- t1.open(base+L1+".mct");
- t2.open(base+L2+".mct");
- tx.open(base+L1+"-"+L2+".mam");
- this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
- this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
- mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
- mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
- i1.open(base+L1+".sfa", this->T1);
- i2.open(base+L2+".sfa", this->T2);
- assert(this->T1->size() == this->T2->size());
-
- string docmapfile = base+".dmp";
- if (!access(docmapfile.c_str(),F_OK))
- load_document_map(docmapfile);
- }
-
-
- template<typename TKN>
- class imBitext : public Bitext<TKN>
- {
- sptr<imTtrack<char> > myTx;
- sptr<imTtrack<TKN> > myT1;
- sptr<imTtrack<TKN> > myT2;
- sptr<imTSA<TKN> > myI1;
- sptr<imTSA<TKN> > myI2;
- static ThreadSafeCounter my_revision;
- public:
- size_t revision() const { return my_revision; }
- void open(string const base, string const L1, string L2);
- imBitext(sptr<TokenIndex> const& V1,
- sptr<TokenIndex> const& V2,
- size_t max_sample = 5000);
- imBitext(size_t max_sample = 5000);
- imBitext(imBitext const& other);
-
- // sptr<imBitext<TKN> >
- // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
-
- sptr<imBitext<TKN> >
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& a) const;
-
- };
-
- template<typename TKN>
- ThreadSafeCounter
- imBitext<TKN>::my_revision;
-
- template<typename TKN>
- imBitext<TKN>::
- imBitext(size_t max_sample)
- {
- this->default_sample_size = max_sample;
- this->V1.reset(new TokenIndex());
- this->V2.reset(new TokenIndex());
- this->V1->setDynamic(true);
- this->V2->setDynamic(true);
- ++my_revision;
- }
-
- template<typename TKN>
- imBitext<TKN>::
- imBitext(sptr<TokenIndex> const& v1,
- sptr<TokenIndex> const& v2,
- size_t max_sample)
- {
- this->default_sample_size = max_sample;
- this->V1 = v1;
- this->V2 = v2;
- this->V1->setDynamic(true);
- this->V2->setDynamic(true);
- ++my_revision;
- }
-
-
- template<typename TKN>
- imBitext<TKN>::
- imBitext(imBitext<TKN> const& other)
- {
- this->myTx = other.myTx;
- this->myT1 = other.myT1;
- this->myT2 = other.myT2;
- this->myI1 = other.myI1;
- this->myI2 = other.myI2;
- this->Tx = this->myTx;
- this->T1 = this->myT1;
- this->T2 = this->myT2;
- this->I1 = this->myI1;
- this->I2 = this->myI2;
- this->V1 = other.V1;
- this->V2 = other.V2;
- this->default_sample_size = other.default_sample_size;
- this->num_workers = other.num_workers;
- ++my_revision;
- }
-
template<typename TKN> class snt_adder;
template<> class snt_adder<L2R_Token<SimpleWordId> >;
- template<>
+ template<>
class snt_adder<L2R_Token<SimpleWordId> >
{
typedef L2R_Token<SimpleWordId> TKN;
- vector<string> const & snt;
+ std::vector<string> const & snt;
TokenIndex & V;
sptr<imTtrack<TKN> > & track;
sptr<imTSA<TKN > > & index;
public:
- snt_adder(vector<string> const& s, TokenIndex& v,
+ snt_adder(std::vector<string> const& s, TokenIndex& v,
sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
-
+
void operator()();
};
- // template<typename TKN>
- // class snt_adder
- // {
- // vector<string> const & snt;
- // TokenIndex & V;
- // sptr<imTtrack<TKN> > & track;
- // sptr<imTSA<TKN > > & index;
- // public:
- // snt_adder(vector<string> const& s, TokenIndex& v,
- // sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
-
- // template<typename T>
- // void operator()();
- // };
-
- // // template<>
- // void
- // snt_adder<L2R_Token<SimpleWordId> >::
- // operator()();
-
- // template<>
- // void
- // snt_adder<char>::
- // operator()()
- // {
- // vector<id_type> sids;
- // sids.reserve(snt.size());
- // BOOST_FOREACH(string const& s, snt)
- // {
- // sids.push_back(track ? track->size() : 0);
- // istringstream buf(s);
- // string w;
- // vector<char> s;
- // s.reserve(100);
- // while (buf >> w)
- // s.push_back(vector<char>(V[w]));
- // track = append(track,s);
- // }
- // index.reset(new imTSA<char>(*index,track,sids,V.tsize()));
- // }
-
- // template<typename TKN>
- // snt_adder<TKN>::
- // snt_adder(vector<string> const& s, TokenIndex& v,
- // sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i)
- // : snt(s), V(v), track(t), index(i)
- // {
- // throw "Not implemented yet.";
- // }
-
- template<>
- sptr<imBitext<L2R_Token<SimpleWordId> > >
- imBitext<L2R_Token<SimpleWordId> >::
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& aln) const;
-
- template<typename TKN>
- sptr<imBitext<TKN> >
- imBitext<TKN>::
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& aln) const
- {
- throw "Not yet implemented";
- }
- // template<typename TKN>
- // sptr<imBitext<TKN> >
- // imBitext<TKN>::
- // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a)
- // {
- // boost::lock_guard<boost::mutex> guard(this->lock);
- // sptr<imBitext<TKN> > ret(new imBitext<TKN>());
- // vector<id_type> sids(1,this->myT1.size()-1);
- // ret->myT1 = add(this->myT1,s1);
- // ret->myT2 = add(this->myT2,s2);
- // size_t v1size = this->V1.tsize();
- // size_t v2size = this->V2.tsize();
- // BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; }
- // BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; }
- // ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size));
- // ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size));
- // ostringstream abuf;
- // BOOST_FOREACH(ushort x, a) binwrite(abuf,x);
- // vector<char> foo(abuf.str().begin(),abuf.str().end());
- // ret->myTx = add(this->myTx,foo);
- // ret->T1 = ret->myT1;
- // ret->T2 = ret->myT2;
- // ret->Tx = ret->myTx;
- // ret->I1 = ret->myI1;
- // ret->I2 = ret->myI2;
- // ret->V1 = this->V1;
- // ret->V2 = this->V2;
- // return ret;
- // }
-
-
- // template<typename TKN>
- // imBitext<TKN>::
- // imBitext()
- // : Bitext<TKN>(new imTtrack<TKN>(),
- // new imTtrack<TKN>(),
- // new imTtrack<char>(),
- // new TokenIndex(),
- // new TokenIndex(),
- // new imTSA<TKN>(),
- // new imTSA<TKN>())
- // {}
-
-
- template<typename TKN>
- void
- imBitext<TKN>::
- open(string const base, string const L1, string L2)
- {
- mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
- mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
- mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
- t1.open(base+L1+".mct");
- t2.open(base+L2+".mct");
- tx.open(base+L1+"-"+L2+".mam");
- this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
- this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
- mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
- mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
- i1.open(base+L1+".sfa", this->T1);
- i2.open(base+L2+".sfa", this->T2);
- assert(this->T1->size() == this->T2->size());
- }
-
template<typename Token>
bool
Bitext<Token>::
find_trg_phr_bounds
- (size_t const sid,
+ (size_t const sid,
size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
int & po_fwd, int & po_bwd,
- vector<uchar>* core_alignment, bitvector* full_alignment,
+ std::vector<uchar>* core_alignment, bitvector* full_alignment,
bool const flip) const
{
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
// a word on the core_alignment:
- //
+ //
// since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
 // < e2, respectively) are by definition unaligned, we store
 // only the core alignment in *core_alignment; it is up to the
@@ -1429,7 +354,7 @@ namespace Moses {
size_t src,trg;
size_t lft = forbidden.size();
size_t rgt = 0;
- vector<vector<ushort> > aln1(slen1),aln2(slen2);
+ std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid);
@@ -1439,18 +364,18 @@ namespace Moses {
else { p = binread(p,src); assert(p<x); p = binread(p,trg); }
UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
- "Alignment range error at sentence " << sid << "!\n"
- << src << "/" << slen1 << " " <<
+ "Alignment range error at sentence " << sid << "!\n"
+ << src << "/" << slen1 << " " <<
trg << "/" << slen2);
-
- if (src < start || src >= stop)
+
+ if (src < start || src >= stop)
forbidden.set(trg);
else
{
lft = min(lft,trg);
rgt = max(rgt,trg);
}
- if (core_alignment)
+ if (core_alignment)
{
aln1[src].push_back(trg);
aln2[trg].push_back(src);
@@ -1458,16 +383,16 @@ namespace Moses {
if (full_alignment)
full_alignment->set(src*slen2 + trg);
}
-
+
for (size_t i = lft; i <= rgt; ++i)
- if (forbidden[i])
+ if (forbidden[i])
return false;
-
+
s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
-
+
if (lft > rgt) return false;
- if (core_alignment)
+ if (core_alignment)
{
core_alignment->clear();
for (size_t i = start; i < stop; ++i)
@@ -1486,60 +411,71 @@ namespace Moses {
}
template<typename Token>
- void
+ sptr<DocumentBias>
Bitext<Token>::
- prep(iter const& phrase, SamplingBias const* const bias) const
+ SetupDocumentBias
+ ( string const& bserver, string const& text, ostream* log ) const
{
- prep2(phrase, this->default_sample_size,bias);
+ sptr<DocumentBias> ret;
+ ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
+ bserver, text, log));
+ return ret;
}
template<typename Token>
- sptr<pstats>
+ void
Bitext<Token>::
- prep2(iter const& phrase, size_t const max_sample,
- SamplingBias const* const bias) const
+ prep(ttasksptr const& ttask, iter const& phrase) const
{
- boost::lock_guard<boost::mutex> guard(this->lock);
- if (!ag)
+ prep2(ttask, phrase, m_default_sample_size);
+ }
+
+ // prep2 schedules a phrase for sampling, and returns immediately
+ // the member function lookup retrieves the respective pstats instance
+ // and waits until the sampling is finished before it returns.
+ // This allows sampling in the background
+ template<typename Token>
+ sptr<pstats>
+ Bitext<Token>
+ ::prep2
+ ( ttasksptr const& ttask, iter const& phrase, int max_sample) const
+ {
+ if (max_sample < 0) max_sample = m_default_sample_size;
+ sptr<ContextScope> scope = ttask->GetScope();
+ sptr<ContextForQuery> context = scope->get<ContextForQuery>(this);
+ sptr<SamplingBias> bias;
+ if (context) bias = context->bias;
+ sptr<pstats::cache_t> cache;
+
+ // - no caching for rare phrases and special requests (max_sample)
+ // (still need to test what a good caching threshold is ...)
+ // - use the task-specific cache when there is a sampling bias
+ if (max_sample == int(m_default_sample_size)
+ && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
- ag.reset(new agenda(*this));
- if (this->num_workers > 1)
- ag->add_workers(this->num_workers);
+ cache = (phrase.root == I1.get()
+ ? (bias ? context->cache1 : m_cache1)
+ : (bias ? context->cache2 : m_cache2));
+ // if (bias) cerr << "Using bias." << endl;
}
sptr<pstats> ret;
-#if 1
- // use pcache only for plain sentence input
- // in the long run, we should create a PT clone for every bias
- // as part of a document-specific decoder
- if (StaticData::Instance().GetInputType() == SentenceInput &&
- max_sample == this->default_sample_size && bias == NULL &&
- phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
- {
- // still need to test what a good caching threshold is
- // is caching here the cause of the apparent memory leak in
- // confusion network decoding ???? No, it isn't.
- // That was because of naive, brute-force input path generation.
- ::uint64_t pid = phrase.getPid();
- pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
- pcache_t::value_type entry(pid,sptr<pstats>());
- pair<pcache_t::iterator,bool> foo;
- foo = cache.insert(entry);
- if (foo.second)
- {
- // cerr << "NEW FREQUENT PHRASE: "
- // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()
- // << " at " << __FILE__ << ":" << __LINE__ << endl;
- foo.first->second = ag->add_job(phrase, max_sample,NULL);
- assert(foo.first->second);
- }
- assert(foo.first->second);
- ret = foo.first->second;
- assert(ret);
+ sptr<pstats> const* cached;
+
+ if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
+ return *cached;
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
+ if (!ag)
+ {
+ ag.reset(new agenda(*this));
+ if (m_num_workers > 1)
+ ag->add_workers(m_num_workers);
}
- else
-#endif
- ret = ag->add_job(phrase, max_sample,bias);
- assert(ret);
+ // cerr << "NEW FREQUENT PHRASE: "
+ // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()
+ // << " at " << __FILE__ << ":" << __LINE__ << endl;
+ ret = ag->add_job(this, phrase, max_sample, bias);
+ if (cache) cache->set(phrase.getPid(),ret);
+ UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
return ret;
}
@@ -1549,20 +485,20 @@ namespace Moses {
{
Ttrack<Token> const& m_other;
sptr<pstats> m_pstats;
- vector<PhrasePair<Token> >& m_pplist;
+ std::vector<PhrasePair<Token> >& m_pplist;
typename PhrasePair<Token>::Scorer const* m_scorer;
PhrasePair<Token> m_pp;
Token const* m_token;
size_t m_len;
- ::uint64_t m_pid1;
+ uint64_t m_pid1;
bool m_is_inverse;
public:
// CONSTRUCTOR
pstats2pplist(typename TSA<Token>::tree_iterator const& m,
Ttrack<Token> const& other,
- sptr<pstats> const& ps,
- vector<PhrasePair<Token> >& dest,
+ sptr<pstats> const& ps,
+ std::vector<PhrasePair<Token> >& dest,
typename PhrasePair<Token>::Scorer const* scorer)
: m_other(other)
, m_pstats(ps)
@@ -1573,17 +509,17 @@ namespace Moses {
, m_pid1(m.getPid())
, m_is_inverse(false)
{ }
-
+
// WORKER
- void
- operator()()
+ void
+ operator()()
{
// wait till all statistics have been collected
boost::unique_lock<boost::mutex> lock(m_pstats->lock);
while (m_pstats->in_progress)
m_pstats->ready.wait(lock);
- m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
+ m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
// convert pstats entries to phrase pairs
pstats::trg_map_t::iterator a;
@@ -1592,10 +528,11 @@ namespace Moses {
uint32_t sid,off,len;
parse_pid(a->first, sid, off, len);
m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
- m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint);
+ m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
+ m_pp.joint);
size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
- if (m_pp.good1 > J || m_pp.good2 > J) continue;
- if (m_scorer)
+ if (m_pp.good1 > J || m_pp.good2 > J) continue;
+ if (m_scorer)
{
(*m_scorer)(m_pp);
}
@@ -1605,26 +542,27 @@ namespace Moses {
if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
}
};
-
+
+#if 0
template<typename Token>
void
Bitext<Token>::
- lookup(vector<Token> const& snt, TSA<Token>& idx,
- vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
- vector<vector<typename ::uint64_t> >* pidmap,
+ lookup(std::vector<Token> const& snt, TSA<Token>& idx,
+ std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest,
+ std::vector<std::vector<uint64_t> >* pidmap,
typename PhrasePair<Token>::Scorer* scorer,
- SamplingBias const* const bias, bool multithread) const
+ sptr<SamplingBias const> const& bias, bool multithread) const
{
- // typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t;
-
- dest.clear();
+ // typedef std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > > ret_t;
+
+ dest.clear();
dest.resize(snt.size());
if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); }
- // collect statistics in parallel, then build PT entries as
+ // collect statistics in parallel, then build PT entries as
// the sampling finishes
bool fwd = &idx == I1.get();
- vector<boost::thread*> workers; // background threads doing the lookup
+ std::vector<boost::thread*> workers; // background threads doing the lookup
pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2);
if (C.capacity() < 100000) C.reserve(100000);
for (size_t i = 0; i < snt.size(); ++i)
@@ -1633,19 +571,19 @@ namespace Moses {
typename TSA<Token>::tree_iterator m(&idx);
for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k)
{
- ::uint64_t key = m.getPid();
+ uint64_t key = m.getPid();
if (pidmap) (*pidmap)[i].push_back(key);
- sptr<vector<PhrasePair<Token> > > pp = C.get(key);
- if (pp)
+ sptr<std::vector<PhrasePair<Token> > > pp = C.get(key);
+ if (pp)
dest[i].push_back(pp);
- else
+ else
{
- pp.reset(new vector<PhrasePair<Token> >());
+ pp.reset(new std::vector<PhrasePair<Token> >());
C.set(key,pp);
dest[i].push_back(pp);
sptr<pstats> x = prep2(m, this->default_sample_size,bias);
pstats2pplist<Token> w(m,*(fwd?T2:T1),x,*pp,scorer);
- if (multithread)
+ if (multithread)
{
boost::thread* t = new boost::thread(w);
workers.push_back(t);
@@ -1654,24 +592,30 @@ namespace Moses {
}
}
}
- for (size_t w = 0; w < workers.size(); ++w)
+ for (size_t w = 0; w < workers.size(); ++w)
{
- workers[w]->join();
+ workers[w]->join();
delete workers[w];
}
}
+#endif
template<typename Token>
- sptr<pstats>
+ sptr<pstats>
Bitext<Token>::
- lookup(iter const& phrase, SamplingBias const* const bias) const
+ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
{
- sptr<pstats> ret = prep2(phrase, this->default_sample_size, bias);
- assert(ret);
- boost::lock_guard<boost::mutex> guard(this->lock);
- if (this->num_workers <= 1)
- typename agenda::worker(*this->ag)();
- else
+ sptr<pstats> ret = prep2(ttask, phrase, max_sample);
+
+ UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer.");
+
+ // Why were we locking here?
+ if (m_num_workers <= 1)
+ {
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
+ typename agenda::worker(*this->ag)();
+ }
+ else
{
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
@@ -1681,16 +625,120 @@ namespace Moses {
}
template<typename Token>
- sptr<pstats>
+ void
+ Bitext<Token>
+ ::mark_match(Token const* start, Token const* end,
+ iter const& m, bitvector& check) const
+ {
+ check.resize(end-start);
+ check.reset();
+ Token const* x = m.getToken(0);
+ for (Token const* s = start; s < end; ++s)
+ {
+ if (s->id() != x->id()) continue;
+ Token const* a = x;
+ Token const* b = s;
+ size_t i = 0;
+ while (a && b && a->id() == b->id() && i < m.size())
+ {
+ ++i;
+ a = a->next();
+ b = b->next();
+ }
+ if (i == m.size())
+ {
+ b = s;
+ while (i-- > 0) { check.set(b-start); b = b->next(); }
+ }
+ }
+ }
+
+ template<typename Token>
+ void
+ Bitext<Token>::
+ write_yawat_alignment
+ ( id_type const sid, iter const* m1, iter const* m2, ostream& out ) const
+ {
+ vector<int> a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1);
+ bitvector f1(a1.size()), f2(a2.size());
+ if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1);
+ if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2);
+
+ vector<pair<bitvector,bitvector> > agroups;
+ vector<string> grouplabel;
+ pair<bitvector,bitvector> ag;
+ ag.first.resize(a1.size());
+ ag.second.resize(a2.size());
+ char const* x = Tx->sntStart(sid);
+ size_t a, b;
+ while (x < Tx->sntEnd(sid))
+ {
+ x = binread(x,a);
+ x = binread(x,b);
+ if (a1.at(a) < 0 && a2.at(b) < 0)
+ {
+ a1[a] = a2[b] = agroups.size();
+ ag.first.reset();
+ ag.second.reset();
+ ag.first.set(a);
+ ag.second.set(b);
+ agroups.push_back(ag);
+ grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec");
+ }
+ else if (a1.at(a) < 0)
+ {
+ a1[a] = a2[b];
+ agroups[a2[b]].first.set(a);
+ if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
+ }
+ else if (a2.at(b) < 0)
+ {
+ a2[b] = a1[a];
+ agroups[a1[a]].second.set(b);
+ if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
+ }
+ else
+ {
+ agroups[a1[a]].first |= agroups[a2[b]].first;
+ agroups[a1[a]].second |= agroups[a2[b]].second;
+ a2[b] = a1[a];
+ if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
+ }
+ }
+
+ for (a = 0; a < a1.size(); ++a)
+ {
+ if (a1[a] < 0)
+ {
+ if (f1[a]) out << a << "::" << "infocusmono ";
+ continue;
+ }
+ bitvector const& A = agroups[a1[a]].first;
+ bitvector const& B = agroups[a1[a]].second;
+ if (A.find_first() < a) continue;
+ write_bitvector(A,out); out << ":";
+ write_bitvector(B,out); out << ":";
+ out << grouplabel[a1[a]] << " ";
+ }
+ for (b = 0; b < a2.size(); ++b)
+ {
+ if (a2[b] < 0 && f2[b])
+ out << "::" << "infocusmono ";
+ }
+ }
+
+#if 0
+ template<typename Token>
+ sptr<pstats>
Bitext<Token>::
- lookup(iter const& phrase, size_t const max_sample,
- SamplingBias const* const bias) const
+ lookup(siter const& phrase, size_t const max_sample,
+ sptr<SamplingBias const> const& bias) const
{
sptr<pstats> ret = prep2(phrase, max_sample);
- boost::lock_guard<boost::mutex> guard(this->lock);
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
if (this->num_workers <= 1)
typename agenda::worker(*this->ag)();
- else
+ else
{
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
@@ -1698,75 +746,58 @@ namespace Moses {
}
return ret;
}
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- ~agenda()
- {
- this->lock.lock();
- this->shutdown = true;
- this->lock.unlock();
- for (size_t i = 0; i < workers.size(); ++i)
- workers[i]->join();
- }
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- agenda(Bitext<Token> const& thebitext)
- : shutdown(false), doomed(0), bt(thebitext)
- { }
-
- template<typename Token>
- bool
- Bitext<Token>::
- agenda::
- job::
- done() const
- {
- return (max_samples && stats->good >= max_samples) || next == stop;
- }
-
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- template<typename TKN>
- ThreadSafeCounter
- Bitext<TKN>::
- agenda::
- job::active;
#endif
template<typename Token>
- void
- expand(typename Bitext<Token>::iter const& m,
- Bitext<Token> const& bt,
- pstats const& ps, vector<PhrasePair<Token> >& dest)
+ void
+ expand(typename Bitext<Token>::iter const& m,
+ Bitext<Token> const& bt, pstats const& ps,
+ std::vector<PhrasePair<Token> >& dest, ostream* log)
{
bool fwd = m.root == bt.I1.get();
dest.reserve(ps.trg.size());
PhrasePair<Token> pp;
pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
- // cout << HERE << " " << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl;
+ // cout << HERE << " "
+ // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl;
pstats::trg_map_t::const_iterator a;
for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
{
uint32_t sid,off,len;
parse_pid(a->first, sid, off, len);
- pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
+ pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
len, a->second);
dest.push_back(pp);
}
+ }
+
#if 0
- typename PhrasePair<Token>::SortByTargetIdSeq sorter;
- sort(dest.begin(), dest.end(),sorter);
- BOOST_FOREACH(PhrasePair<Token> const& p, dest)
- cout << toString (*(fwd ? bt.V1 : bt.V2),p.start1,p.len1) << " ::: "
- << toString (*(fwd ? bt.V2 : bt.V1),p.start2,p.len2) << " "
- << p.joint << endl;
+ template<typename Token>
+ class
+ PStatsCache
+ {
+ typedef boost::unordered_map<uint64_t, sptr<pstats> > my_cache_t;
+ boost::shared_mutex m_lock;
+ my_cache_t m_cache;
+
+ public:
+ sptr<pstats> get(Bitext<Token>::iter const& phrase) const;
+
+ sptr<pstats>
+ add(Bitext<Token>::iter const& phrase) const
+ {
+ uint64_t pid = phrase.getPid();
+ std::pair<my_cache_t::iterator,bool>
+ }
+
+
+ };
#endif
- }
-
} // end of namespace bitext
} // end of namespace moses
-#endif
+
+#include "ug_im_bitext.h"
+#include "ug_mm_bitext.h"
+
+
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
new file mode 100644
index 000000000..d07fba6aa
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
@@ -0,0 +1,186 @@
+// -*- c++ -*-
+// to be included from ug_bitext.h
+
+// The agenda handles parallel sampling.
+// It maintains a queue of unfinished sampling jobs and
+// assigns them to a pool of workers.
+//
+template<typename Token>
+class Bitext<Token>
+::agenda
+{
+public:
+ class job;
+ class worker;
+private:
+ boost::mutex lock;
+ std::list<sptr<job> > joblist;
+ std::vector<sptr<boost::thread> > workers;
+ bool shutdown;
+ size_t doomed;
+
+public:
+
+
+ Bitext<Token> const& bt;
+
+ agenda(Bitext<Token> const& bitext);
+ ~agenda();
+
+ void
+ add_workers(int n);
+
+ sptr<pstats>
+ add_job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& phrase,
+ size_t const max_samples, sptr<SamplingBias const> const& bias);
+ // add_job(Bitext<Token> const* const theBitext,
+ // typename TSA<Token>::tree_iterator const& phrase,
+ // size_t const max_samples, SamplingBias const* const bias);
+
+ sptr<job>
+ get_job();
+};
+
+template<typename Token>
+class
+Bitext<Token>::agenda::
+worker
+{
+ agenda& ag;
+public:
+ worker(agenda& a) : ag(a) {}
+ void operator()();
+};
+
+#include "ug_bitext_agenda_worker.h"
+#include "ug_bitext_agenda_job.h"
+
+template<typename Token>
+void Bitext<Token>
+::agenda
+::add_workers(int n)
+{
+ static boost::posix_time::time_duration nodelay(0,0,0,0);
+ boost::lock_guard<boost::mutex> guard(this->lock);
+
+ int target = max(1, int(n + workers.size() - this->doomed));
+ // house keeping: remove all workers that have finished
+ for (size_t i = 0; i < workers.size(); )
+ {
+ if (workers[i]->timed_join(nodelay))
+ {
+ if (i + 1 < workers.size())
+ workers[i].swap(workers.back());
+ workers.pop_back();
+ }
+ else ++i;
+ }
+ // cerr << workers.size() << "/" << target << " active" << endl;
+ if (int(workers.size()) > target)
+ this->doomed = workers.size() - target;
+ else
+ while (int(workers.size()) < target)
+ {
+ sptr<boost::thread> w(new boost::thread(worker(*this)));
+ workers.push_back(w);
+ }
+}
+
+
+template<typename Token>
+sptr<pstats> Bitext<Token>
+::agenda
+::add_job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& phrase,
+ size_t const max_samples, sptr<SamplingBias const> const& bias)
+{
+ boost::unique_lock<boost::mutex> lk(this->lock);
+ static boost::posix_time::time_duration nodelay(0,0,0,0);
+ bool fwd = phrase.root == bt.I1.get();
+ sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
+ max_samples, fwd, bias));
+ j->stats->register_worker();
+
+ joblist.push_back(j);
+ if (joblist.size() == 1)
+ {
+ size_t i = 0;
+ while (i < workers.size())
+ {
+ if (workers[i]->timed_join(nodelay))
+ {
+ if (doomed)
+ {
+ if (i+1 < workers.size())
+ workers[i].swap(workers.back());
+ workers.pop_back();
+ --doomed;
+ }
+ else
+ workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this)));
+ }
+ else ++i;
+ }
+ }
+ return j->stats;
+}
+
+template<typename Token>
+sptr<typename Bitext<Token>::agenda::job>
+Bitext<Token>
+::agenda
+::get_job()
+{
+ // cerr << workers.size() << " workers on record" << endl;
+ sptr<job> ret;
+ if (this->shutdown) return ret;
+ boost::unique_lock<boost::mutex> lock(this->lock);
+ if (this->doomed)
+ { // the number of workers has been reduced, tell the redundant ones to quit
+ --this->doomed;
+ return ret;
+ }
+
+ typename list<sptr<job> >::iterator j = joblist.begin();
+ while (j != joblist.end())
+ {
+ if ((*j)->done())
+ {
+ (*j)->stats->release();
+ joblist.erase(j++);
+ }
+ else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job
+ else break; // found one
+ }
+ if (joblist.size())
+ {
+ ret = j == joblist.end() ? joblist.front() : *j;
+ // if we've reached the end of the queue (all jobs have 4 workers on them),
+ // take the first in the queue
+ boost::lock_guard<boost::mutex> jguard(ret->lock);
+ ++ret->workers;
+ }
+ return ret;
+}
+
+template<typename Token>
+Bitext<Token>::
+agenda::
+~agenda()
+{
+ this->lock.lock();
+ this->shutdown = true;
+ this->lock.unlock();
+ for (size_t i = 0; i < workers.size(); ++i)
+ workers[i]->join();
+}
+
+template<typename Token>
+Bitext<Token>::
+agenda::
+agenda(Bitext<Token> const& thebitext)
+ : shutdown(false), doomed(0), bt(thebitext)
+{ }
+
+
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
new file mode 100644
index 000000000..0e0624351
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
@@ -0,0 +1,240 @@
+// -*- c++ -*-
+// class declaration of template<typename Token> class Bitext<Token>::agenda::job
+// to be included by ug_bitext.h
+// todo: add check to enforce this
+
+template<typename Token>
+class
+Bitext<Token>::agenda::
+job
+{
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ static ThreadSafeCounter active;
+#endif
+ Bitext<Token> const* const m_bitext;
+ boost::mutex lock;
+ friend class agenda;
+ boost::taus88 rnd; // every job has its own pseudo random generator
+ double rnddenom; // denominator for scaling random sampling
+ size_t min_diverse; // minimum number of distinct translations
+
+ bool flip_coin(uint64_t & sid, uint64_t & offset);
+ bool step(uint64_t & sid, uint64_t & offset); // proceed to next occurrence
+
+public:
+ size_t workers; // how many workers are working on this job?
+ sptr<TSA<Token> const> root; // root of the underlying suffix array
+ char const* next; // next position to read from
+ char const* stop; // end of index range
+ size_t max_samples; // how many samples to extract at most
+ size_t ctr; /* # of phrase occurrences considered so far
+ * # of samples chosen is stored in stats->good
+ */
+ size_t len; // phrase length
+ bool fwd; // if true, source phrase is L1
+ sptr<pstats> stats; // stores statistics collected during sampling
+ sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling
+ float bias_total;
+ bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
+
+ int
+ check_sample_distribution(uint64_t const& sid, uint64_t const& offset);
+ // for biased sampling: ensure the distribution approximately matches
+ // the bias
+
+ bool done() const;
+ job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& m,
+ sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
+ sptr<SamplingBias const> const& bias);
+ ~job();
+};
+
+template<typename Token>
+Bitext<Token>::agenda::job
+::~job()
+{
+ if (stats) stats.reset();
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ // counter may not exist any more at destruction time, hence try .. catch ...
+ try { --active; } catch (...) {}
+#endif
+}
+
+template<typename Token>
+Bitext<Token>::agenda::job
+::job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& m,
+ sptr<TSA<Token> > const& r, size_t maxsmpl,
+ bool isfwd, sptr<SamplingBias const> const& bias)
+ : m_bitext(theBitext)
+ , rnd(0)
+ , rnddenom(rnd.max() + 1.)
+ , min_diverse(1)
+ , workers(0)
+ , root(r)
+ , next(m.lower_bound(-1))
+ , stop(m.upper_bound(-1))
+ , max_samples(maxsmpl)
+ , ctr(0)
+ , len(m.size())
+ , fwd(isfwd)
+ , m_bias(bias)
+{
+ stats.reset(new pstats());
+ stats->raw_cnt = m.approxOccurrenceCount();
+ bias_total = 0;
+
+ // we need to renormalize on the fly, as the sum of all sentence probs over
+ // all candidates (not all sentences in the corpus) needs to add to 1.
+ // Profiling question: how much does that cost us?
+ if (m_bias)
+ {
+ // int ctr = 0;
+ stats->raw_cnt = 0;
+ for (char const* x = m.lower_bound(-1); x < stop;)
+ {
+ uint32_t sid; ushort offset;
+ x = root->readSid(x,stop,sid);
+ x = root->readOffset(x,stop,offset);
+#if 0
+ cerr << ctr++ << " " << m.str(m_bitext->V1.get())
+ << " " << sid << "/" << root->getCorpusSize()
+ << " " << offset << " " << stop-x << endl;
+#endif
+ bias_total += (*m_bias)[sid];
+ ++stats->raw_cnt;
+ }
+ }
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ ++active;
+ // if (active%5 == 0)
+ // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
+#endif
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::done() const
+{
+ return (max_samples && stats->good >= max_samples) || next == stop;
+}
+
+template<typename Token>
+int Bitext<Token>::agenda::job
+::check_sample_distribution(uint64_t const& sid, uint64_t const& offset)
+{ // ensure that the sampled distribution approximately matches the bias
+ // @return 0: SKIP this occurrence
+ // @return 1: consider this occurrence for sampling
+ // @return 2: include this occurrence in the sample by all means
+
+ if (!m_bias) return 1;
+
+ using namespace boost::math;
+ typedef boost::math::binomial_distribution<> binomial;
+
+ ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL;
+
+ float p = (*m_bias)[sid];
+ id_type docid = m_bias->GetClass(sid);
+ uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0;
+
+ // always consider candidates from dominating documents and
+ // from documents that have not been considered at all yet
+ bool ret = (p > .5 || k == 0);
+
+ if (ret && !log) return 1;
+
+ uint32_t N = stats->good; // number of trials
+ float d = cdf(complement(binomial(N, p), k));
+ // d: probability that samples contains k or more instances from doc #docid
+ ret = ret || d >= .05;
+
+ if (log)
+ {
+ Token const* t = root->getCorpus()->sntStart(sid)+offset;
+ Token const* x = t - min(offset,uint64_t(3));
+ Token const* e = t+4;
+ if (e > root->getCorpus()->sntEnd(sid))
+ e = root->getCorpus()->sntEnd(sid);
+ *log << docid << ":" << sid << " " << size_t(k) << "/" << N
+ << " @" << p << " => " << d << " [";
+ for (size_t i = 0; i < stats->indoc.size(); ++i)
+ {
+ if (i) *log << " ";
+ *log << stats->indoc[i];
+ }
+ *log << "] ";
+ for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " ";
+ if (!ret) *log << "SKIP";
+ else if (p < .5 && d > .9) *log << "FORCE";
+ *log << endl;
+ }
+
+ return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0);
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::flip_coin(uint64_t & sid, uint64_t & offset)
+{
+ int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1;
+ if (no_maybe_yes == 0) return false; // no
+ if (no_maybe_yes > 1) return true; // yes
+ // ... maybe: flip a coin
+ size_t options_chosen = stats->good;
+ size_t options_total = max(stats->raw_cnt, this->ctr);
+ size_t options_left = (options_total - this->ctr);
+ size_t random_number = options_left * (rnd()/(rnd.max()+1.));
+ size_t threshold;
+ if (bias_total) // we have a bias and there are candidates with non-zero prob
+ threshold = ((*m_bias)[sid]/bias_total * options_total * max_samples);
+ else // no bias, or all have prob 0 (can happen with a very opinionated bias)
+ threshold = max_samples;
+ return random_number + options_chosen < threshold;
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::step(uint64_t & sid, uint64_t & offset)
+{ // caller must lock!
+ if (next == stop) return false;
+ UTIL_THROW_IF2
+ ( next > stop, "Fatal error at " << HERE << ". How did that happen?" );
+ // boost::lock_guard<boost::mutex> jguard(lock); // caller must lock!
+ next = root->readSid(next, stop, sid);
+ next = root->readOffset(next, stop, offset);
+ ++ctr;
+ return true;
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::nextSample(uint64_t & sid, uint64_t & offset)
+{
+ boost::lock_guard<boost::mutex> jguard(lock);
+ if (max_samples == 0) // no sampling, consider all occurrences
+ return step(sid, offset);
+
+ while (step(sid,offset))
+ {
+ size_t good = stats->good;
+ size_t diversity = stats->trg.size();
+ if (good >= max_samples && diversity >= min_diverse)
+ return false; // done
+
+ // flip_coin softly enforces approximation of the sampling to the
+ // bias (occurrences that would steer the sample too far from the bias
+ // are ruled out), and flips a biased coin otherwise.
+ if (!flip_coin(sid,offset)) continue;
+ return true;
+ }
+ return false;
+}
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+template<typename TKN>
+ThreadSafeCounter Bitext<TKN>::agenda
+::job
+::active;
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
new file mode 100644
index 000000000..5ff39312c
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
@@ -0,0 +1,102 @@
+// to be included from ug_bitext_agenda.h
+
+template<typename Token>
+void
+Bitext<Token>::agenda
+::worker
+::operator()()
+{
+ // things to do:
+ //
+ // - have each worker maintain their own pstats object and merge
+ // results at the end (to minimize mutex locking);
+ //
+ // - use a non-locked, monotonically increasing counter to
+ // ensure the minimum size of samples considered --- it's OK if
+ // we look at more samples than required. This way, we can
+ // reduce the number of lock / unlock operations we need to do
+ // during sampling.
+
+ uint64_t sid=0, offset=0; // sid and offset of source phrase
+ size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase
+ vector<uchar> aln; // stores phrase-pair-internal alignment
+ while(sptr<job> j = ag.get_job())
+ {
+ j->stats->register_worker();
+ bitvector full_alignment(100*100); // Is full_alignment still needed???
+ while (j->nextSample(sid,offset))
+ {
+ aln.clear();
+ int po_fwd = Moses::LRModel::NONE;
+ int po_bwd = Moses::LRModel::NONE;
+ int docid = j->m_bias ? j->m_bias->GetClass(sid) : -1;
+ bitvector* full_aln = j->fwd ? &full_alignment : NULL;
+
+ // find soft and hard boundaries of target phrase
+ bool good = (ag.bt.find_trg_phr_bounds
+ (sid, offset, offset + j->len, // input parameters
+ s1, s2, e1, e2, po_fwd, po_bwd, // bounds & orientation
+ &aln, full_aln, !j->fwd)); // aln info / flip sides?
+
+ if (!good)
+ { // no good, probably because phrase is not coherent
+ j->stats->count_sample(docid, 0, po_fwd, po_bwd);
+ continue;
+ }
+
+ // all good: register this sample as valid
+ size_t num_pairs = (s2-s1+1) * (e2-e1+1);
+ j->stats->count_sample(docid, num_pairs, po_fwd, po_bwd);
+
+#if 0
+ Token const* t = ag.bt.T2->sntStart(sid);
+ Token const* eos = ag.bt.T2->sntEnd(sid);
+ cerr << "[" << j->stats->good + 1 << "] ";
+ while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " ";
+ cerr << "[" << docid << "]" << endl;
+#endif
+
+ float sample_weight = 1./num_pairs;
+ Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
+
+ // adjust offsets in phrase-internal alignment
+ for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1;
+
+ vector<uint64_t> seen; seen.reserve(10);
+ // It is possible that the phrase extraction extracts the same
+ // phrase twice, e.g., when word a co-occurs with sequence b b b
+ // but is aligned only to the middle word. We can only count
+ // each phrase pair once per source phrase occurrence, or else
+ // run the risk of having more joint counts than marginal
+ // counts.
+
+ for (size_t s = s1; s <= s2; ++s)
+ {
+ TSA<Token> const& I = j->fwd ? *ag.bt.I2 : *ag.bt.I1;
+ sptr<iter> b = I.find(o + s, e1 - s);
+ UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found");
+
+ for (size_t i = e1; i <= e2; ++i)
+ {
+ uint64_t tpid = b->getPid();
+
+ // poor man's protection against over-counting
+ size_t s = 0;
+ while (s < seen.size() && seen[s] != tpid) ++s;
+ if (s < seen.size()) continue;
+ seen.push_back(tpid);
+
+ size_t raw2 = b->approxOccurrenceCount();
+ j->stats->add(tpid, sample_weight, aln, raw2,
+ po_fwd, po_bwd, docid);
+ bool ok = (i == e2) || b->extend(o[i].id());
+ UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
+ }
+ if (s < s2) // shift phrase-internal alignments
+ for (size_t k = 1; k < aln.size(); k += 2)
+ --aln[k];
+ }
+ }
+ j->stats->release(); // indicate that you're done working on j->stats
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
new file mode 100644
index 000000000..bcda9ebf3
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -0,0 +1,90 @@
+#include "ug_bitext_jstats.h"
+namespace Moses
+{
+ namespace bitext
+ {
+
+ uint32_t jstats::rcnt() const { return my_rcnt; }
+ float jstats::wcnt() const { return my_wcnt; }
+ uint32_t jstats::cnt2() const { return my_cnt2; }
+
+ // What was that used for again? UG
+ bool jstats::valid() { return my_wcnt >= 0; }
+ void jstats::validate() { if (my_wcnt < 0) my_wcnt *= -1; }
+ void jstats::invalidate() { if (my_wcnt > 0) my_wcnt *= -1; }
+
+ jstats::
+ jstats()
+ : my_rcnt(0), my_cnt2(0), my_wcnt(0)
+ {
+ for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+ ofwd[i] = obwd[i] = 0;
+ my_aln.reserve(1);
+ }
+
+ jstats::
+ jstats(jstats const& other)
+ {
+ my_rcnt = other.rcnt();
+ my_wcnt = other.wcnt();
+ my_aln = other.aln();
+ indoc = other.indoc;
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ ofwd[i] = other.ofwd[i];
+ obwd[i] = other.obwd[i];
+ }
+ }
+
+ uint32_t
+ jstats::
+ dcnt_fwd(PhraseOrientation const idx) const
+ {
+ assert(idx <= Moses::LRModel::NONE);
+ return ofwd[idx];
+ }
+
+ uint32_t
+ jstats::
+ dcnt_bwd(PhraseOrientation const idx) const
+ {
+ assert(idx <= Moses::LRModel::NONE);
+ return obwd[idx];
+ }
+
+ void
+ jstats::
+ add(float w, vector<uchar> const& a, uint32_t const cnt2,
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
+ {
+ boost::lock_guard<boost::mutex> lk(this->lock);
+ my_cnt2 = cnt2;
+ my_rcnt += 1;
+ my_wcnt += w;
+ if (a.size())
+ {
+ size_t i = 0;
+ while (i < my_aln.size() && my_aln[i].second != a) ++i;
+ if (i == my_aln.size())
+ my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
+ else
+ my_aln[i].first++;
+ if (my_aln[i].first > my_aln[i/2].first)
+ push_heap(my_aln.begin(),my_aln.begin()+i+1);
+ }
+ ++ofwd[fwd_orient];
+ ++obwd[bwd_orient];
+ if (docid >= 0)
+ {
+ while (int(indoc.size()) <= docid) indoc.push_back(0);
+ ++indoc[docid];
+ }
+ }
+
+ vector<pair<size_t, vector<uchar> > > const&
+ jstats::
+ aln() const
+ { return my_aln; }
+
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
new file mode 100644
index 000000000..dade27649
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -0,0 +1,51 @@
+// -*- c++ -*-
+#pragma once
+#include "ug_typedefs.h"
+#include "ug_lexical_reordering.h"
+#include <boost/thread.hpp>
+
+namespace Moses
+{
+ namespace bitext
+ {
+ using namespace ugdiss;
+
+ // "joint" (i.e., phrase pair) statistics
+ class
+ jstats
+ {
+ boost::mutex lock;
+ uint32_t my_rcnt; // unweighted joint count
+ uint32_t my_cnt2; // raw counts L2
+ float my_wcnt; // weighted joint count
+
+ // to do: use a static alignment pattern store that stores each pattern only
+ // once, so that we don't have to store so many alignment vectors
+ vector<pair<size_t, vector<uchar> > > my_aln; // internal word alignment
+
+ uint32_t ofwd[Moses::LRModel::NONE+1]; // forward distortion type counts
+ uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts
+
+ public:
+ vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
+ jstats();
+ jstats(jstats const& other);
+ uint32_t rcnt() const; // raw joint counts
+ uint32_t cnt2() const; // raw target phrase occurrence count
+ float wcnt() const; // weighted joint counts
+
+ vector<pair<size_t, vector<uchar> > > const & aln() const;
+ void add(float w, vector<uchar> const& a, uint32_t const cnt2,
+ uint32_t fwd_orient, uint32_t bwd_orient,
+ int const docid);
+ void invalidate();
+ void validate();
+ bool valid();
+ uint32_t dcnt_fwd(PhraseOrientation const idx) const;
+ uint32_t dcnt_bwd(PhraseOrientation const idx) const;
+ void fill_lr_vec(Moses::LRModel::Direction const& dir,
+ Moses::LRModel::ModelType const& mdl,
+ vector<float>& v);
+ };
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
new file mode 100644
index 000000000..580d7669b
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -0,0 +1,86 @@
+#include "ug_bitext_pstats.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ ThreadSafeCounter pstats::active;
+#endif
+
+ pstats::
+ pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
+ {
+ for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+ ofwd[i] = obwd[i] = 0;
+ }
+
+ pstats::
+ ~pstats()
+ {
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ // counter may not exist any more at destruction time, so try ... catch
+ try { --active; } catch (...) {}
+#endif
+ }
+
+ void
+ pstats::
+ register_worker()
+ {
+ this->lock.lock();
+ ++this->in_progress;
+ this->lock.unlock();
+ }
+
+ void
+ pstats::
+ release()
+ {
+ this->lock.lock();
+ if (this->in_progress-- == 1) // last one -> we're done
+ this->ready.notify_all();
+ this->lock.unlock();
+ }
+
+ void
+ pstats
+ ::count_sample(int const docid, size_t const num_pairs,
+ int const po_fwd, int const po_bwd)
+ {
+ boost::lock_guard<boost::mutex> guard(lock);
+ ++sample_cnt;
+ if (num_pairs == 0) return;
+ ++good;
+ sum_pairs += num_pairs;
+ ++ofwd[po_fwd];
+ ++obwd[po_bwd];
+ if (docid >= 0)
+ {
+ while (int(indoc.size()) <= docid) indoc.push_back(0);
+ ++indoc[docid];
+ }
+ }
+
+ bool
+ pstats::
+ add(uint64_t pid, float const w,
+ vector<uchar> const& a,
+ uint32_t const cnt2,
+ uint32_t fwd_o,
+ uint32_t bwd_o, int const docid)
+ {
+ boost::lock_guard<boost::mutex> guard(this->lock);
+ jstats& entry = this->trg[pid];
+ entry.add(w, a, cnt2, fwd_o, bwd_o, docid);
+ if (this->good < entry.rcnt())
+ {
+ UTIL_THROW(util::Exception, "more joint counts than good counts:"
+ << entry.rcnt() << "/" << this->good << "!");
+ }
+ return true;
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
new file mode 100644
index 000000000..9a14e378b
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -0,0 +1,63 @@
+// -*- c++ -*-
+#pragma once
+
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "ug_typedefs.h"
+#include "ug_bitext_jstats.h"
+#include "moses/thread_safe_container.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+ struct
+ pstats
+ {
+ typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t;
+ typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t;
+ typedef std::vector<uchar> alnvec;
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ static ThreadSafeCounter active;
+#endif
+ boost::mutex lock; // for parallel gathering of stats
+ boost::condition_variable ready; // consumers can wait for me to be ready
+
+ size_t raw_cnt; // (approximate) raw occurrence count
+ size_t sample_cnt; // number of instances selected during sampling
+ size_t good; // number of selected instances with valid word alignments
+ size_t sum_pairs; // total number of target phrases extracted (can be > raw_cnt)
+ size_t in_progress; // how many threads are currently working on this?
+
+ uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations
+ uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations
+
+ std::vector<uint32_t> indoc; // distribution over where samples came from
+
+ typedef std::map<uint64_t, jstats> trg_map_t;
+ trg_map_t trg;
+ pstats();
+ ~pstats();
+ void release();
+ void register_worker();
+ size_t count_workers() { return in_progress; }
+
+ bool
+ add(uint64_t const pid, // target phrase id
+ float const w, // sample weight (1./(# of phrases extractable))
+ alnvec const& a, // local alignment
+ uint32_t const cnt2, // raw target phrase count
+ uint32_t fwd_o, // fwd. phrase orientation
+ uint32_t bwd_o, // bwd. phrase orientation
+ int const docid); // document where sample was found
+
+ void
+ count_sample(int const docid, // document where sample was found
+ size_t const num_pairs, // # of phrases extractable here
+ int const po_fwd, // fwd phrase orientation
+ int const po_bwd); // bwd phrase orientation
+ };
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h
index 845fe374e..89dc93ad1 100644
--- a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h
+++ b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h
@@ -25,13 +25,13 @@ namespace ugdiss
return NULL;
};
- ConllBottomUpToken const*
- stop(ConllBottomUpToken const* seqStart,
+ ConllBottomUpToken const*
+ stop(ConllBottomUpToken const* seqStart,
ConllBottomUpToken const* seqEnd) const
{
return NULL;
};
-
+
bool operator<(T const& other) const { return this->cmp(other) < 0; }
bool operator>(T const& other) const { return this->cmp(other) > 0; }
bool operator==(T const& other) const { return this->cmp(other) == 0; }
@@ -44,9 +44,9 @@ namespace ugdiss
return false;
}
};
-
+
template<typename T>
- ConllBottomUpToken<T> const*
+ ConllBottomUpToken<T> const*
ConllBottomUpToken<T>::
next(int length) const
{
diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.h b/moses/TranslationModel/UG/mm/ug_conll_record.h
index ea2cda29e..e52a4974b 100644
--- a/moses/TranslationModel/UG/mm/ug_conll_record.h
+++ b/moses/TranslationModel/UG/mm/ug_conll_record.h
@@ -3,22 +3,22 @@
#include "ug_typedefs.h"
// Base class for dependency tree corpora with POS and Lemma annotations
-namespace ugdiss
+namespace ugdiss
{
using namespace std;
- class
- Conll_Record
+ class
+ Conll_Record
{
public:
id_type sform; // surface form
id_type lemma; // lemma
uchar majpos; // major part of speech
uchar minpos; // minor part of speech
- short parent; // id of parent
+ short parent; // id of parent
uchar dtype; // dependency type
uchar info[3]; /* additional information (depends on the part of speech)
- * a place holder for the time being, to ensure proper
+ * a place holder for the time being, to ensure proper
* alignment in memory */
Conll_Record();
Conll_Record const* up(int length=1) const;
@@ -38,8 +38,8 @@ namespace ugdiss
* @parameter PS Vocabulary for part-of-speech
* @parameter DT Vocabulary for dependency type
*/
- Conll_Record(string const& line,
- TokenIndex const& SF, TokenIndex const& LM,
+ Conll_Record(string const& line,
+ TokenIndex const& SF, TokenIndex const& LM,
TokenIndex const& PS, TokenIndex const& DT);
/** store the record as-is to disk (for memory-mapped reading later) */
@@ -62,7 +62,7 @@ namespace ugdiss
// this is for contigous word sequences extracted from longer sequences
// adjust parent pointers to 0 (no parent) if they point out of the
// subsequence
- void
+ void
fixParse(Conll_Record* start, Conll_Record* stop);
} // end of namespace ugdiss
diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.cc b/moses/TranslationModel/UG/mm/ug_corpus_token.cc
index 742c17ace..4be8cbd95 100644
--- a/moses/TranslationModel/UG/mm/ug_corpus_token.cc
+++ b/moses/TranslationModel/UG/mm/ug_corpus_token.cc
@@ -6,9 +6,9 @@ namespace ugdiss
{
id_type const&
SimpleWordId::
- id() const
- {
- return theID;
+ id() const
+ {
+ return theID;
}
int
diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.h b/moses/TranslationModel/UG/mm/ug_corpus_token.h
index c1baaf21e..b9693cbf2 100644
--- a/moses/TranslationModel/UG/mm/ug_corpus_token.h
+++ b/moses/TranslationModel/UG/mm/ug_corpus_token.h
@@ -19,7 +19,7 @@ namespace ugdiss
{
/** Simple wrapper around id_type for use with the Ttrack/TSA template classes */
- class SimpleWordId
+ class SimpleWordId
{
id_type theID;
public:
@@ -29,7 +29,7 @@ namespace ugdiss
bool operator==(SimpleWordId const& other) const;
id_type remap(vector<id_type const*> const& m) const;
};
-
+
/** Token class for suffix arrays */
template<typename T>
class
@@ -43,16 +43,16 @@ namespace ugdiss
L2R_Token const* next(int n=1) const { return this+n; }
- /** return a pointer to the end of a sentence; used as a stopping criterion during
+ /** return a pointer to the end of a sentence; used as a stopping criterion during
* comparison of suffixes; see Ttrack::cmp() */
template<typename TTRACK_TYPE>
- L2R_Token const* stop(TTRACK_TYPE const& C, id_type sid) const
- {
- return reinterpret_cast<L2R_Token<T> const*>(C.sntEnd(sid));
+ L2R_Token const* stop(TTRACK_TYPE const& C, id_type sid) const
+ {
+ return reinterpret_cast<L2R_Token<T> const*>(C.sntEnd(sid));
}
- L2R_Token const* stop(L2R_Token const* seqStart, L2R_Token const* seqEnd) const
- {
+ L2R_Token const* stop(L2R_Token const* seqStart, L2R_Token const* seqEnd) const
+ {
return seqEnd;
}
@@ -69,20 +69,20 @@ namespace ugdiss
{
public:
typedef T Token;
-
+
R2L_Token() : T() {};
R2L_Token(id_type id) : T(id) {};
R2L_Token const* next(int n = 1) const { return this - n; }
template<typename TTRACK_TYPE>
- R2L_Token const* stop(TTRACK_TYPE const& C, id_type sid) const
- {
- return reinterpret_cast<R2L_Token<T> const*>(C.sntStart(sid) - 1);
+ R2L_Token const* stop(TTRACK_TYPE const& C, id_type sid) const
+ {
+ return reinterpret_cast<R2L_Token<T> const*>(C.sntStart(sid) - 1);
}
- R2L_Token const* stop(R2L_Token const* seqStart, R2L_Token const* seqEnd) const
- {
+ R2L_Token const* stop(R2L_Token const* seqStart, R2L_Token const* seqEnd) const
+ {
assert(seqStart);
return seqStart - 1;
}
diff --git a/moses/TranslationModel/UG/mm/ug_deptree.cc b/moses/TranslationModel/UG/mm/ug_deptree.cc
index 545268e04..003d9b35e 100644
--- a/moses/TranslationModel/UG/mm/ug_deptree.cc
+++ b/moses/TranslationModel/UG/mm/ug_deptree.cc
@@ -7,14 +7,14 @@ using namespace std;
namespace ugdiss
{
- bool
+ bool
Conll_Record::
isDescendentOf(Conll_Record const* other) const
{
Conll_Record const* a = this;
- while (a != other && a->parent)
+ while (a != other && a->parent)
a += a->parent;
- return a==other;
+ return a==other;
}
Conll_Record&
@@ -43,7 +43,7 @@ namespace ugdiss
}
Conll_AllFields::
- Conll_AllFields()
+ Conll_AllFields()
: Conll_Record::Conll_Record()
{};
@@ -64,7 +64,7 @@ namespace ugdiss
}
Conll_WildCard::
- Conll_WildCard()
+ Conll_WildCard()
: Conll_Record::Conll_Record()
{};
@@ -95,8 +95,8 @@ namespace ugdiss
#if 0
Conll_Record::
- Conll_Record(string const& line,
- TokenIndex const& SF, TokenIndex const& LM,
+ Conll_Record(string const& line,
+ TokenIndex const& SF, TokenIndex const& LM,
TokenIndex const& PS, TokenIndex const& DT)
{
@@ -140,35 +140,35 @@ namespace ugdiss
#endif
Conll_Sform::
- Conll_Sform()
- : Conll_Record::Conll_Record()
+ Conll_Sform()
+ : Conll_Record::Conll_Record()
{};
Conll_MinPos::
- Conll_MinPos()
- : Conll_Record::Conll_Record()
+ Conll_MinPos()
+ : Conll_Record::Conll_Record()
{};
-
+
Conll_MinPos_Lemma::
- Conll_MinPos_Lemma()
- : Conll_Record::Conll_Record()
+ Conll_MinPos_Lemma()
+ : Conll_Record::Conll_Record()
{};
Conll_Lemma::
Conll_Lemma()
- : Conll_Record::Conll_Record()
+ : Conll_Record::Conll_Record()
{};
Conll_Lemma::
Conll_Lemma(id_type _id)
- : Conll_Record::Conll_Record()
+ : Conll_Record::Conll_Record()
{
this->lemma = _id;
};
Conll_MinPos::
Conll_MinPos(id_type _id)
- : Conll_Record::Conll_Record()
+ : Conll_Record::Conll_Record()
{
this->minpos = _id;
};
@@ -182,7 +182,7 @@ namespace ugdiss
Conll_MajPos::
Conll_MajPos(id_type _id)
- : Conll_Record::Conll_Record()
+ : Conll_Record::Conll_Record()
{
this->majpos = _id;
};
@@ -219,21 +219,21 @@ namespace ugdiss
Conll_MinPos_Lemma::
cmp(Conll_Record const& other) const
{
- if (this->minpos != 0 && other.minpos != 0 && this->minpos != other.minpos)
+ if (this->minpos != 0 && other.minpos != 0 && this->minpos != other.minpos)
return this->minpos < other.minpos ? -1 : 1;
if (this->lemma != 0 && other.lemma != 0 && this->lemma != other.lemma)
return this->lemma < other.lemma ? -1 : 1;
return 0;
}
- id_type
+ id_type
Conll_Lemma::
- id() const
- {
- return this->lemma;
+ id() const
+ {
+ return this->lemma;
}
- int
+ int
Conll_Lemma::
cmp(Conll_Record const& other) const
{
@@ -251,16 +251,16 @@ namespace ugdiss
Conll_Sform::
Conll_Sform(id_type _id)
- : Conll_Record::Conll_Record()
+ : Conll_Record::Conll_Record()
{
this->sform = _id;
};
- id_type
+ id_type
Conll_Sform
- ::id() const
- {
- return this->sform;
+ ::id() const
+ {
+ return this->sform;
}
int
@@ -282,7 +282,7 @@ namespace ugdiss
short p = w[i].rec->parent;
if (p != 0)
{
- if (p > 0) assert(i+p < w.size());
+ if (p > 0) assert(i+p < w.size());
else assert(i >= size_t(-p));
w[i].parent = &(w[i+p]);
w[i].parent->children.push_back(&(w[i]));
@@ -291,7 +291,7 @@ namespace ugdiss
}
#endif
- /** @return true if the linear sequence of /Conll_Record/s is coherent,
+ /** @return true if the linear sequence of /Conll_Record/s is coherent,
* i.e., a proper connected tree structure */
bool
isCoherent(Conll_Record const* const start, Conll_Record const* const stop)
@@ -300,16 +300,16 @@ namespace ugdiss
for (Conll_Record const* x = start; outOfRange <= 1 && x < stop; ++x)
{
Conll_Record const* n = x->up();
- if (!n || n < start || n >= stop)
+ if (!n || n < start || n >= stop)
outOfRange++;
}
return outOfRange<=1;
}
-
+
// this is for contigous word sequences extracted from longer sequences
// adjust parent pointers to 0 (no parent) if they point out of the
// subsequence
- void
+ void
fixParse(Conll_Record* start, Conll_Record* stop)
{
int len = stop-start;
diff --git a/moses/TranslationModel/UG/mm/ug_deptree.h b/moses/TranslationModel/UG/mm/ug_deptree.h
index 0d393aa33..b28a4bbe8 100644
--- a/moses/TranslationModel/UG/mm/ug_deptree.h
+++ b/moses/TranslationModel/UG/mm/ug_deptree.h
@@ -19,8 +19,8 @@ using namespace std;
namespace ugdiss
{
- // Fills the vector v with pointers to the internal root r_x for the
- // stretch [start,x] for all x: start <= x < stop. If the stretch
+ // Fills the vector v with pointers to the internal root r_x for the
+ // stretch [start,x] for all x: start <= x < stop. If the stretch
// is incoherent, r_x is NULL
template<typename T>
void
@@ -37,8 +37,8 @@ namespace ugdiss
{
size_t p = x-start;
root[p] = x+x->parent;
- for (size_t i = isR.find_first(); i < isR.size(); i = isR.find_next(i))
- if (root[i]==x)
+ for (size_t i = isR.find_first(); i < isR.size(); i = isR.find_next(i))
+ if (root[i]==x)
isR.reset(i);
if (root[p] < start || root[p] >= stop)
isR.set(x-start);
@@ -46,7 +46,7 @@ namespace ugdiss
}
}
- // return the root of the tree if the span [start,stop) constitutes a
+ // return the root of the tree if the span [start,stop) constitutes a
// tree, NULL otherwise
template<typename T>
T const*
@@ -66,7 +66,7 @@ namespace ugdiss
assert(outOfRange);
return outOfRange == 1 ? root : NULL;
}
-
+
// return the governor of the tree given by [start,stop) if the span
// constitutes a tree, NULL otherwise
template<typename T>
@@ -82,7 +82,7 @@ namespace ugdiss
{
if (root && n != root)
numRoots++;
- else
+ else
{
root = n;
if (!numRoots) numRoots++;
@@ -101,7 +101,7 @@ namespace ugdiss
T const* b = as<T>(&(*v.end()));
return (a==b) ? NULL : findInternalRoot<T>(a,b);
}
-
+
#if 1
class DTNode
{
@@ -113,7 +113,7 @@ namespace ugdiss
};
/** A parsed sentence */
- class
+ class
DependencyTree
{
public:
@@ -189,13 +189,13 @@ namespace ugdiss
int cmp(Conll_Record const& other) const;
};
- /** @return true if the linear sequence of /Conll_Record/s is coherent,
+ /** @return true if the linear sequence of /Conll_Record/s is coherent,
* i.e., a proper connected tree structure */
bool
isCoherent(Conll_Record const* start, Conll_Record const* const stop);
- /** @return the root node of the tree covering the span [start,stop), if the span is coherent;
+ /** @return the root node of the tree covering the span [start,stop), if the span is coherent;
* NULL otherwise */
template<typename T>
T const* topNode(T const* start , T const* stop)
@@ -204,9 +204,9 @@ namespace ugdiss
for (T const* x = start; x < stop; ++x)
{
T const* n = reinterpret_cast<T const*>(x->up());
- if (!n || n < start || n >= stop)
+ if (!n || n < start || n >= stop)
{
- if (ret) return NULL;
+ if (ret) return NULL;
else ret = x;
}
}
diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc
new file mode 100644
index 000000000..b411cc7dc
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc
@@ -0,0 +1,87 @@
+#include "ug_im_bitext.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+
+ template<>
+ sptr<imBitext<L2R_Token<SimpleWordId> > >
+ imBitext<L2R_Token<SimpleWordId> >::
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& aln) const
+ {
+ typedef L2R_Token<SimpleWordId> TKN;
+ assert(s1.size() == s2.size() && s1.size() == aln.size());
+
+#ifndef NDEBUG
+ size_t first_new_snt = this->T1 ? this->T1->size() : 0;
+#endif
+
+ sptr<imBitext<TKN> > ret;
+ {
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
+ ret.reset(new imBitext<TKN>(*this));
+ }
+
+ // we add the sentences in separate threads (so it's faster)
+ boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
+ // thread1.join(); // for debugging
+ boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
+ BOOST_FOREACH(string const& a, aln)
+ {
+ istringstream ibuf(a);
+ ostringstream obuf;
+ uint32_t row,col; char c;
+ while (ibuf >> row >> c >> col)
+ {
+ UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+ << "Error in alignment information:\n" << a);
+ binwrite(obuf,row);
+ binwrite(obuf,col);
+ }
+ // important: DO NOT replace the two lines below this comment by
+ // char const* x = obuf.str().c_str(), as the memory x is pointing
+ // to is freed immediately upon deconstruction of the string object.
+ string foo = obuf.str();
+ char const* x = foo.c_str();
+ vector<char> v(x,x+foo.size());
+ ret->myTx = append(ret->myTx, v);
+ }
+
+ thread1.join();
+ thread2.join();
+
+ ret->Tx = ret->myTx;
+ ret->T1 = ret->myT1;
+ ret->T2 = ret->myT2;
+ ret->I1 = ret->myI1;
+ ret->I2 = ret->myI2;
+
+#ifndef NDEBUG
+ // sanity check
+ for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
+ {
+ size_t slen1 = ret->T1->sntLen(i);
+ size_t slen2 = ret->T2->sntLen(i);
+ char const* p = ret->Tx->sntStart(i);
+ char const* q = ret->Tx->sntEnd(i);
+ size_t k;
+ while (p < q)
+ {
+ p = binread(p,k);
+ assert(p);
+ assert(p < q);
+ assert(k < slen1);
+ p = binread(p,k);
+ assert(p);
+ assert(k < slen2);
+ }
+ }
+#endif
+ return ret;
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h
new file mode 100644
index 000000000..63e44f1b9
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h
@@ -0,0 +1,130 @@
+// -*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+ template<typename TKN>
+ class imBitext : public Bitext<TKN>
+ {
+ sptr<imTtrack<char> > myTx;
+ sptr<imTtrack<TKN> > myT1;
+ sptr<imTtrack<TKN> > myT2;
+ sptr<imTSA<TKN> > myI1;
+ sptr<imTSA<TKN> > myI2;
+ static ThreadSafeCounter my_revision;
+ public:
+ size_t revision() const { return my_revision; }
+ void open(string const base, string const L1, string L2);
+ imBitext(sptr<TokenIndex> const& V1,
+ sptr<TokenIndex> const& V2,
+ size_t max_sample = 5000, size_t num_workers=4);
+ imBitext(size_t max_sample = 5000, size_t num_workers=4);
+ imBitext(imBitext const& other);
+
+ // sptr<imBitext<TKN> >
+ // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
+
+ sptr<imBitext<TKN> >
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& a) const;
+
+ };
+
+ template<typename TKN>
+ ThreadSafeCounter
+ imBitext<TKN>::my_revision;
+
+ template<typename TKN>
+ imBitext<TKN>::
+ imBitext(size_t max_sample, size_t num_workers)
+ : Bitext<TKN>(max_sample, num_workers)
+ {
+ this->m_default_sample_size = max_sample;
+ this->V1.reset(new TokenIndex());
+ this->V2.reset(new TokenIndex());
+ this->V1->setDynamic(true);
+ this->V2->setDynamic(true);
+ ++my_revision;
+ }
+
+ template<typename TKN>
+ imBitext<TKN>::
+ imBitext(sptr<TokenIndex> const& v1,
+ sptr<TokenIndex> const& v2,
+ size_t max_sample, size_t num_workers)
+ : Bitext<TKN>(max_sample, num_workers)
+ {
+ // this->default_sample_size = max_sample;
+ this->V1 = v1;
+ this->V2 = v2;
+ this->V1->setDynamic(true);
+ this->V2->setDynamic(true);
+ ++my_revision;
+ }
+
+
+ template<typename TKN>
+ imBitext<TKN>::
+ imBitext(imBitext<TKN> const& other)
+ {
+ this->myTx = other.myTx;
+ this->myT1 = other.myT1;
+ this->myT2 = other.myT2;
+ this->myI1 = other.myI1;
+ this->myI2 = other.myI2;
+ this->Tx = this->myTx;
+ this->T1 = this->myT1;
+ this->T2 = this->myT2;
+ this->I1 = this->myI1;
+ this->I2 = this->myI2;
+ this->V1 = other.V1;
+ this->V2 = other.V2;
+ this->m_default_sample_size = other.m_default_sample_size;
+ this->m_num_workers = other.m_num_workers;
+ ++my_revision;
+ }
+
+ template<>
+ sptr<imBitext<L2R_Token<SimpleWordId> > >
+ imBitext<L2R_Token<SimpleWordId> >::
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& aln) const;
+
+ template<typename TKN>
+ sptr<imBitext<TKN> >
+ imBitext<TKN>::
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& aln) const
+ {
+ throw "Not yet implemented";
+ }
+
+ // What's up with this function???? UG
+ template<typename TKN>
+ void
+ imBitext<TKN>::
+ open(string const base, string const L1, string L2)
+ {
+ mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
+ mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
+ mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
+ t1.open(base+L1+".mct");
+ t2.open(base+L2+".mct");
+ tx.open(base+L1+"-"+L2+".mam");
+ this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
+ this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
+ mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
+ mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
+ i1.open(base+L1+".sfa", this->T1);
+ i2.open(base+L2+".sfa", this->T2);
+ assert(this->T1->size() == this->T2->size());
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h
index f7256ba2d..e920d9f96 100644
--- a/moses/TranslationModel/UG/mm/ug_im_tsa.h
+++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h
@@ -23,7 +23,7 @@ namespace ugdiss
using namespace std;
using namespace boost;
namespace bio=boost::iostreams;
-
+
// template<typename TOKEN> class imBitext<TOKEN>;
//-----------------------------------------------------------------------
@@ -35,61 +35,61 @@ namespace ugdiss
public:
class tree_iterator;
friend class tree_iterator;
-
+
private:
vector<cpos> sufa; // stores the actual array
- vector<filepos_type> index; /* top-level index into regions in sufa
+ vector<filepos_type> index; /* top-level index into regions in sufa
* (for faster access) */
private:
- char const*
+ char const*
index_jump(char const* a, char const* z, float ratio) const;
- char const*
+ char const*
getLowerBound(id_type id) const;
- char const*
+ char const*
getUpperBound(id_type id) const;
-
+
public:
imTSA();
- imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c,
- bdBitset const* filt,
+ imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c,
+ bdBitset const* filt,
ostream* log = NULL);
- imTSA(imTSA<TOKEN> const& prior,
+ imTSA(imTSA<TOKEN> const& prior,
boost::shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize);
- count_type
- sntCnt(char const* p, char const * const q) const;
+ count_type
+ sntCnt(char const* p, char const * const q) const;
- count_type
+ count_type
rawCnt(char const* p, char const * const q) const;
-
- void
- getCounts(char const* p, char const * const q,
+
+ void
+ getCounts(char const* p, char const * const q,
count_type& sids, count_type& raw) const;
-
- char const*
+
+ char const*
readSid(char const* p, char const* q, id_type& sid) const;
-
- char const*
+
+ char const*
readSid(char const* p, char const* q, ::uint64_t& sid) const;
- char const*
+ char const*
readOffset(char const* p, char const* q, uint16_t& offset) const;
- char const*
+ char const*
readOffset(char const* p, char const* q, ::uint64_t& offset) const;
-
- void
+
+ void
sanityCheck() const;
-
- void
+
+ void
save_as_mm_tsa(string fname) const;
-
+
/// add a sentence to the database
- // shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const;
+ // shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const;
};
@@ -108,12 +108,12 @@ namespace ugdiss
tree_iterator(imTSA<TOKEN> const* s)
: TSA<TOKEN>::tree_iterator::tree_iterator(reinterpret_cast<TSA<TOKEN> const*>(s))
{};
-
+
/** jump to the point 1/ratio in a tightly packed index
* assumes that keys are flagged with '1', values with '0'
*/
template<typename TOKEN>
- char const*
+ char const*
imTSA<TOKEN>::
index_jump(char const* a, char const* z, float ratio) const
{
@@ -123,10 +123,10 @@ namespace ugdiss
cpos const* xz = reinterpret_cast<cpos const*>(z);
return reinterpret_cast<char const*>(xa+int(ratio*(xz-xa)));
}
-
+
template<typename TOKEN>
imTSA<TOKEN>::
- imTSA()
+ imTSA()
{
this->indexSize = 0;
// this->data = NULL;
@@ -135,7 +135,7 @@ namespace ugdiss
this->corpusSize = 0;
this->BitSetCachingThreshold=4096;
};
-
+
// build an array from all the tokens in the sentences in *c that are
// specified in filter
template<typename TOKEN>
@@ -153,12 +153,12 @@ namespace ugdiss
}
assert(filter);
// In the first iteration over the corpus, we obtain word counts.
- // They allows us to
+ // They allows us to
// a. allocate the exact amount of memory we need
- // b. place tokens into the right 'section' in the array, based on
+ // b. place tokens into the right 'section' in the array, based on
// the ID of the first token in the sequence. We can then sort
// each section separately.
-
+
if (log) *log << "counting tokens ... ";
int slimit = 65536;
// slimit=65536 is the upper bound of what we can fit into a ushort which
@@ -176,7 +176,7 @@ namespace ugdiss
vector<count_type> tmp(wcnt.size(),0);
for (size_t i = 1; i < wcnt.size(); ++i)
tmp[i] = tmp[i-1] + wcnt[i-1];
-
+
// Now dump all token positions into the right place in sufa
this->corpusSize = 0;
for (id_type sid = filter->find_first();
@@ -204,7 +204,7 @@ namespace ugdiss
for (size_t i = 0; i < wcnt.size(); i++)
{
if (log && wcnt[i] > 5000)
- *log << "sorting " << wcnt[i]
+ *log << "sorting " << wcnt[i]
<< " entries starting with id " << i << "." << endl;
index[i+1] = index[i]+wcnt[i];
assert(index[i+1]==tmp[i]); // sanity check
@@ -247,7 +247,7 @@ namespace ugdiss
imTSA<TOKEN>::
getUpperBound(id_type id) const
{
- if (++id >= this->index.size())
+ if (++id >= this->index.size())
return NULL;
assert(index[id] <= this->sufa.size());
return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
@@ -263,7 +263,7 @@ namespace ugdiss
sid = reinterpret_cast<cpos const*>(p)->sid;
return p;
}
-
+
template<typename TOKEN>
char const*
imTSA<TOKEN>::
@@ -306,11 +306,11 @@ namespace ugdiss
cpos const* xq = reinterpret_cast<cpos const*>(q);
return xq-xp;
}
-
+
template<typename TOKEN>
- void
+ void
imTSA<TOKEN>::
- getCounts(char const* p, char const* const q,
+ getCounts(char const* p, char const* const q,
count_type& sids, count_type& raw) const
{
id_type sid; // uint16_t off;
@@ -328,7 +328,7 @@ namespace ugdiss
}
template<typename TOKEN>
- void
+ void
imTSA<TOKEN>::
save_as_mm_tsa(string fname) const
{
@@ -352,34 +352,34 @@ namespace ugdiss
for (size_t i = 0; i < mmIndex.size(); i++)
numwrite(out,mmIndex[i]-mmIndex[0]);
out.seekp(0);
- numwrite(out,idxStart);
+ numwrite(out,idxStart);
out.close();
}
template<typename TOKEN>
imTSA<TOKEN>::
- imTSA(imTSA<TOKEN> const& prior,
+ imTSA(imTSA<TOKEN> const& prior,
boost::shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize)
{
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get());
-
+
// count how many tokens will be added to the TSA
// and index the new additions to the corpus
size_t newToks = 0;
- BOOST_FOREACH(id_type sid, newsids)
+ BOOST_FOREACH(id_type sid, newsids)
newToks += crp->sntLen(sid);
vector<cpos> nidx(newToks); // new array entries
-
+
size_t n = 0;
- BOOST_FOREACH(id_type sid, newsids)
+ BOOST_FOREACH(id_type sid, newsids)
{
assert(sid < crp->size());
for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
{ nidx[n].offset = o; nidx[n].sid = sid; }
}
sort(nidx.begin(),nidx.end(),sorter);
-
+
// create the new suffix array
this->numTokens = newToks + prior.sufa.size();
this->sufa.resize(this->numTokens);
@@ -388,10 +388,10 @@ namespace ugdiss
this->corpusSize = crp->size();
this->corpus = crp;
this->index.resize(vsize+1);
-
+
size_t i = 0;
typename vector<cpos>::iterator k = this->sufa.begin();
- // cerr << newToks << " new items at "
+ // cerr << newToks << " new items at "
// << __FILE__ << ":" << __LINE__ << endl;
for (size_t n = 0; n < nidx.size();)
{
@@ -402,7 +402,7 @@ namespace ugdiss
this->index[i] = k - this->sufa.begin();
if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
{
- k = copy(prior.sufa.begin() + prior.index[i-1],
+ k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
}
}
@@ -410,13 +410,13 @@ namespace ugdiss
if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
{
size_t j = prior.index[i-1];
- while (j < prior.index[i] && n < nidx.size()
+ while (j < prior.index[i] && n < nidx.size()
&& crp->getToken(nidx[n])->id() < i)
{
assert(k < this->sufa.end());
if (sorter(prior.sufa[j],nidx[n]))
*k++ = prior.sufa[j++];
- else
+ else
*k++ = nidx[n++];
}
while (j < prior.index[i])
@@ -436,7 +436,7 @@ namespace ugdiss
while (++i < this->index.size())
{
if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
- k = copy(prior.sufa.begin() + prior.index[i-1],
+ k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
this->index[i] = k - this->sufa.begin();
}
@@ -462,5 +462,5 @@ namespace ugdiss
}
}
-
+
#endif
diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
index ac49ebcd4..20ab653f4 100644
--- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
@@ -1,6 +1,6 @@
// -*- c++ -*-
// In-memory corpus track
-// (c) 2006-2012 Ulrich Germann.
+// (c) 2006-2012 Ulrich Germann.
#ifndef __ug_im_ttrack
#define __ug_im_ttrack
@@ -36,20 +36,20 @@ namespace ugdiss
template<typename Token> class imTtrack;
template<typename TOKEN>
- typename boost::shared_ptr<imTtrack<TOKEN> >
+ typename boost::shared_ptr<imTtrack<TOKEN> >
append(typename boost::shared_ptr<imTtrack<TOKEN> > const & crp, vector<TOKEN> const & snt);
template<typename Token>
class imTtrack : public Ttrack<Token>
{
-
+
private:
size_t numToks;
boost::shared_ptr<vector<vector<Token> > > myData; // pointer to corpus data
friend class imTSA<Token>;
- friend
- typename boost::shared_ptr<imTtrack<Token> >
+ friend
+ typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
void m_check_token_count(); // debugging function
@@ -60,14 +60,14 @@ namespace ugdiss
imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL);
imTtrack(size_t reserve = 0);
// imTtrack(istream& in, Vocab& V);
-
+
/** return pointer to beginning of sentence */
- Token const* sntStart(size_t sid) const;
+ Token const* sntStart(size_t sid) const;
/** return pointer to beginning of sentence */
- Token const* sntEnd(size_t sid) const;
+ Token const* sntEnd(size_t sid) const;
- size_t size() const;
+ size_t size() const;
size_t numTokens() const;
id_type findSid(Token const* t) const;
@@ -82,16 +82,16 @@ namespace ugdiss
size_t check = 0;
BOOST_FOREACH(vector<Token> const& s, *myData)
check += s.size();
- UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]"
+ UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]"
<< " Wrong token count after appending sentence!"
- << " Counted " << check << " but expected "
- << this->numToks << " in a total of " << myData->size()
+ << " Counted " << check << " but expected "
+ << this->numToks << " in a total of " << myData->size()
<< " sentences.");
-
+
}
template<typename Token>
- Token const*
+ Token const*
imTtrack<Token>::
sntStart(size_t sid) const // return pointer to beginning of sentence
{
@@ -99,9 +99,9 @@ namespace ugdiss
if ((*myData)[sid].size() == 0) return NULL;
return &((*myData)[sid].front());
}
-
+
template<typename Token>
- Token const*
+ Token const*
imTtrack<Token>::
sntEnd(size_t sid) const // return pointer to end of sentence
{
@@ -109,9 +109,9 @@ namespace ugdiss
if ((*myData)[sid].size() == 0) return NULL;
return &(*myData)[sid].back()+1;
}
-
+
template<typename Token>
- size_t
+ size_t
imTtrack<Token>::
size() const // return size of corpus (in number of sentences)
{
@@ -120,15 +120,15 @@ namespace ugdiss
// offset in the myIndex than there are sentences
return myData->size();
}
-
+
template<typename Token>
- size_t
+ size_t
imTtrack<Token>::
numTokens() const // return size of corpus (in number of words)
{
return numToks;
}
-
+
template<typename Token>
imTtrack<Token>::
imTtrack(istream& in, TokenIndex const& V, ostream* log)
@@ -140,19 +140,19 @@ namespace ugdiss
boost::unordered_map<string,id_type> H;
for (id_type i = 0; i < V.knownVocabSize(); ++i)
H[V[i]] = i;
- while (getline(in,line))
+ while (getline(in,line))
{
myData->push_back(vector<Token>());
- if (log && ++linectr%1000000==0)
+ if (log && ++linectr%1000000==0)
*log << linectr/1000000 << "M lines of input processed" << endl;
istringstream buf(line);
- while (buf>>w)
+ while (buf>>w)
myData->back().push_back(Token(H[w]));
myData->back().resize(myData.back().size());
numToks += myData->back().size();
}
}
-
+
template<typename Token>
imTtrack<Token>::
imTtrack(size_t reserve)
@@ -171,7 +171,7 @@ namespace ugdiss
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
-
+
template<typename Token>
id_type
imTtrack<Token>::
@@ -182,7 +182,7 @@ namespace ugdiss
{
vector<Token> const& v = (*myData)[i];
if (v.size() == 0) continue;
- if (&v.front() <= t && &v.back() >= t)
+ if (&v.front() <= t && &v.back() >= t)
break;
}
return i;
@@ -190,7 +190,7 @@ namespace ugdiss
/// add a sentence to the database
template<typename TOKEN>
- boost::shared_ptr<imTtrack<TOKEN> >
+ boost::shared_ptr<imTtrack<TOKEN> >
append(boost::shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
#if 1
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h
index 53628e3b3..742e0dd4e 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h
@@ -15,14 +15,14 @@ using namespace std;
namespace ugdiss
{
- template<typename TKN>
- class
+ template<typename TKN>
+ class
LexicalPhraseScorer1
{
typedef boost::unordered_map<id_type, float> inner_map_t;
vector<inner_map_t> L1_given_L2;
vector<inner_map_t> L2_given_L1;
- void load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2,
+ void load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2,
vector<inner_map_t> & lex);
public:
void open(string const& bname, string const& L1, string const& L2,
@@ -34,14 +34,14 @@ namespace ugdiss
TKN const* snt2, size_t const s2, size_t const e2,
char const* const aln_start, char const* const aln_end,
float & fwd_score, float& bwd_score);
- float permissive_lookup(vector<inner_map_t> const& lex,
+ float permissive_lookup(vector<inner_map_t> const& lex,
id_type const s, id_type const t) const;
};
-
+
template<typename TKN>
void
LexicalPhraseScorer1<TKN>::
- load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2,
+ load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2,
vector<inner_map_t> & lex)
{
boost::iostreams::filtering_istream in;
@@ -52,20 +52,20 @@ namespace ugdiss
while (in >> w1 >> w2 >> p)
{
id_type id1 = V1[w1];
- while (lex.size() <= id1)
+ while (lex.size() <= id1)
lex.push_back(inner_map_t());
lex[id1][V2[w2]] = p;
}
}
-
+
template<typename TKN>
void
LexicalPhraseScorer1<TKN>::
open(string const& bname, string const& L1, string const& L2,
TokenIndex & V1, TokenIndex & V2)
{
- string lex1 = bname+L1+"-"+L2+"."+L1+"-given-"+L2+".lex.gz";
- string lex2 = bname+L1+"-"+L2+"."+L2+"-given-"+L1+".lex.gz";
+ string lex1 = bname+L1+"-"+L2+"."+L1+"-given-"+L2+".lex.gz";
+ string lex2 = bname+L1+"-"+L2+"."+L2+"-given-"+L1+".lex.gz";
cout << lex1 << endl;
cout << lex2 << endl;
load_lex(lex1,V1,V2,L1_given_L2);
@@ -86,9 +86,9 @@ namespace ugdiss
{
i1 = aln[k]; i2 = aln[++k];
if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
- p1[i1] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id());
+ p1[i1] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id());
++c1[i1];
- p2[i2] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id());
+ p2[i2] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id());
++c2[i2];
}
fwd_score = 0;
@@ -110,7 +110,7 @@ namespace ugdiss
template<typename TKN>
float
LexicalPhraseScorer1<TKN>::
- permissive_lookup(vector<inner_map_t> const& lex,
+ permissive_lookup(vector<inner_map_t> const& lex,
id_type const s, id_type const t) const
{
if (s >= lex.size()) return 1.0;
@@ -135,9 +135,9 @@ namespace ugdiss
// assert(snt1[i2].id() < L1_given_L2.size());
// assert(snt2[i2].id() < L2_given_L1.size());
if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
- p1[i1] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id());
+ p1[i1] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id());
++c1[i1];
- p2[i2] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id());
+ p2[i2] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id());
++c2[i2];
}
fwd_score = 0;
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
index b7e359223..fdd0366df 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
@@ -18,8 +18,8 @@ using namespace std;
namespace ugdiss
{
- template<typename TKN>
- class
+ template<typename TKN>
+ class
LexicalPhraseScorer2
{
vector<string> ftag;
@@ -28,28 +28,28 @@ namespace ugdiss
table_t COOC;
void open(string const& fname);
template<typename someint>
- void
+ void
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
vector<someint> const & aln, float const alpha,
float & fwd_score, float& bwd_score) const;
- void
+ void
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
char const* const aln_start, char const* const aln_end,
float const alpha, float & fwd_score, float& bwd_score) const;
// plup: permissive lookup
- float plup_fwd(id_type const s,id_type const t, float const alpha) const;
+ float plup_fwd(id_type const s,id_type const t, float const alpha) const;
float plup_bwd(id_type const s,id_type const t, float const alpha) const;
- // to be done:
- // - on-the-fly smoothing ?
- // - better (than permissive-lookup) treatment of unknown combinations
+ // to be done:
+ // - on-the-fly smoothing ?
+ // - better (than permissive-lookup) treatment of unknown combinations
// permissive lookup is currently used for compatibility reasons
// - zens-ney smoothed scoring via noisy-or combination
};
-
+
template<typename TKN>
void
LexicalPhraseScorer2<TKN>::
@@ -64,7 +64,7 @@ namespace ugdiss
LexicalPhraseScorer2<TKN>::
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
- vector<someint> const & aln, float const alpha,
+ vector<someint> const & aln, float const alpha,
float & fwd_score, float& bwd_score) const
{
vector<float> p1(e1,0), p2(e2,0);
@@ -74,9 +74,9 @@ namespace ugdiss
{
i1 = aln[k]; i2 = aln[++k];
if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
- p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha);
+ p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha);
++c1[i1];
- p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha);
+ p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha);
++c2[i2];
}
fwd_score = 0;
@@ -105,19 +105,19 @@ namespace ugdiss
<< ": alpha parameter must be >= 0");
float ret = COOC[s][t]+alpha;
ret = (ret?ret:1.)/(COOC.m1(s)+alpha);
- UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
<< ": result not > 0 and <= 1. alpha = " << alpha << "; "
<< COOC[s][t] << "/" << COOC.m1(s));
#if 0
- cerr << "[" << s << "," << t << "] "
- << COOC.m1(s) << "/"
- << COOC[s][t] << "/"
+ cerr << "[" << s << "," << t << "] "
+ << COOC.m1(s) << "/"
+ << COOC[s][t] << "/"
<< COOC.m2(t) << endl;
#endif
return ret;
}
-
+
template<typename TKN>
float
LexicalPhraseScorer2<TKN>::
@@ -128,11 +128,11 @@ namespace ugdiss
<< ": alpha parameter must be >= 0");
float ret = float(COOC[s][t]+alpha);
ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
- UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
<< ": result not > 0 and <= 1.");
return ret;
}
-
+
template<typename TKN>
void
LexicalPhraseScorer2<TKN>::
@@ -148,9 +148,9 @@ namespace ugdiss
{
x = binread(binread(x,i1),i2);
if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
- p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha);
+ p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha);
++c1[i1];
- p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id(),alpha);
+ p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id(),alpha);
++c2[i2];
}
fwd_score = 0;
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc
index 706c042c0..d0522c528 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc
+++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc
@@ -10,26 +10,26 @@ namespace Moses
// bounds LFT and RGT and update the actual bounds L and R; update
// the total count of alignment links in the underlying phrase
// pair
- bool
+ bool
check(vector<ushort> const& v, // alignment row/column
size_t const LFT, size_t const RGT, // hard limits
ushort& L, ushort& R, size_t& count) // current bounds, count
{
if (v.size() == 0) return 0;
- if (L > v.front() && (L=v.front()) < LFT) return false;
+ if (L > v.front() && (L=v.front()) < LFT) return false;
if (R < v.back() && (R=v.back()) > RGT) return false;
count += v.size();
return true;
}
-
+
/// return number of alignment points in box, -1 on failure
- int
+ int
expand_block(vector<vector<ushort> > const& row2col,
vector<vector<ushort> > const& col2row,
size_t row, size_t col, // seed coordinates
- size_t const TOP, size_t const LFT, // hard limits
- size_t const BOT, size_t const RGT, // hard limits
- ushort* top = NULL, ushort* lft = NULL,
+ size_t const TOP, size_t const LFT, // hard limits
+ size_t const BOT, size_t const RGT, // hard limits
+ ushort* top = NULL, ushort* lft = NULL,
ushort* bot = NULL, ushort* rgt = NULL) // store results
{
if (row < TOP || row > BOT || col < LFT || col > RGT) return -1;
@@ -37,7 +37,7 @@ namespace Moses
UTIL_THROW_IF2(col >= col2row.size(), "out of bounds");
// ====================================================
- // tables grow downwards, so TOP is smaller than BOT!
+ // tables grow downwards, so TOP is smaller than BOT!
// ====================================================
ushort T, L, B, R; // box dimensions
@@ -45,7 +45,7 @@ namespace Moses
// if we start on an empty cell, search for the first alignment point
if (row2col[row].size() == 0 && col2row[col].size() == 0)
{
- if (row == TOP) while (row < BOT && !row2col[++row].size());
+ if (row == TOP) while (row < BOT && !row2col[++row].size());
else if (row == BOT) while (row > TOP && !row2col[--row].size());
if (col == LFT) while (col < RGT && !col2row[++col].size());
@@ -54,7 +54,7 @@ namespace Moses
if (row2col[row].size() == 0 && col2row[col].size() == 0)
return 0;
}
- if (row2col[row].size() == 0)
+ if (row2col[row].size() == 0)
row = col2row[col].front();
if (col2row[col].size() == 0)
col = row2col[row].front();
@@ -65,9 +65,9 @@ namespace Moses
if ((R = row2col[row].back()) > RGT) return -1;
if (B == T && R == L) return 1;
-
+
// start/end of row / column coverage:
- ushort rs = row, re = row, cs = col, ce = col;
+ ushort rs = row, re = row, cs = col, ce = col;
int ret = row2col[row].size();
for (size_t tmp = 1; tmp; ret += tmp)
{
@@ -126,11 +126,24 @@ namespace Moses
T = x = e1; B = a1.size()-1;
if (expand_block(a1,a2,x,y,T,L,B,R) >= 0)
return Moses::LRModel::S;
- while (s2 && a2[s2].size() == 0) --s2;
- if (a2[s2].size() == 0) return po_other;
- if (a2[s2].back() < s1) return Moses::LRModel::DR;
- if (a2[s2].front() >= e1) return Moses::LRModel::DL;
- return po_other;
+ while (s2-- && a2[s2].size() == 0);
+
+ Moses::LRModel::ReorderingType ret;
+ ret = (a2[s2].size() == 0 ? po_other :
+ a2[s2].back() < s1 ? Moses::LRModel::DR :
+ a2[s2].front() >= e1 ? Moses::LRModel::DL :
+ po_other);
+#if 0
+ cout << "s1=" << s1 << endl;
+ cout << "s2=" << s2x << "=>" << s2 << endl;
+ cout << "e1=" << e1 << endl;
+ cout << "e2=" << e2 << endl;
+ cout << "a2[s2].size()=" << a2[s2].size() << endl;
+ cout << "a2[s2].back()=" << a2[s2].back() << endl;
+ cout << "a2[s2].front()=" << a2[s2].front() << endl;
+ cout << "RETURNING " << ret << endl;
+#endif
+ return ret;
}
} // namespace bitext
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h
index d432ea37e..9004b757e 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h
@@ -7,13 +7,13 @@ namespace Moses { namespace bitext {
typedef Moses::LRModel::ReorderingType PhraseOrientation;
-PhraseOrientation
+PhraseOrientation
find_po_fwd(std::vector<std::vector<ushort> >& a1,
std::vector<std::vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2);
-PhraseOrientation
+PhraseOrientation
find_po_bwd(std::vector<std::vector<ushort> >& a1,
std::vector<std::vector<ushort> >& a2,
size_t b1, size_t e1,
@@ -21,5 +21,5 @@ find_po_bwd(std::vector<std::vector<ushort> >& a1,
-
+
}} // close namespaces
diff --git a/moses/TranslationModel/UG/mm/ug_load_primer.h b/moses/TranslationModel/UG/mm/ug_load_primer.h
index 1cd167a68..961c45da1 100644
--- a/moses/TranslationModel/UG/mm/ug_load_primer.h
+++ b/moses/TranslationModel/UG/mm/ug_load_primer.h
@@ -1,7 +1,7 @@
//-*- c++ -*-
#pragma once
#include <boost/iostreams/device/mapped_file.hpp>
-//
+//
namespace Moses
{
class FastLoader
@@ -14,5 +14,5 @@ namespace Moses
void prime(boost::iostreams::mapped_file_source const& f);
-
+
};
diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h
index d1c9a9767..0000b194f 100644
--- a/moses/TranslationModel/UG/mm/ug_lru_cache.h
+++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h
@@ -30,25 +30,25 @@ namespace lru_cache
// timeval tstamp; // time stamp
typename boost::shared_ptr<VAL> ptr; // cached shared ptr
};
-
+
mutable boost::shared_mutex m_lock;
uint32_t m_qfront, m_qback;
- vector<Record> m_recs;
+ vector<Record> m_recs;
map_t m_idx;
- void
+ void
update_queue(KEY const& key, uint32_t const p)
{
// CALLER MUST LOCK!
- // "remove" item in slot p from it's current position of the
- // queue (which is different from the slot position) and move it
+ // "remove" item in slot p from it's current position of the
+ // queue (which is different from the slot position) and move it
// to the end
Record& r = m_recs[p];
if (m_recs.size() == 1)
r.next = r.prev = m_qback = m_qfront = 0;
-
+
if (r.key != key || p == m_qback) return;
-
+
if (m_qfront == p)
m_qfront = m_recs[r.next].prev = r.next;
else
@@ -65,8 +65,8 @@ namespace lru_cache
size_t capacity() const { return m_recs.capacity(); }
void reserve(size_t s) { m_recs.reserve(s); }
- sptr<VAL>
- get(KEY const& key)
+ sptr<VAL>
+ get(KEY const& key)
{
uint32_t p;
{ // brackets needed for lock scoping
@@ -86,13 +86,13 @@ namespace lru_cache
boost::lock_guard<boost::shared_mutex> lock(m_lock);
pair<typename map_t::iterator,bool> foo;
foo = m_idx.insert(make_pair(key,m_recs.size()));
-
+
uint32_t p = foo.first->second;
if (foo.second) // was not in the cache
{
if (m_recs.size() < m_recs.capacity())
m_recs.push_back(Record());
- else
+ else
{
foo.first->second = p = m_qfront;
m_idx.erase(m_recs[p].key);
diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
index cfc86b8fc..2455ca603 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
@@ -24,12 +24,12 @@ namespace ugdiss
ID id;
VAL val;
- bool
+ bool
operator<(ID const otherId) const
{
return id < otherId;
}
-
+
bool
operator<(Cell const& other) const
{
@@ -60,14 +60,14 @@ namespace ugdiss
ID numCols;
boost::shared_ptr<bio::mapped_file_source> file;
- VAL m1(ID key) const
- {
- return (key < numRows) ? M1[key] : INIT(0);
+ VAL m1(ID key) const
+ {
+ return (key < numRows) ? M1[key] : INIT(0);
}
VAL m2(ID key) const
{
- return (key < numCols) ? M2[key] : INIT(0);
+ return (key < numCols) ? M2[key] : INIT(0);
}
@@ -106,7 +106,7 @@ namespace ugdiss
Cell const* c = lower_bound(start,stop,key);
return (c != stop && c->id == key ? c->val : INIT(0));
}
-
+
template<typename OFFSET, typename ID, typename VAL, typename INIT>
void
mm2dTable<OFFSET,ID,VAL,INIT>::
@@ -140,10 +140,10 @@ namespace ugdiss
// cout << numRows << " rows; " << numCols << " columns " << endl;
M1 = reinterpret_cast<VAL const*>(index+numRows+1);
M2 = M1+numRows;
- // cout << "Table " << fname << " has " << numRows << " rows and "
+ // cout << "Table " << fname << " has " << numRows << " rows and "
// << numCols << " columns." << endl;
- // cout << "File size is " << file.size()*1024 << " bytes; ";
- // cout << "M2 starts " << (reinterpret_cast<char const*>(M2) - file.data())
+ // cout << "File size is " << file.size()*1024 << " bytes; ";
+ // cout << "M2 starts " << (reinterpret_cast<char const*>(M2) - file.data())
// << " bytes into the file" << endl;
// cout << M2[0] << endl;
}
@@ -156,8 +156,8 @@ namespace ugdiss
typename ICONT // inner container type
>
void
- write_mm_2d_table(ostream& out, vector<ICONT> const& T,
- vector<VAL> const* m1 = NULL,
+ write_mm_2d_table(ostream& out, vector<ICONT> const& T,
+ vector<VAL> const* m1 = NULL,
vector<VAL> const* m2 = NULL)
{
assert(T.size());
@@ -223,7 +223,7 @@ namespace ugdiss
OFFSET o = index[i]; // (index[i]-index[0])/sizeof(VAL);
out.write(reinterpret_cast<char*>(&o),sizeof(OFFSET));
}
-
+
// write marginals
out.write(reinterpret_cast<char const*>(&(*m1)[0]),m1->size()*sizeof(VAL));
out.write(reinterpret_cast<char const*>(&(*m2)[0]),m2->size()*sizeof(VAL));
diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h
new file mode 100644
index 000000000..be3fdfce8
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h
@@ -0,0 +1,82 @@
+// -*- c++ -*-
+// don't include this file directly! it is included by ug_bitext.h
+
+namespace Moses
+{
+ namespace bitext
+ {
+ template<typename TKN>
+ class mmBitext : public Bitext<TKN>
+ {
+ void load_document_map(string const& fname);
+ public:
+ void open(string const base, string const L1, string L2);
+ mmBitext();
+ };
+
+ template<typename TKN>
+ mmBitext<TKN>::
+ mmBitext()
+ : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(),
+ new TokenIndex(), new TokenIndex(),
+ new mmTSA<TKN>(), new mmTSA<TKN>())
+ {};
+
+ template<typename TKN>
+ void
+ mmBitext<TKN>::
+ load_document_map(string const& fname)
+ {
+ ifstream docmap(fname.c_str());
+ // the docmap file should list the documents in the corpus
+ // in the order in which they appear with one line per document:
+ // <docname> <number of lines / sentences>
+ //
+ // in the future, we might also allow listing documents with
+ // sentence ranges.
+ string buffer,docname; size_t a=0,b;
+ this->m_sid2docid.reset(new vector<id_type>(this->T1->size()));
+ while(getline(docmap,buffer))
+ {
+ istringstream line(buffer);
+ if (!(line>>docname)) continue; // empty line
+ if (docname.size() && docname[0] == '#') continue; // comment
+ size_t docid = this->m_docname2docid.size();
+ this->m_docname2docid[docname] = docid;
+ this->m_docname.push_back(docname);
+ line >> b;
+ VERBOSE(1, "DOCUMENT MAP " << docname
+ << " " << a << "-" << b+a << endl);
+ for (b += a; a < b; ++a)
+ (*this->m_sid2docid)[a] = docid;
+ }
+ UTIL_THROW_IF2(b != this->T1->size(),
+ "Document map doesn't match corpus!");
+ }
+
+ template<typename TKN>
+ void
+ mmBitext<TKN>::
+ open(string const base, string const L1, string L2)
+ {
+ mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
+ mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
+ mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
+ t1.open(base+L1+".mct");
+ t2.open(base+L2+".mct");
+ tx.open(base+L1+"-"+L2+".mam");
+ this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
+ this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
+ mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
+ mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
+ i1.open(base+L1+".sfa", this->T1);
+ i2.open(base+L2+".sfa", this->T2);
+ assert(this->T1->size() == this->T2->size());
+
+ string docmapfile = base+"dmp";
+ if (!access(docmapfile.c_str(),F_OK))
+ load_document_map(docmapfile);
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_mm_tsa.h b/moses/TranslationModel/UG/mm/ug_mm_tsa.h
index 9d5038e26..ff2d4c693 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_tsa.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_tsa.h
@@ -40,7 +40,7 @@ namespace ugdiss
char const* index_jump(char const* a, char const* z, float ratio) const;
char const* getLowerBound(id_type t) const;
char const* getUpperBound(id_type t) const;
-
+
public:
mmTSA();
mmTSA(string fname, Ttrack<TOKEN> const* c);
@@ -53,24 +53,24 @@ namespace ugdiss
rawCnt(char const* p, char const * const q) const;
void
- getCounts(char const* p, char const * const q,
+ getCounts(char const* p, char const * const q,
count_type& sids, count_type& raw) const;
- char const*
+ char const*
readSid(char const* p, char const* q, id_type& sid) const;
- char const*
+ char const*
readSid(char const* p, char const* q, ::uint64_t& sid) const;
- char const*
+ char const*
readOffset(char const* p, char const* q, uint16_t& offset) const;
- char const*
+ char const*
readOffset(char const* p, char const* q, ::uint64_t& offset) const;
void sanityCheck() const;
- };
+ };
// ======================================================================
@@ -78,13 +78,13 @@ namespace ugdiss
* assumes that keys are flagged with '1', values with '0'
*/
template<typename TOKEN>
- char const*
+ char const*
mmTSA<TOKEN>::
index_jump(char const* a, char const* z, float ratio) const
{
assert(ratio >= 0 && ratio < 1);
char const* m = a+int(ratio*(z-a));
- if (m > a)
+ if (m > a)
{
while (m > a && *m < 0) --m;
while (m > a && *m >= 0) --m;
@@ -98,7 +98,7 @@ namespace ugdiss
template<typename TOKEN>
mmTSA<TOKEN>::
- mmTSA()
+ mmTSA()
{
this->startArray = NULL;
this->endArray = NULL;
@@ -136,9 +136,9 @@ namespace ugdiss
filepos_type idxOffset;
p = numread(p,idxOffset);
p = numread(p,this->indexSize);
-
+
// cerr << fname << ": " << idxOffset << " " << this->indexSize << endl;
-
+
this->startArray = p;
this->index = reinterpret_cast<filepos_type const*>(file.data()+idxOffset);
this->endArray = reinterpret_cast<char const*>(index);
@@ -153,7 +153,7 @@ namespace ugdiss
mmTSA<TOKEN>::
getLowerBound(id_type id) const
{
- if (id >= this->indexSize)
+ if (id >= this->indexSize)
return NULL;
return this->startArray + this->index[id];
}
@@ -165,7 +165,7 @@ namespace ugdiss
mmTSA<TOKEN>::
getUpperBound(id_type id) const
{
- if (id >= this->indexSize)
+ if (id >= this->indexSize)
return NULL;
// if (index[id] == index[id+1])
// return NULL;
@@ -232,13 +232,13 @@ namespace ugdiss
}
return ret;
}
-
+
// ======================================================================
template<typename TOKEN>
- void
+ void
mmTSA<TOKEN>::
- getCounts(char const* p, char const* const q,
+ getCounts(char const* p, char const* const q,
count_type& sids, count_type& raw) const
{
raw = 0;
diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
index 51ba21778..bfee14e3e 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
@@ -26,7 +26,7 @@ namespace ugdiss
{
using namespace std;
namespace bio=boost::iostreams;
-
+
template<typename TKN=id_type>
class mmTtrack : public Ttrack<TKN>
{
@@ -38,21 +38,21 @@ namespace ugdiss
id_type numWords;
bio::mapped_file_source file;
Token const* data; // pointer to first word of first sentence
- id_type const* index; /* pointer to index (change data type for corpora
+ id_type const* index; /* pointer to index (change data type for corpora
* of more than four billion words)
*/
public:
mmTtrack(string fname);
mmTtrack();
- // return pointer to beginning of sentence
- Token const* sntStart(size_t sid) const;
+ // return pointer to beginning of sentence
+ Token const* sntStart(size_t sid) const;
- // return pointer to end of sentence
- Token const* sntEnd(size_t sid) const;
+ // return pointer to end of sentence
+ Token const* sntEnd(size_t sid) const;
// return size of corpus (in number of sentences)
- size_t size() const;
+ size_t size() const;
// return size of corpus (in number of sentences)
size_t numTokens() const;
@@ -60,23 +60,23 @@ namespace ugdiss
// open an mmTtrack file
void open(string fname);
- // FUNCTIONS FOR BUILDING CORPUS TRACKS
- // write a blank file header at the beginning of a new ttrack file
+ // FUNCTIONS FOR BUILDING CORPUS TRACKS
+ // write a blank file header at the beginning of a new ttrack file
void write_blank_file_header(ostream& out) const;
// write the sentence index /idx/ and fill the file header
- void write_index_and_finalize(ostream& out,
+ void write_index_and_finalize(ostream& out,
vector<id_type> const& idx,
count_type tokenCount) const;
// copy a contiguous sequence of sentences to another stream
// return the number of tokens copied
id_type copySentences(ostream& trg, id_type start, id_type stop) const;
-
+
/** find the sentence id of a given token */
- id_type findSid(TKN const* t) const;
+ id_type findSid(TKN const* t) const;
- id_type findSid(id_type tokenOffset) const;
+ id_type findSid(id_type tokenOffset) const;
/// re-assign ids based on the id maps in /f/
void remap(string const fname, vector<id_type const*> const & f) const;
@@ -88,7 +88,7 @@ namespace ugdiss
void
mmTtrack<TKN>::
remap(string const fname, vector<id_type const*> const & f) const
- {
+ {
bio::mapped_file myfile(fname);
assert(myfile.is_open());
Moses::prime(myfile);
@@ -110,7 +110,7 @@ namespace ugdiss
mmTtrack<TKN>::
size() const
{
- return this->numSent;
+ return this->numSent;
}
template<typename TKN>
@@ -118,17 +118,17 @@ namespace ugdiss
mmTtrack<TKN>::
numTokens() const
{
- return this->numWords;
+ return this->numWords;
}
template<typename TKN>
- TKN const*
+ TKN const*
mmTtrack<TKN>::
sntStart(size_t sid) const // return pointer to beginning of sentence
{
if (sid >= this->numSent)
{
- cerr << "Fatal error: requested sentence #"<<sid<<" is beyond corpus size ("
+ cerr << "Fatal error: requested sentence #"<<sid<<" is beyond corpus size ("
<< this->numSent <<")" << endl;
}
assert(sid < this->numSent);
@@ -136,14 +136,14 @@ namespace ugdiss
}
template<typename TKN>
- TKN const*
+ TKN const*
mmTtrack<TKN>::
sntEnd(size_t sid) const // return pointer to end of sentence
{
assert(sid < this->numSent);
return data+index[sid+1];
}
-
+
template<typename TKN>
mmTtrack<TKN>::
mmTtrack()
@@ -161,7 +161,7 @@ namespace ugdiss
}
template<typename TKN>
- void
+ void
mmTtrack<TKN>::
open(string fname)
{
@@ -235,7 +235,7 @@ namespace ugdiss
}
template<typename TKN>
- id_type
+ id_type
mmTtrack<TKN>::
copySentences(ostream& trg, id_type start, id_type stop) const
{
diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc
index 8f1a4aa12..34e3f1b1e 100644
--- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc
@@ -21,7 +21,7 @@
// ++this->in_progress;
// this->lock.unlock();
// }
-
+
// void
// pstats::
// release()
@@ -52,7 +52,7 @@
// mmbitext()
// : ag(NULL)
// {
-
+
// }
// bool
@@ -78,13 +78,13 @@
// {
// if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); }
// else { p = binread(p,src); assert(p<x); p = binread(p,trg); }
-// if (src < start || src >= stop)
+// if (src < start || src >= stop)
// forbidden.set(trg);
// else
// {
// lft = min(lft,trg);
// rgt = max(rgt,trg);
-// if (core_alignment)
+// if (core_alignment)
// {
// if (flip) aln[trg].push_back(src);
// else aln[src].push_back(trg);
@@ -101,16 +101,16 @@
// }
// cout << endl;
// #endif
-
+
// for (size_t i = lft; i <= rgt; ++i)
-// if (forbidden[i])
+// if (forbidden[i])
// return false;
-
+
// s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
// e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
-
+
// if (lft > rgt) return false;
-// if (core_alignment)
+// if (core_alignment)
// {
// core_alignment->clear();
// if (flip)
@@ -147,11 +147,11 @@
// prep2(phrase);
// }
-// sptr<mmbitext::pstats>
+// sptr<mmbitext::pstats>
// mmbitext::
// prep2(iter const& phrase)
// {
-// if (!ag)
+// if (!ag)
// {
// ag = new agenda(*this);
// ag->add_workers(20);
@@ -197,11 +197,11 @@
// continue;
// }
-// stats->lock.lock();
-// stats->good += 1;
+// stats->lock.lock();
+// stats->good += 1;
// stats->lock.unlock();
-// for (size_t k = 0; k < aln.size(); k += 2)
+// for (size_t k = 0; k < aln.size(); k += 2)
// aln[k] += s2 - s1;
// Token const* o = (fwd ? ag.bitext.T2 : ag.bitext.T1).sntStart(sid);
// float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
@@ -215,14 +215,14 @@
// stats->add(b,sample_weight,aln);
// if (i < e2) assert(b.extend(o[i].id()));
// }
-// if (fwd && s < s2)
-// for (size_t k = 0; k < aln.size(); k += 2)
+// if (fwd && s < s2)
+// for (size_t k = 0; k < aln.size(); k += 2)
// --aln[k];
// }
// stats->release();
// }
// }
-
+
// void
// mmbitext::
// pstats::
@@ -239,7 +239,7 @@
// agenda(mmbitext const& thebitext)
// : shutdown(false), doomed(0), bitext(thebitext)
// {
-
+
// }
// mmbitext::
@@ -259,13 +259,13 @@
// {
// if (ag) delete ag;
// }
-
+
// sptr<mmbitext::pstats>
// mmbitext::
// agenda::
// add_job(mmbitext::iter const& phrase, size_t const max_samples)
// {
-// static boost::posix_time::time_duration nodelay(0,0,0,0);
+// static boost::posix_time::time_duration nodelay(0,0,0,0);
// job j;
// j.stats.reset(new mmbitext::pstats());
@@ -296,11 +296,11 @@
// bool
// mmbitext::
// agenda::
-// get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
+// get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
// bool & fwd, sptr<mmbitext::pstats> & stats)
// {
// boost::unique_lock<boost::mutex> lock(this->lock);
-// if (this->doomed || this->shutdown)
+// if (this->doomed || this->shutdown)
// {
// if (this->doomed) --this->doomed;
// return false;
@@ -309,7 +309,7 @@
// // {
// // cerr << "no jobs" << endl;
// // this->ready.wait(lock);
-// // if (this->doomed || this->shutdown)
+// // if (this->doomed || this->shutdown)
// // {
// // if (this->doomed) --this->doomed;
// // return false;
@@ -345,8 +345,8 @@
// {
// boost::lock_guard<boost::mutex> lock(stats->lock);
// if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-// size_t rnum = randInt(stats->raw_cnt - ctr++);
-// // cout << stats->raw_cnt << " " << ctr-1 << " "
+// size_t rnum = util::rand_excl(stats->raw_cnt - ctr++);
+// // cout << stats->raw_cnt << " " << ctr-1 << " "
// // << rnum << " " << max_samples - stats->good << endl;
// if (rnum < max_samples - stats->good)
// {
@@ -364,7 +364,7 @@
// agenda::
// add_workers(int n)
// {
-// static boost::posix_time::time_duration nodelay(0,0,0,0);
+// static boost::posix_time::time_duration nodelay(0,0,0,0);
// boost::lock_guard<boost::mutex> lock(this->lock);
// // house keeping: remove all workers that have finished
// for (size_t i = 0; i < workers.size(); )
@@ -377,7 +377,7 @@
// }
// else ++i;
// }
-// if (n < 0)
+// if (n < 0)
// {
// this->doomed -= n;
// }
@@ -394,8 +394,8 @@
// mmbitext::
// jstats::
// jstats()
-// {
-// my_aln.reserve(1);
+// {
+// my_aln.reserve(1);
// }
// mmbitext::
@@ -406,8 +406,8 @@
// my_wcnt = other.wcnt();
// my_aln = other.aln();
// }
-
-// void
+
+// void
// mmbitext::
// jstats::
// add(float w, vector<uchar> const& a)
@@ -419,7 +419,7 @@
// {
// size_t i = 0;
// while (i < my_aln.size() && my_aln[i].second != a) ++i;
-// if (i == my_aln.size())
+// if (i == my_aln.size())
// my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
// else
// my_aln[i].first++;
@@ -431,7 +431,7 @@
// uint32_t
// mmbitext::
// jstats::
-// rcnt() const
+// rcnt() const
// { return my_rcnt; }
// float
@@ -443,7 +443,7 @@
// vector<pair<size_t, vector<uchar> > > const&
// mmbitext::
// jstats::
-// aln() const
+// aln() const
// { return my_aln; }
// }
diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.h b/moses/TranslationModel/UG/mm/ug_mmbitext.h
index e7378e7f6..3837abc59 100644
--- a/moses/TranslationModel/UG/mm/ug_mmbitext.h
+++ b/moses/TranslationModel/UG/mm/ug_mmbitext.h
@@ -4,10 +4,10 @@
// Written by Ulrich Germann
// things we can do to speed up things:
-// - set up threads at startup time that force the
+// - set up threads at startup time that force the
// data in to memory sequentially
//
-// - use multiple agendas for better load balancing and to avoid
+// - use multiple agendas for better load balancing and to avoid
// competition for locks
#include <string>
@@ -46,8 +46,8 @@ namespace Moses {
class jstats; // phrase pair ("joint") statistics
class agenda
{
- boost::mutex lock;
- boost::condition_variable ready;
+ boost::mutex lock;
+ boost::condition_variable ready;
class job;
class worker;
list<job> joblist;
@@ -59,9 +59,9 @@ namespace Moses {
agenda(mmbitext const& bitext);
~agenda();
void add_workers(int n);
- sptr<pstats> add_job(mmbitext::iter const& phrase,
+ sptr<pstats> add_job(mmbitext::iter const& phrase,
size_t const max_samples);
- bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
+ bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
bool & fwd, sptr<mmbitext::pstats> & stats);
};
@@ -72,22 +72,22 @@ namespace Moses {
mmTtrack<char> Tx; // word alignments
mmTtrack<Token> T1,T2; // token tracks
TokenIndex V1,V2; // vocabs
- mmTSA<Token> I1,I2; // suffix arrays
+ mmTSA<Token> I1,I2; // suffix arrays
/// given the source phrase sid[start:stop]
- // find the possible start (s1 .. s2) and end (e1 .. e2)
+ // find the possible start (s1 .. s2) and end (e1 .. e2)
// points of the target phrase; if non-NULL, store word
- // alignments in *core_alignment. If /flip/, source phrase is
+ // alignments in *core_alignment. If /flip/, source phrase is
// L2.
- bool
+ bool
find_trg_phr_bounds
- (size_t const sid, size_t const start, size_t const stop,
- size_t & s1, size_t & s2, size_t & e1, size_t & e2,
+ (size_t const sid, size_t const start, size_t const stop,
+ size_t & s1, size_t & s2, size_t & e1, size_t & e2,
vector<uchar> * core_alignment, bool const flip) const;
boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
private:
- sptr<pstats>
+ sptr<pstats>
prep2(iter const& phrase);
public:
mmbitext();
@@ -105,8 +105,8 @@ namespace Moses {
jstats
{
uint32_t my_rcnt; // unweighted count
- float my_wcnt; // weighted count
- vector<pair<size_t, vector<uchar> > > my_aln;
+ float my_wcnt; // weighted count
+ vector<pair<size_t, vector<uchar> > > my_aln;
boost::mutex lock;
public:
jstats();
@@ -117,7 +117,7 @@ namespace Moses {
void add(float w, vector<uchar> const& a);
};
- // struct
+ // struct
// mmbitext:
// phrasepair
// {
@@ -125,32 +125,32 @@ namespace Moses {
// size_t len;
// size_t cnt;
// float fwd, bwd;
-
+
// map<uint32_t,uint32_t> aln;
// string toString(TokenIndex const& V) const;
// bool operator<(phrase const& other) const;
// bool operator>(phrase const& other) const;
// phrase(pair<pair<Token const*, size_t>,jstats> const & foo);
-
+
// };
- struct
+ struct
mmbitext::
pstats
{
boost::mutex lock; // for parallel gathering of stats
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
- size_t raw_cnt; // (approximate) raw occurrence count
+ size_t raw_cnt; // (approximate) raw occurrence count
size_t sample_cnt; // number of instances selected during sampling
size_t good; // number of selected instances with valid word alignments
size_t sum_pairs;
- // size_t snt_cnt;
+ // size_t snt_cnt;
// size_t sample_snt;
size_t in_progress; // keeps track of how many threads are currently working on this
boost::unordered_map<uint64_t, jstats> trg;
- pstats();
+ pstats();
// vector<phrase> nbest;
// void select_nbest(size_t const N=10);
void release();
@@ -167,7 +167,7 @@ namespace Moses {
public:
worker(agenda& a);
void operator()();
-
+
};
class
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
index 6373f8468..d533dafa3 100644
--- a/moses/TranslationModel/UG/mm/ug_phrasepair.cc
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
@@ -1,97 +1,44 @@
#include "ug_phrasepair.h"
-namespace Moses {
- namespace bitext
- {
-#if 0
- void
- PhrasePair::
- init()
+namespace Moses {
+namespace bitext {
+
+void
+fill_lr_vec2
+( LRModel::ModelType mdl, float const* const cnt,
+ float const total, float* v)
+{
+ if (mdl == LRModel::Monotonic)
{
- p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ float denom = log(total + 2);
+ v[LRModel::M] = log(cnt[LRModel::M] + 1.) - denom;
+ v[LRModel::NM] = log(total - v[LRModel::M] + 1) - denom;
}
-
- void
- PhrasePair::
- init(uint64_t const pid1,
- pstats const& ps1,
- pstats const& ps2,
- size_t const numfeats)
+ else if (mdl == LRModel::LeftRight)
{
- p1 = pid1;
- raw1 = ps1.raw_cnt + ps2.raw_cnt;
- sample1 = ps1.sample_cnt + ps2.sample_cnt;
- sample2 = 0;
- good1 = ps1.good + ps2.good;
- good2 = 0;
- joint = 0;
- fvals.resize(numfeats);
+ float denom = log(total + 2);
+ v[LRModel::R] = log(cnt[LRModel::M] + cnt[LRModel::DR] + 1.) - denom;
+ v[LRModel::L] = log(cnt[LRModel::S] + cnt[LRModel::DL] + 1.) - denom;
}
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js1, jstats const& js2)
+ else if (mdl == LRModel::MSD)
{
- p2 = pid2;
- raw2 = js1.cnt2() + js2.cnt2();
- joint = js1.rcnt() + js2.rcnt();
- assert(js1.aln().size() || js2.aln().size());
- if (js1.aln().size())
- aln = js1.aln()[0].second;
- else if (js2.aln().size())
- aln = js2.aln()[0].second;
- for (int i = po_first; i < po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
- dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
- }
- return *this;
+ float denom = log(total + 3);
+ v[LRModel::M] = log(cnt[LRModel::M] + 1) - denom;
+ v[LRModel::S] = log(cnt[LRModel::S] + 1) - denom;
+ v[LRModel::D] = log(cnt[LRModel::DR] +
+ cnt[LRModel::DL] + 1) - denom;
}
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, size_t r2)
- {
- p2 = pid2;
- raw2 = r2;
- joint = 0;
- return *this;
- }
-
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2,
- size_t const raw2extra,
- jstats const& js)
+ else if (mdl == LRModel::MSLR)
{
- p2 = pid2;
- raw2 = js.cnt2() + raw2extra;
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
- dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
- }
- return *this;
+ float denom = log(total + 4);
+ v[LRModel::M] = log(cnt[LRModel::M] + 1) - denom;
+ v[LRModel::S] = log(cnt[LRModel::S] + 1) - denom;
+ v[LRModel::DL] = log(cnt[LRModel::DL] + 1) - denom;
+ v[LRModel::DR] = log(cnt[LRModel::DR] + 1) - denom;
}
+ else UTIL_THROW2("Reordering type not recognized!");
+}
- float
- PhrasePair::
- eval(vector<float> const& w)
- {
- assert(w.size() == this->fvals.size());
- this->score = 0;
- for (size_t i = 0; i < w.size(); ++i)
- this->score += w[i] * this->fvals[i];
- return this->score;
- }
-#endif
- } // namespace bitext
-} // namespace Moses
+} // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
index a966d00dc..53a9f761c 100644
--- a/moses/TranslationModel/UG/mm/ug_phrasepair.h
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -1,13 +1,335 @@
-//-*- c++ -*-
+// -*- c++ -*-
#pragma once
+#include <vector>
+#include "ug_typedefs.h"
+#include "ug_bitext_pstats.h"
+#include "moses/FF/LexicalReordering/LexicalReorderingState.h"
+#include "boost/format.hpp"
+#include "tpt_tokenindex.h"
+namespace Moses
+{
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PhrasePair
+ {
+ public:
+ class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
+ Token const* start1;
+ Token const* start2;
+ uint32_t len1;
+ uint32_t len2;
+ uint64_t p1, p2;
+ uint32_t raw1, raw2, sample1, sample2, good1, good2, joint;
+ std::vector<float> fvals;
+ float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs?
+ float dbwd[Moses::LRModel::NONE+1]; // distortion counts
+ std::vector<uchar> aln;
+ float score;
+ bool inverse;
+ std::vector<uint32_t> indoc;
+ PhrasePair() { };
+ PhrasePair(PhrasePair const& o);
-// using namespace ugdiss;
-// using namespace std;
+ PhrasePair const& operator+=(PhrasePair const& other);
-// namespace Moses {
-// namespace bitext
-// {
+ bool operator<(PhrasePair const& other) const;
+ bool operator>(PhrasePair const& other) const;
+ bool operator<=(PhrasePair const& other) const;
+ bool operator>=(PhrasePair const& other) const;
+ void init();
+ void init(uint64_t const pid1, bool is_inverse,
+ Token const* x, uint32_t const len,
+ pstats const* ps = NULL, size_t const numfeats=0);
-// } // namespace bitext
-// } // namespace Moses
+ PhrasePair const&
+ update(uint64_t const pid2, Token const* x,
+ uint32_t const len, jstats const& js);
+
+ void
+ fill_lr_vec(LRModel::Direction const& dir,
+ LRModel::ModelType const& mdl,
+ vector<float>& v) const;
+ void
+ print(ostream& out, TokenIndex const& V1, TokenIndex const& V2,
+ LRModel const& LR) const;
+
+ class SortByTargetIdSeq
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
+
+ class SortDescendingByJointCount
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
+ };
+
+ template<typename Token>
+ void PhrasePair<Token>
+ ::init(uint64_t const pid1, bool is_inverse,
+ Token const* x, uint32_t const len,
+ pstats const* ps, size_t const numfeats)
+ {
+ inverse = is_inverse;
+ start1 = x; len1 = len;
+ p1 = pid1;
+ p2 = 0;
+ if (ps)
+ {
+ raw1 = ps->raw_cnt;
+ sample1 = ps->sample_cnt;
+ good1 = ps->good;
+ }
+ else raw1 = sample1 = good1 = 0;
+ joint = 0;
+ good2 = 0;
+ sample2 = 0;
+ raw2 = 0;
+ fvals.resize(numfeats);
+ }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>
+ ::update(uint64_t const pid2,
+ Token const* x, uint32_t const len, jstats const& js)
+ {
+ p2 = pid2;
+ start2 = x; len2 = len;
+ raw2 = js.cnt2();
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ // float total_fwd = 0, total_bwd = 0;
+ // for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ // {
+ // PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ // total_fwd += js.dcnt_fwd(po)+1;
+ // total_bwd += js.dcnt_bwd(po)+1;
+ // }
+
+ // should we do that here or leave the raw counts?
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = js.dcnt_fwd(po);
+ dbwd[i] = js.dcnt_bwd(po);
+ }
+
+ indoc = js.indoc;
+ return *this;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator<(PhrasePair const& other) const
+ {
+ return this->score < other.score;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator>(PhrasePair const& other) const
+ {
+ return this->score > other.score;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator<=(PhrasePair const& other) const
+ {
+ return this->score <= other.score;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator>=(PhrasePair const& other) const
+ {
+ return this->score >= other.score;
+ }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>
+ ::operator+=(PhrasePair const& o)
+ {
+ raw1 += o.raw1;
+ raw2 += o.raw2;
+ good1 += o.good1;
+ good2 += o.good2;
+ joint += o.joint;
+ sample1 += o.sample1;
+ sample2 += o.sample2;
+ // todo: add distortion counts
+ return *this;
+ }
+
+ template<typename Token>
+ PhrasePair<Token>
+ ::PhrasePair(PhrasePair<Token> const& o)
+ : start1(o.start1) , start2(o.start2)
+ , len1(o.len1) , len2(o.len2)
+ , p1(o.p1) , p2(o.p2)
+ , raw1(o.raw1) , raw2(o.raw2)
+ , sample1(o.sample1) , sample2(o.sample2)
+ , good1(o.good1) , good2(o.good2)
+ , joint(o.joint)
+ , fvals(o.fvals)
+ , aln(o.aln)
+ , score(o.score)
+ , inverse(o.inverse)
+ , indoc(o.indoc)
+ {
+ for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+ {
+ dfwd[i] = o.dfwd[i];
+ dbwd[i] = o.dbwd[i];
+ }
+ }
+
+ template<typename Token>
+ int PhrasePair<Token>
+ ::SortByTargetIdSeq
+ ::cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+ size_t i = 0;
+ Token const* x = a.start2;
+ Token const* y = b.start2;
+ while (i < a.len2 && i < b.len2 && x->id() == y->id())
+ {
+ x = x->next();
+ y = y->next();
+ ++i;
+ }
+ if (i == a.len2 && i == b.len2) return 0;
+ if (i == a.len2) return -1;
+ if (i == b.len2) return 1;
+ return x->id() < y->id() ? -1 : 1;
+ }
+
+ template<typename Token>
+ bool PhrasePair<Token>
+ ::SortByTargetIdSeq
+ ::operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+ return this->cmp(a,b) < 0;
+ }
+
+ template<typename Token>
+ int PhrasePair<Token>
+ ::SortDescendingByJointCount
+ ::cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+ if (a.joint == b.joint) return 0;
+ return a.joint > b.joint ? -1 : 1;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::SortDescendingByJointCount
+ ::operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+ return this->cmp(a,b) < 0;
+ }
+
+ template<typename Token>
+ void
+ PhrasePair<Token>
+ ::init()
+ {
+ inverse = false;
+ len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ start1 = start2 = NULL;
+ p1 = p2 = 0;
+ }
+
+
+ void
+ fill_lr_vec2(LRModel::ModelType mdl, float const* const cnt,
+ float const total, float* v);
+
+ template<typename Token>
+ void
+ PhrasePair<Token>
+ ::fill_lr_vec(LRModel::Direction const& dir,
+ LRModel::ModelType const& mdl,
+ vector<float>& v) const
+ {
+ // how many distinct scores do we have?
+ size_t num_scores = (mdl == LRModel::MSLR ? 4 : mdl == LRModel::MSD ? 3 : 2);
+ size_t offset;
+ if (dir == LRModel::Bidirectional)
+ {
+ offset = num_scores;
+ num_scores *= 2;
+ }
+ else offset = 0;
+
+ v.resize(num_scores);
+
+ // determine the denominator
+ float total = 0;
+ for (size_t i = 0; i <= LRModel::NONE; ++i)
+ total += dfwd[i];
+
+ if (dir != LRModel::Forward) // i.e., Backward or Bidirectional
+ fill_lr_vec2(mdl, dbwd, total, &v[0]);
+ if (dir != LRModel::Backward) // i.e., Forward or Bidirectional
+ fill_lr_vec2(mdl, dfwd, total, &v[offset]);
+ }
+
+
+ template<typename Token>
+ void
+ PhrasePair<Token>
+ ::print(ostream& out, TokenIndex const& V1, TokenIndex const& V2,
+ LRModel const& LR) const
+ {
+ out << toString (V1, this->start1, this->len1) << " ::: "
+ << toString (V2, this->start2, this->len2) << " "
+ << this->joint << " [";
+ for (size_t i = 0; i < this->indoc.size(); ++i)
+ {
+ if (i) out << " ";
+ out << this->indoc[i];
+ }
+ out << "] [";
+ vector<float> lrscores;
+ this->fill_lr_vec(LR.GetDirection(), LR.GetModelType(), lrscores);
+ for (size_t i = 0; i < lrscores.size(); ++i)
+ {
+ if (i) out << " ";
+ out << boost::format("%.2f") % exp(lrscores[i]);
+ }
+ out << "]" << endl;
+#if 0
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ // PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ if (i) *log << " ";
+ *log << p.dfwd[i];
+ }
+ *log << "] [";
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ // PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ if (i) *log << " ";
+ *log << p.dbwd[i];
+ }
+#endif
+ }
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc
new file mode 100644
index 000000000..95b93ec7b
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc
@@ -0,0 +1,178 @@
+#include "ug_sampling_bias.h"
+#include <iostream>
+#include <boost/foreach.hpp>
+
+#ifdef HAVE_CURLPP
+#include <curlpp/Options.hpp>
+#include <curlpp/cURLpp.hpp>
+#include <curlpp/Easy.hpp>
+#endif
+
+namespace Moses
+{
+ namespace bitext
+ {
+ using ugdiss::id_type;
+
+#ifdef HAVE_CURLPP
+ std::string
+ query_bias_server(std::string const& url, std::string const& text)
+ {
+ // communicate with the bias server; resuts will be in ...
+ std::ostringstream os;
+ curlpp::Easy myRequest;
+ std::string query = url+curlpp::escape(text);
+ myRequest.setOpt(new curlpp::options::Url(query));
+ curlpp::options::WriteStream ws(&os);
+ myRequest.setOpt(ws); // Give it to your request
+ myRequest.perform(); // This will output to os
+ return os.str();
+ }
+#endif
+
+ DocumentBias
+ ::DocumentBias
+ ( std::vector<id_type> const& sid2doc,
+ std::map<std::string,id_type> const& docname2docid,
+ std::string const& server_url, std::string const& text,
+ std::ostream* log)
+ : m_sid2docid(sid2doc)
+ , m_bias(docname2docid.size(), 0)
+ {
+#ifdef HAVE_CURLPP
+ std::string json = query_bias_server(server_url, text);
+ init_from_json(json, docname2docid, log);
+#endif
+ }
+
+ void
+ DocumentBias
+ ::init_from_json
+ ( std::string const& json, std::map<std::string,id_type> const& docname2docid,
+ std::ostream* log)
+ { // poor man's special purpose json parser for responses from the
+ // MMT bias server
+
+ std::string d; float total = 0; std::map<std::string,float> bias;
+ size_t i = 0; while (i < json.size() && json[i] != '"') ++i;
+ while (++i < json.size())
+ {
+ size_t k = i; while (i < json.size() && json[i] != '"') ++i;
+ if (i >= json.size()) break;
+ float& f = bias[json.substr(k,i-k)];
+ while (++i < json.size() && json[i] != ':');
+ k = ++i;
+ while (++i < json.size() && json[i] != ',' && json[i] != '}');
+ total += (f = atof(json.substr(k, i-k).c_str()));
+ k = ++i; while (i < json.size() && json[i] != '"') ++i;
+ }
+
+ typedef std::pair<std::string const,float> item;
+ if (total) { BOOST_FOREACH(item& x, bias) { x.second /= total; } }
+ if (log)
+ {
+ BOOST_FOREACH(item& x, bias)
+ {
+ std::map<std::string,id_type>::const_iterator m;
+ m = docname2docid.find(x.first);
+ int docid = m != docname2docid.end() ? m->second : -1;
+ *log << "CONTEXT SERVER RESPONSE "
+ << "[" << docid << "] "
+ << x.first << " " << x.second << std::endl;
+ }
+ }
+ init(bias, docname2docid);
+
+ // using xmlrpc_parse_json didn't always work (parser errors)
+ // xmlrpc_value* b = xmlrpc_parse_json(env ,buf.str().c_str());
+ // std::cerr << "|" << buf.str() << "|" << std::endl;
+ // // if (b == NULL) std::cerr << "OOpS" << std::endl;
+ // xmlrpc_c::value_struct v(b); // = *b;
+ // std::map<std::string, xmlrpc_c::value> const
+ // bmap = static_cast<map<std::string, xmlrpc_c::value> >(v);
+ // std::map<std::string, float> bias;
+ // typedef std::map<std::string, xmlrpc_c::value>::value_type item;
+ // float total = 0;
+ // BOOST_FOREACH(item const& x, bmap)
+ // {
+ // total += bias[x.first] = xmlrpc_c::value_double(x.second);
+ // }
+ // typedef std::map<std::string, float>::value_type fitem;
+ // BOOST_FOREACH(fitem const& x, bias)
+ // std::cerr << x.first << " " << x.second/total << std::endl;
+ // // delete b;
+ }
+
+ void
+ DocumentBias
+ ::init(std::map<std::string,float> const& biasmap,
+ std::map<std::string,id_type> const& docname2docid)
+ {
+ typedef std::map<std::string, id_type>::value_type doc_record;
+ float total = 0;
+ BOOST_FOREACH(doc_record const& d, docname2docid)
+ {
+ std::map<std::string, float>::const_iterator m = biasmap.find(d.first);
+ if (m != biasmap.end()) total += (m_bias[d.second] = m->second);
+ }
+ if (total) { BOOST_FOREACH(float& f, m_bias) f /= total; }
+ BOOST_FOREACH(doc_record const& d, docname2docid)
+ std::cerr << "BIAS " << d.first << " " << m_bias[d.second] << std::endl;
+ }
+
+ id_type
+ DocumentBias
+ ::GetClass(id_type const idx) const
+ {
+ return m_sid2docid.at(idx);
+ }
+
+ float
+ DocumentBias
+ ::operator[](id_type const idx) const
+ {
+ UTIL_THROW_IF2(idx >= m_sid2docid.size(),
+ "Out of bounds: " << idx << "/" << m_sid2docid.size());
+ return m_bias[m_sid2docid[idx]];
+ }
+
+ size_t
+ DocumentBias
+ ::size() const
+ { return m_sid2docid.size(); }
+
+
+
+ SentenceBias
+ ::SentenceBias(std::vector<float> const& bias)
+ : m_bias(bias) { }
+
+ SentenceBias
+ ::SentenceBias(size_t const s) : m_bias(s) { }
+
+ id_type
+ SentenceBias
+ ::GetClass(id_type idx) const { return idx; }
+
+ float&
+ SentenceBias
+ ::operator[](id_type const idx)
+ {
+ UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds");
+ return m_bias[idx];
+ }
+
+ float
+ SentenceBias
+ ::operator[](id_type const idx) const
+ {
+ UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds");
+ return m_bias[idx];
+ }
+
+ size_t
+ SentenceBias
+ ::size() const { return m_bias.size(); }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h
new file mode 100644
index 000000000..f540ddc76
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h
@@ -0,0 +1,86 @@
+// -*- c++ -*-
+#pragma once
+
+#include <map>
+#include<vector>
+#include <string>
+#include <iostream>
+#include "moses/Util.h"
+#include "ug_typedefs.h"
+namespace Moses
+{
+ namespace bitext
+ {
+ using ugdiss::id_type;
+
+ std::string query_bias_server(std::string const& url, std::string const& text);
+
+ class SamplingBias
+ {
+ public:
+ int loglevel;
+ std::ostream* log;
+ virtual float
+ operator[](id_type const ID) const = 0;
+ // returns (unnormalized bias) for the class of item ID
+
+ virtual size_t size() const = 0;
+ // number of classes
+
+ virtual id_type
+ GetClass(id_type const ID) const = 0;
+ // returns class of item ID
+ };
+
+ class
+ DocumentBias : public SamplingBias
+ {
+ std::vector<id_type> const& m_sid2docid;
+ std::vector<float> m_bias;
+
+ public:
+
+ DocumentBias(std::vector<id_type> const& sid2doc,
+ std::map<std::string,id_type> const& docname2docid,
+ std::string const& server_url, std::string const& text,
+ std::ostream* log);
+
+ void
+ init_from_json
+ ( std::string const& json,
+ std::map<std::string,id_type> const& docname2docid,
+ std::ostream* log );
+
+ void
+ init
+ ( std::map<std::string,float> const& biasmap,
+ std::map<std::string,id_type> const& docname2docid);
+
+ id_type
+ GetClass(id_type const idx) const;
+
+ float
+ operator[](id_type const idx) const;
+
+ size_t
+ size() const;
+ };
+
+ class
+ SentenceBias : public SamplingBias
+ {
+ std::vector<float> m_bias;
+ public:
+ SentenceBias(std::vector<float> const& bias);
+ SentenceBias(size_t const s);
+
+ id_type GetClass(id_type idx) const;
+
+ float& operator[](id_type const idx);
+ float operator[](id_type const idx) const;
+ size_t size() const;
+
+ };
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
index fc4b9f0ad..3af929644 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
@@ -1,13 +1,13 @@
// -*- c++ -*-
// (c) 2007-2010 Ulrich Germann
// implementation of stuff related to ArrayEntries
-// this file should only be included via ug_tsa_base.h,
+// this file should only be included via ug_tsa_base.h,
// never by itself
#ifndef __ug_tsa_array_entry_h
#define __ug_tsa_array_entry_h
#include "ug_ttrack_position.h"
-namespace ugdiss
+namespace ugdiss
{
namespace tsa
{
@@ -20,7 +20,7 @@ namespace ugdiss
ArrayEntry();
ArrayEntry(char const* p);
-
+
template<typename TSA_TYPE>
ArrayEntry(TSA_TYPE const* S, char const* p);
@@ -34,7 +34,7 @@ namespace ugdiss
}
// template<typename TSA_TYPE>
- // class SamplingArrayEntryIterator
+ // class SamplingArrayEntryIterator
// : public tsa::ArrayEntry
// {
// size_t const N; // (approximate) total number of occurrences
@@ -46,7 +46,7 @@ namespace ugdiss
// public:
// SamplingArrayEntryIterator(TSA_TYPE::tree_iterator const& m, size_t const s);
// bool step(); // returns false when at end of range
- // bool done(); //
+ // bool done(); //
// };
// template<typename TSA_TYPE>
@@ -60,7 +60,7 @@ namespace ugdiss
// , root(m.root)
// , stop(m.upper_bound(-1))
// { }
-
+
// template<typename TSA_TYPE>
// bool
// SamplingArrayEntryIterator::
@@ -69,7 +69,7 @@ namespace ugdiss
// while (chosen < samplesize && next < stop)
// {
// root->readEntry(next,*this);
- // if (randInt(N - sampled++) < samplesize - chosen)
+ // if (util::rand_excl(N - sampled++) < samplesize - chosen)
// {
// ++chosen;
// return true;
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h
index 83593c79c..8a4117910 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_base.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h
@@ -26,7 +26,7 @@ namespace ugdiss
namespace bio=boost::iostreams;
template<typename TKN>
- TKN const*
+ TKN const*
next(TKN const* x)
{
return static_cast<TKN const*>(x ? x->next() : NULL);
@@ -42,20 +42,20 @@ namespace ugdiss
* ordering of sequences. Both are decleared/defined in
* ug_corpus_token.{h|cc}
*/
- template<typename TKN>
- class TSA
+ template<typename TKN>
+ class TSA
{
public:
virtual ~TSA() {};
- typedef TSA_tree_iterator<TKN> tree_iterator;
+ typedef TSA_tree_iterator<TKN> tree_iterator;
// allows iteration over the array as if it were a trie
- typedef tsa::ArrayEntry ArrayEntry;
+ typedef tsa::ArrayEntry ArrayEntry;
/* an entry in the array, for iteration over all occurrences of a
* particular sequence */
- // typedef boost::dynamic_bitset<uint64_t> bitset;
+ // typedef boost::dynamic_bitset<uint64_t> bitset;
typedef boost::shared_ptr<bitvector> bitset_pointer;
typedef TKN Token;
- typedef BitSetCache<TSA<TKN> > BSC_t;
+ typedef BitSetCache<TSA<TKN> > BSC_t;
/* to allow caching of bit vectors that are expensive to create on
* the fly */
@@ -67,7 +67,7 @@ namespace ugdiss
char const* endArray; // ... and end ...
// of memory block storing the actual TSA
- size_t corpusSize;
+ size_t corpusSize;
/** size of the corpus (in number of sentences) of the corpus
* underlying the sequence array.
*
@@ -76,37 +76,37 @@ namespace ugdiss
* suffix array is based on a subset
* of the sentences of /corpus/.
*/
-
- id_type numTokens;
+
+ id_type numTokens;
/** size of the corpus (in number of tokens) of the corpus underlying the
- * sequence array.
+ * sequence array.
*
* ATTENTION: This number may differ from corpus->numTokens(), namely when
- * the suffix array is based on a subset of the sentences of
+ * the suffix array is based on a subset of the sentences of
* /corpus/.
*/
- id_type indexSize;
- // (number of entries +1) in the index of root-level nodes
+ id_type indexSize;
+ // (number of entries +1) in the index of root-level nodes
size_t BitSetCachingThreshold;
-
+
////////////////////////////////////////////////////////////////
// private member functions:
- /** @return an index position approximately /fraction/ between
+ /** @return an index position approximately /fraction/ between
* /startRange/ and /endRange/.
- */
- virtual
- char const*
- index_jump(char const* startRange,
- char const* stopRange,
+ */
+ virtual
+ char const*
+ index_jump(char const* startRange,
+ char const* stopRange,
float fraction) const = 0;
-
- /** return the index position of the first item that
+
+ /** return the index position of the first item that
* is equal to or includes [refStart,refStart+refLen) as a prefix
*/
- char const*
+ char const*
find_start(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const;
@@ -114,19 +114,19 @@ namespace ugdiss
/** return the index position of the first item that is greater than
* [refStart,refStart+refLen) and does not include it as a prefix
*/
- char const*
+ char const*
find_end(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const;
-
+
/** return the index position of the first item that is longer than
* [refStart,refStart+refLen) and includes it as a prefix
*/
- char const*
+ char const*
find_longer(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const;
-
+
/** Returns a char const* pointing to the position in the data block
* where the first item starting with token /id/ is located.
*/
@@ -140,37 +140,37 @@ namespace ugdiss
public:
boost::shared_ptr<BSC_t> bsc;
-
+
char const* arrayStart() const { return startArray; }
char const* arrayEnd() const { return endArray; }
- /** @return a pointer to the beginning of the index entry range covering
+ /** @return a pointer to the beginning of the index entry range covering
* [keyStart,keyStop)
*/
- char const*
+ char const*
lower_bound(typename vector<TKN>::const_iterator const& keyStart,
typename vector<TKN>::const_iterator const& keyStop) const;
- char const*
+ char const*
lower_bound(TKN const* keyStart, TKN const* keyStop) const;
- char const*
+ char const*
lower_bound(TKN const* keyStart, int keyLen) const;
- /** @return a pointer to the end point of the index entry range covering
+ /** @return a pointer to the end point of the index entry range covering
* [keyStart,keyStop)
*/
- char const*
- upper_bound(typename vector<TKN>::const_iterator const& keyStart,
+ char const*
+ upper_bound(typename vector<TKN>::const_iterator const& keyStart,
typename vector<TKN>::const_iterator const& keyStop) const;
- char const*
+ char const*
upper_bound(TKN const* keyStart, int keyLength) const;
/** dump all suffixes in order to /out/ */
void dump(ostream& out, TokenIndex const& T) const;
-
- /** fill the dynamic bit set with true for all sentences that contain
+
+ /** fill the dynamic bit set with true for all sentences that contain
* /phrase/.
* @return the raw number of occurrences.
*/
@@ -188,70 +188,70 @@ namespace ugdiss
setTokenBits(char const* startRange, char const* endRange, size_t len,
bitvector& bs) const;
- /** read the sentence ID into /sid/
- * @return position of associated offset.
+ /** read the sentence ID into /sid/
+ * @return position of associated offset.
*
* The function provides an abstraction that uses the right
* interpretation of the position based on the subclass
* (memory-mapped or in-memory).
*/
virtual
- char const*
+ char const*
readSid(char const* p, char const* q, id_type& sid) const = 0;
virtual
- char const*
+ char const*
readSid(char const* p, char const* q, ::uint64_t& sid) const = 0;
- /** read the offset part of the index entry into /offset/
- * @return position of the next entry in the index.
+ /** read the offset part of the index entry into /offset/
+ * @return position of the next entry in the index.
*
* The function provides an abstraction that uses the right
* interpretation of the position based on the subclass
* (memory-mapped or in-memory).
*/
virtual
- char const*
+ char const*
readOffset(char const* p, char const* q, uint16_t& offset) const = 0;
virtual
- char const*
+ char const*
readOffset(char const* p, char const* q, ::uint64_t& offset) const = 0;
- /** @return sentence count
+ /** @return sentence count
*/
count_type
- sntCnt(char const* p, char const* const q) const;
-
+ sntCnt(char const* p, char const* const q) const;
+
count_type
- rawCnt2(TKN const* keyStart, size_t keyLen) const;
+ rawCnt2(TKN const* keyStart, size_t keyLen) const;
/** @return raw occurrence count
- *
+ *
* depending on the subclass, this is constant time (imTSA) or
* linear in in the number of occurrences (mmTSA).
*/
virtual
count_type
- rawCnt(char const* p, char const* const q) const = 0;
+ rawCnt(char const* p, char const* const q) const = 0;
- /** get both sentence and word counts.
+ /** get both sentence and word counts.
*
* Avoids having to go over the byte range representing the range
* of suffixes in question twice when dealing with memory-mapped
* suffix arrays.
- */
+ */
virtual
- void
- getCounts(char const* p, char const* const q,
- count_type& sids, count_type& raw) const = 0;
+ void
+ getCounts(char const* p, char const* const q,
+ count_type& sids, count_type& raw) const = 0;
- string
- suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0)
+ string
+ suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0)
const;
- string
- suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0)
+ string
+ suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0)
const;
tsa::ArrayEntry& readEntry(char const* p, tsa::ArrayEntry& I) const;
@@ -260,36 +260,36 @@ namespace ugdiss
char const* dataEnd() const;
bool sanityCheck1() const;
-
- /** Return an ID that represents a given phrase;
+
+ /** Return an ID that represents a given phrase;
This should NEVER be 0!
- Structure of a phrase ID:
+ Structure of a phrase ID:
leftmost 32 bits: sentence ID in the corpus
next 16 bits: offset from the start of the sentence
next 16 bits: length of the phrase
*/
- ::uint64_t
+ ::uint64_t
getSequenceId(typename vector<TKN>::const_iterator const& pstart,
typename vector<TKN>::const_iterator const& pstop) const;
-
- ::uint64_t
+
+ ::uint64_t
getSequenceId(TKN const* t, ushort plen) const;
-
+
/** Return the phrase represented by phrase ID pid_ */
string
getSequence(::uint64_t pid, TokenIndex const& V) const;
-
+
/** Return the phrase represented by phrase ID pid_ */
vector<TKN>
getSequence(::uint64_t pid) const;
- TKN const*
+ TKN const*
getSequenceStart(::uint64_t) const;
ushort
getSequenceLength(::uint64_t) const;
- size_t
+ size_t
getCorpusSize() const;
Ttrack<TKN> const*
@@ -297,13 +297,13 @@ namespace ugdiss
bitset_pointer
getBitSet(TKN const* startKey, size_t keyLen) const;
-
+
boost::shared_ptr<bitvector>
- findTree(TKN const* treeStart, TKN const* treeEnd,
+ findTree(TKN const* treeStart, TKN const* treeEnd,
bitvector const* filter) const;
-
+
size_t markOccurrences(char const* lo, char const* up, size_t len,
- bitvector& bitset,
+ bitvector& bitset,
bool markOnlyStartPosition) const;
bool
@@ -311,13 +311,13 @@ namespace ugdiss
vector<tree_iterator>& dest) const;
double aveIndexEntrySize() const
- {
- return (endArray-startArray)/double(numTokens);
+ {
+ return (endArray-startArray)/double(numTokens);
}
public:
- // virtual
- sptr<TSA_tree_iterator<TKN> >
+ // virtual
+ sptr<TSA_tree_iterator<TKN> >
find(TKN const* start, size_t len) const
{
typedef TSA_tree_iterator<TKN> iter;
@@ -333,7 +333,7 @@ namespace ugdiss
// ======================================================================
// template<typename TOKEN>
- // sptr<TSA_tree_iterator<TOKEN> >
+ // sptr<TSA_tree_iterator<TOKEN> >
// TSA<TOKEN>::
// find(TOKEN const* start, size_t len) const
// {
@@ -354,7 +354,7 @@ namespace ugdiss
* @return number of total occurrences of the phrase in the corpus
*/
template<typename TKN>
- count_type
+ count_type
TSA<TKN>::
fillBitSet(vector<TKN> const& key,
bitvector& bitset) const
@@ -362,7 +362,7 @@ namespace ugdiss
if (!key.size()) return 0;
return fillBitset(&(key[0]),key.size(),bitset);
}
-
+
// ---------------------------------------------------------------------------
/** fill the dynamic bitset with information as to which sentences
@@ -370,7 +370,7 @@ namespace ugdiss
* @return number of total occurrences of the phrase in the corpus
*/
template<typename TKN>
- count_type
+ count_type
TSA<TKN>::
fillBitSet(TKN const* key, size_t keyLen,
bitvector& bitset) const
@@ -385,7 +385,7 @@ namespace ugdiss
// ---------------------------------------------------------------------------
template<typename TKN>
- count_type
+ count_type
TSA<TKN>::
setBits(char const* startRange, char const* endRange,
bitvector& bs) const
@@ -452,7 +452,7 @@ namespace ugdiss
* of the token range matching [startKey,endKey)
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
find_start(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
@@ -485,12 +485,12 @@ namespace ugdiss
* of the token range matching [startKey,endKey)
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
find_end(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const
-
+
{
char const* up = upX;
if (lo >= up) return NULL;
@@ -520,7 +520,7 @@ namespace ugdiss
* but continues on
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
find_longer(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
@@ -553,7 +553,7 @@ namespace ugdiss
* given search phrase
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
lower_bound(typename vector<TKN>::const_iterator const& keyStart,
typename vector<TKN>::const_iterator const& keyStop) const
@@ -570,7 +570,7 @@ namespace ugdiss
* given search phrase
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
lower_bound(TKN const* const keyStart,
TKN const* const keyStop) const
@@ -579,7 +579,7 @@ namespace ugdiss
}
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
lower_bound(TKN const* const keyStart, int keyLen) const
{
@@ -595,7 +595,7 @@ namespace ugdiss
* given search phrase (i.e., points just beyond the range)
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
upper_bound(typename vector<TKN>::const_iterator const& keyStart,
typename vector<TKN>::const_iterator const& keyStop) const
@@ -612,7 +612,7 @@ namespace ugdiss
* given search phrase (i.e., points just beyond the range)
*/
template<typename TKN>
- char const*
+ char const*
TSA<TKN>::
upper_bound(TKN const* keyStart, int keyLength) const
{
@@ -645,7 +645,7 @@ namespace ugdiss
{
return getSequenceId(&(*pstart),pstop-pstart);
}
-
+
//---------------------------------------------------------------------------
template<typename TKN>
@@ -667,14 +667,14 @@ namespace ugdiss
//---------------------------------------------------------------------------
- template<typename TKN>
+ template<typename TKN>
vector<TKN>
TSA<TKN>::
getSequence(::uint64_t pid) const
{
size_t plen = pid % 65536;
size_t offset = (pid >> 16) % 65536;
- TKN const* w = corpus->sntStart(pid >> 32)+offset;
+ TKN const* w = corpus->sntStart(pid >> 32)+offset;
vector<TKN> ret(plen);
for (size_t i = 0; i < plen; i++, w = w->next())
{
@@ -684,7 +684,7 @@ namespace ugdiss
return ret;
}
- template<typename TKN>
+ template<typename TKN>
string
TSA<TKN>::
getSequence(::uint64_t pid, TokenIndex const& V) const
@@ -698,21 +698,21 @@ namespace ugdiss
return buf.str();
}
-
+
//---------------------------------------------------------------------------
- template<typename TKN>
+ template<typename TKN>
TKN const*
TSA<TKN>::
getSequenceStart(::uint64_t pid) const
{
size_t offset = (pid >> 16) % 65536;
- return corpus->sntStart(pid >> 32)+offset;
+ return corpus->sntStart(pid >> 32)+offset;
}
-
+
//---------------------------------------------------------------------------
- template<typename TKN>
+ template<typename TKN>
ushort
TSA<TKN>::
getSequenceLength(::uint64_t pid) const
@@ -729,7 +729,7 @@ namespace ugdiss
{
return corpusSize;
}
-
+
//---------------------------------------------------------------------------
template<typename TKN>
@@ -756,7 +756,7 @@ namespace ugdiss
};
//---------------------------------------------------------------------------
-
+
/// find all instances of the tree described by [treeStart, treeEnd)
template<typename TKN>
typename TSA<TKN>::bitset_pointer
@@ -764,7 +764,7 @@ namespace ugdiss
getBitSet(TKN const* startKey, size_t keyLen) const
{
bitset_pointer ret;
- if (bsc != NULL)
+ if (bsc != NULL)
ret = bsc->get(startKey,keyLen);
else
{
@@ -773,7 +773,7 @@ namespace ugdiss
}
return ret;
}
-
+
//---------------------------------------------------------------------------
template<typename TKN>
@@ -809,12 +809,12 @@ namespace ugdiss
vector<tree_iterator>& dest) const
{
dest.assign(terminals.count(),tree_iterator(this));
- for (size_t i = terminals.find_first(), k = 0;
- i < terminals.size();
+ for (size_t i = terminals.find_first(), k = 0;
+ i < terminals.size();
i = terminals.find_next(i),++k)
{
for (TKN const* x = base+i; x && x->id(); x = x->next())
- if (!dest[k].extend(x->id()))
+ if (!dest[k].extend(x->id()))
return false;
}
typename tree_iterator::SortByApproximateCount sorter;
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h
index 3111f1c1d..d13449e36 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h
@@ -20,7 +20,7 @@ namespace ugdiss
{
using namespace std;
template<typename TSA>
- class
+ class
BitSetCache
{
public:
@@ -33,15 +33,15 @@ namespace ugdiss
myMap cached1,cached2;
int threshold;
public:
-
+
BitSetCache() : tsa(NULL), threshold(0) {};
- BitSetCache(TSA const* t, size_t th=4194304)
+ BitSetCache(TSA const* t, size_t th=4194304)
{
init(t,th);
};
- void
- init(TSA const* t, size_t th=4194304)
+ void
+ init(TSA const* t, size_t th=4194304)
{
tsa = t;
threshold = th;
@@ -84,7 +84,7 @@ namespace ugdiss
if (up-lo > threshold)
{
pair<char const*,ushort> k(lo,keyLen);
- // cout << "bla " << keyStart->id() << " "
+ // cout << "bla " << keyStart->id() << " "
// << cached2.size() << " " << up-lo << " " << k.second << endl;
myMapIter m = cached2.find(k);
if (m != cached2.end())
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index ac8cbe24e..053ff2445 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -9,6 +9,7 @@
#include <iostream>
#include "util/exception.hh"
#include "moses/Util.h"
+#include "util/random.hh"
//#include <cassert>
// #include "ug_bv_iter.h"
@@ -22,24 +23,24 @@ namespace ugdiss
template<typename T>
void display(T const* x, string label)
{
- cout << label << ":";
- for (;x;x=next(x)) cout << " " << x->lemma;
- cout << endl;
+ cout << label << ":";
+ for (;x;x=next(x)) cout << " " << x->lemma;
+ cout << endl;
}
#endif
template<typename T> class TSA;
// CLASS DEFINITION
- // The TSA_tree_iterator allows traversal of a Token Sequence Array
+ // The TSA_tree_iterator allows traversal of a Token Sequence Array
// as if it was a trie.
//
// down(): go to first child
- // over(): go to next sibling
+ // over(): go to next sibling
// up(): go to parent
// extend(id): go to a specific child node
// all four functions return true if successful, false otherwise
- // lower_bound() and upper_bound() give the range of entries in the
+ // lower_bound() and upper_bound() give the range of entries in the
// array covered by the "virtual trie node".
template<typename TKN>
class
@@ -48,7 +49,7 @@ namespace ugdiss
protected:
vector<char const*> lower;
vector<char const*> upper;
-
+
// for debugging ...
void showBounds(ostream& out) const;
public:
@@ -56,7 +57,7 @@ namespace ugdiss
virtual ~TSA_tree_iterator() {};
- TSA<Token> const* root;
+ TSA<Token> const* root;
// TO BE DONE: make the pointer private and add a const function
// to return the pointer
@@ -65,17 +66,17 @@ namespace ugdiss
TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
- TSA_tree_iterator(TSA<Token> const* s,
- Token const* kstart,
- size_t const len,
+ TSA_tree_iterator(TSA<Token> const* s,
+ Token const* kstart,
+ size_t const len,
bool full_match_only=true);
- TSA_tree_iterator(TSA<Token> const* s,
- Token const* kstart,
- Token const* kend,
+ TSA_tree_iterator(TSA<Token> const* s,
+ Token const* kstart,
+ Token const* kend,
bool full_match_only=true);
- // TSA_tree_iterator(TSA<Token> const* s,
- // TokenIndex const& V,
- // string const& key);
+ TSA_tree_iterator(TSA<Token> const* s,
+ TokenIndex const& V,
+ string const& key);
char const* lower_bound(int p) const;
char const* upper_bound(int p) const;
@@ -103,49 +104,49 @@ namespace ugdiss
bool match(id_type sid) const;
// fillBitSet: deprecated; use markSentences() instead
- count_type
+ count_type
fillBitSet(boost::dynamic_bitset<typename ::uint64_t>& bitset) const;
-
- count_type
+
+ count_type
markEndOfSequence(Token const* start, Token const* stop,
boost::dynamic_bitset<typename ::uint64_t>& dest) const;
- count_type
+ count_type
markSequence(Token const* start, Token const* stop, bitvector& dest) const;
-
- count_type
+
+ count_type
markSentences(boost::dynamic_bitset<typename ::uint64_t>& bitset) const;
-
- count_type
+
+ count_type
markOccurrences(boost::dynamic_bitset<typename ::uint64_t>& bitset,
bool markOnlyStartPosition=false) const;
-
- count_type
+
+ count_type
markOccurrences(vector<ushort>& dest) const;
-
- ::uint64_t
+
+ ::uint64_t
getSequenceId() const;
-
- // equivalent but more efficient than
+
+ // equivalent but more efficient than
// bitvector tmp; markSentences(tmp); foo &= tmp;
bitvector& filterSentences(bitvector& foo) const;
-
+
/// a special auxiliary function for finding trees
- void
- tfAndRoot(bitvector const& ref, // reference root positions
+ void
+ tfAndRoot(bitvector const& ref, // reference root positions
bitvector const& snt, // relevant sentences
bitvector& dest) const;
-
+
size_t arrayByteSpanSize(int p = -1) const
- {
+ {
if (lower.size()==0) return 0; // or endArray-startArray???
if (p < 0) p = lower.size()+p;
assert(p >=0 && p < int(lower.size()));
return lower.size() ? upper[p]-lower[p] : 0;
}
-
+
struct SortByApproximateCount
{
- bool operator()(TSA_tree_iterator const& a,
+ bool operator()(TSA_tree_iterator const& a,
TSA_tree_iterator const& b) const
{
if (a.size()==0) return b.size() ? true : false;
@@ -174,7 +175,7 @@ namespace ugdiss
size_t grow(Token const* snt, bitvector const& cov)
{
- size_t x = cov.find_first();
+ size_t x = cov.find_first();
while (x < cov.size() && extend(snt[x]))
x = cov.find_next(x);
return this->size();
@@ -182,7 +183,7 @@ namespace ugdiss
sptr<vector<typename ttrack::Position> >
randomSample(int level, size_t N) const;
-
+
};
//---------------------------------------------------------------------------
@@ -204,7 +205,7 @@ namespace ugdiss
assert(root->corpus->getToken(A));
assert(lo < root->getUpperBound(root->corpus->getToken(A)->id()));
lower.push_back(lo);
- Token const* foo = this->getToken(0);
+ Token const* foo = this->getToken(0);
upper.push_back(root->upper_bound(foo,lower.size()));
return lower.size();
}
@@ -216,7 +217,7 @@ namespace ugdiss
Token const* z = next(a);
for (size_t i = 1; i < size(); ++i) z = next(z);
if (z < root->corpus->sntStart(A.sid) || z >= root->corpus->sntEnd(A.sid))
- {
+ {
char const* up = upper.back();
lo = root->find_longer(lo,up,a,lower.size(),0);
if (!lo) return false;
@@ -243,7 +244,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
over()
{
- if (lower.size() == 0)
+ if (lower.size() == 0)
return false;
if (lower.size() == 1)
{
@@ -253,7 +254,7 @@ namespace ugdiss
if (upper[0] < hi)
{
lower[0] = upper[0];
- Token const* foo = this->getToken(0);
+ Token const* foo = this->getToken(0);
upper.back() = root->upper_bound(foo,lower.size());
}
else
@@ -263,11 +264,11 @@ namespace ugdiss
char const* lo = root->getLowerBound(wid);
if (lo == root->endArray) return false;
char const* hi = root->getUpperBound(wid);
- if (!hi) return false;
+ if (!hi) return false;
if (lo == hi) continue;
assert(lo);
lower[0] = lo;
- Token const* foo = this->getToken(0);
+ Token const* foo = this->getToken(0);
upper.back() = root->upper_bound(foo,lower.size());
break;
}
@@ -292,7 +293,7 @@ namespace ugdiss
// display(root->corpus->getToken(U),"L2");
- Token const* foo = this->getToken(0);
+ Token const* foo = this->getToken(0);
// display(foo,"F!");
upper.back() = root->upper_bound(foo,lower.size());
return true;
@@ -325,17 +326,17 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s)
- : root(s)
+ : root(s)
{};
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
- : root(s)
+ : root(s)
{
Token const* x = other.getToken(0);
for (size_t i = 0; i < other.size() && this->extend(x->id()); ++i)
- x = x->next();
+ x = x->next();
};
@@ -344,28 +345,28 @@ namespace ugdiss
TSA_tree_iterator<Token>::
TSA_tree_iterator
(TSA<Token> const* r,
- id_type const* s,
+ id_type const* s,
size_t const len)
- : root(r)
+ : root(r)
{
for (id_type const* e = s + len; s < e && extend(*s); ++s);
};
// ---------------------------------------------------------------------------
-#if 0
+#if 1
template<typename Token>
TSA_tree_iterator<Token>::
- TSA_tree_iterator(TSA<Token> const* s,
- TokenIndex const& V,
+ TSA_tree_iterator(TSA<Token> const* s,
+ TokenIndex const& V,
string const& key)
: root(s)
{
istringstream buf(key); string w;
while (buf >> w)
{
- if (this->extend(V[w]))
- continue;
+ if (this->extend(V[w]))
+ continue;
else
{
lower.clear();
@@ -374,7 +375,9 @@ namespace ugdiss
}
}
};
-
+#endif
+
+#if 0
// ---------------------------------------------------------------------------
template<typename Token>
@@ -391,7 +394,7 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const& t)
- : root(s)
+ : root(s)
{
if (!root) return;
char const* up = root->getUpperBound(t.id());
@@ -406,33 +409,33 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
- TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
+ TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
size_t const len, bool full_match_only)
- : root(s)
+ : root(s)
{
if (!root) return;
size_t i = 0;
for (; i < len && kstart && extend(*kstart); ++i)
kstart = kstart->next();
- if (full_match_only && i != len)
+ if (full_match_only && i != len)
{
lower.clear();
upper.clear();
}
};
- // DEPRECATED: DO NOT USE. Use the one that takes the length
+ // DEPRECATED: DO NOT USE. Use the one that takes the length
// instead of kend.
template<typename Token>
TSA_tree_iterator<Token>::
- TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
+ TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
Token const* kend, bool full_match_only)
- : root(s)
+ : root(s)
{
- for (;kstart != kend; kstart = kstart->next())
- if (!extend(*kstart))
+ for (;kstart != kend; kstart = kstart->next())
+ if (!extend(*kstart))
break;
- if (full_match_only && kstart != kend)
+ if (full_match_only && kstart != kend)
{
lower.clear();
upper.clear();
@@ -442,7 +445,7 @@ namespace ugdiss
// ---------------------------------------------------------------------------
// EXTEND
// ---------------------------------------------------------------------------
-
+
template<typename Token>
bool
TSA_tree_iterator<Token>::
@@ -493,9 +496,9 @@ namespace ugdiss
template<typename Token>
size_t
TSA_tree_iterator<Token>::
- size() const
- {
- return lower.size();
+ size() const
+ {
+ return lower.size();
}
// ---------------------------------------------------------------------------
@@ -503,8 +506,8 @@ namespace ugdiss
template<typename Token>
id_type
TSA_tree_iterator<Token>::
- getSid() const
- {
+ getSid() const
+ {
char const* p = (lower.size() ? lower.back() : root->startArray);
char const* q = (upper.size() ? upper.back() : root->endArray);
id_type sid;
@@ -517,8 +520,8 @@ namespace ugdiss
template<typename Token>
::uint64_t
TSA_tree_iterator<Token>::
- getPid(int p) const
- {
+ getPid(int p) const
+ {
if (this->size() == 0) return 0;
if (p < 0) p += upper.size();
char const* lb = lower_bound(p);
@@ -528,7 +531,7 @@ namespace ugdiss
::uint64_t ret = (sid<<32) + (off<<16) + ::uint64_t(p+1);
return ret;
}
-
+
// ---------------------------------------------------------------------------
template<typename Token>
@@ -611,7 +614,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename Token>
- count_type
+ count_type
TSA_tree_iterator<Token>::
fillBitSet(boost::dynamic_bitset<typename ::uint64_t>& bitset) const
{
@@ -621,7 +624,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename Token>
- count_type
+ count_type
TSA_tree_iterator<Token>::
markSentences(boost::dynamic_bitset<typename ::uint64_t>& bitset) const
{
@@ -648,7 +651,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename Token>
- count_type
+ count_type
TSA_tree_iterator<Token>::
markOccurrences(boost::dynamic_bitset<typename ::uint64_t>& bitset, bool markOnlyStartPosition) const
{
@@ -664,7 +667,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename Token>
- count_type
+ count_type
TSA_tree_iterator<Token>::
markOccurrences(vector<ushort>& dest) const
{
@@ -691,10 +694,10 @@ namespace ugdiss
}
//---------------------------------------------------------------------------
- // mark all endpoints of instances of the path represented by this
+ // mark all endpoints of instances of the path represented by this
// iterator in the sentence [start,stop)
template<typename Token>
- count_type
+ count_type
TSA_tree_iterator<Token>::
markEndOfSequence(Token const* start, Token const* stop,
boost::dynamic_bitset<typename ::uint64_t>& dest) const
@@ -723,10 +726,10 @@ namespace ugdiss
}
//---------------------------------------------------------------------------
- // mark all occurrences of the sequence represented by this
+ // mark all occurrences of the sequence represented by this
// iterator in the sentence [start,stop)
template<typename Token>
- count_type
+ count_type
TSA_tree_iterator<Token>::
markSequence(Token const* start,
Token const* stop,
@@ -781,7 +784,7 @@ namespace ugdiss
{
assert(x);
buf << (i > start ? " " : "");
- if (V) buf << (*V)[x->id()];
+ if (V) buf << (*V)[x->id()];
else buf << x->id();
}
return buf.str();
@@ -804,13 +807,13 @@ namespace ugdiss
{
assert(x);
buf << (i > start ? " " : "");
- buf << V[x->id()].str;
+ buf << V[x->id()].str;
}
return buf.str();
}
#endif
- /// @return true if the sentence [start,stop) contains the sequence
+ /// @return true if the sentence [start,stop) contains the sequence
template<typename Token>
bool
TSA_tree_iterator<Token>::
@@ -820,7 +823,7 @@ namespace ugdiss
for (Token const* t = start; t < stop; ++t)
{
if (*t != *a) continue;
- Token const* b = a;
+ Token const* b = a;
Token const* y = t;
size_t i;
for (i = 1; i < lower.size(); ++i)
@@ -835,7 +838,7 @@ namespace ugdiss
return false;
}
- /// @return true if the sentence /sid/ contains the sequence
+ /// @return true if the sentence /sid/ contains the sequence
template<typename Token>
bool
TSA_tree_iterator<Token>::
@@ -848,9 +851,9 @@ namespace ugdiss
// @param sntcheck: number of roots in the respective sentence
// @param dest: bitvector to keep track of the exact root location
template<typename Token>
- void
+ void
TSA_tree_iterator<Token>::
- tfAndRoot(bitvector const& ref, // reference root positions
+ tfAndRoot(bitvector const& ref, // reference root positions
bitvector const& snt, // relevant sentences
bitvector& dest) const
{
@@ -877,12 +880,12 @@ namespace ugdiss
filterSentences(bitvector& bv) const
{
float aveSntLen = root->corpus->numTokens()/root->corpus->size();
- size_t ANDcost = bv.size()/8; // cost of dest&=ref;
+ size_t ANDcost = bv.size()/8; // cost of dest&=ref;
float aveEntrySize = ((root->endArray-root->startArray)
/root->corpus->numTokens());
if (arrayByteSpanSize()+ANDcost < aveEntrySize*aveSntLen*bv.count())
{
- bitvector tmp(bv.size());
+ bitvector tmp(bv.size());
markSentences(tmp);
bv &= tmp;
}
@@ -894,13 +897,6 @@ namespace ugdiss
return bv;
}
- inline
- size_t
- randInt(size_t N)
- {
- return size_t(N*(rand()/(RAND_MAX+1.)));
- }
-
/// randomly select up to N occurrences of the sequence
template<typename Token>
sptr<vector<typename ttrack::Position> >
@@ -910,9 +906,9 @@ namespace ugdiss
if (level < 0) level += lower.size();
assert(level >=0);
- sptr<vector<typename ttrack::Position> >
+ sptr<vector<typename ttrack::Position> >
ret(new vector<typename ttrack::Position>(N));
-
+
size_t m=0; // number of samples selected so far
typename Token::ArrayEntry I(lower.at(level));
@@ -920,10 +916,10 @@ namespace ugdiss
while (m < N && (I.next) < stop)
{
root->readEntry(I.next,I);
-
+
// t: expected number of remaining samples
- double t = (stop - I.pos)/root->aveIndexEntrySize();
- double r = t*rand()/(RAND_MAX+1.);
+ const double t = (stop - I.pos)/root->aveIndexEntrySize();
+ const double r = util::rand_excl(t);
if (r < N-m)
{
ret->at(m).offset = I.offset;
@@ -934,6 +930,6 @@ namespace ugdiss
return ret;
}
-
+
} // end of namespace ugdiss
#endif
diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc
index 644c53c3a..60d20a5f9 100644
--- a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc
+++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc
@@ -9,12 +9,12 @@
namespace ugdiss
{
using namespace std;
-
+
#if 0
template<>
id_type
Ttrack<id_type>::
- toID(id_type const& t)
+ toID(id_type const& t)
{
return t;
}
diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h
index 7c11b3942..d087a9e58 100644
--- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h
+++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h
@@ -2,7 +2,7 @@
// Base class for corpus tracks. mmTtrack (memory-mapped Ttrack) and imTtrack (in-memory Ttrack)
// are derived from this class.
-// This code is part of a refactorization of the earlier Ttrack class as a template class for
+// This code is part of a refactorization of the earlier Ttrack class as a template class for
// tokens of arbitrary fixed-length size.
// (c) 2007-2009 Ulrich Germann. All rights reserved.
@@ -17,6 +17,7 @@
#include "ug_ttrack_position.h"
#include "tpt_typedefs.h"
#include "tpt_tokenindex.h"
+#include "moses/Util.h"
// #include "ug_vocab.h"
namespace ugdiss
@@ -25,6 +26,33 @@ namespace ugdiss
typedef boost::dynamic_bitset<uint64_t> bdBitset;
+ template<typename sid_t, typename off_t, typename len_t>
+ void
+ parse_pid(uint64_t const pid, sid_t & sid,
+ off_t & off, len_t& len)
+ {
+ static uint64_t two32 = uint64_t(1)<<32;
+ static uint64_t two16 = uint64_t(1)<<16;
+ len = pid%two16;
+ off = (pid%two32)>>16;
+ sid = pid>>32;
+ }
+
+ template<typename Token>
+ string
+ toString(TokenIndex const& V, Token const* x, size_t const len)
+ {
+ if (!len) return "";
+ UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+ ostringstream buf;
+ buf << V[x->id()];
+ size_t i = 1;
+ for (x = x->next(); x && i < len; ++i, x = x->next())
+ buf << " " << V[x->id()];
+ UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
+ return buf.str();
+ }
+
template<typename TKN=id_type>
class Ttrack
{
@@ -35,66 +63,66 @@ namespace ugdiss
typedef TKN Token;
/** @return a pointer to beginning of sentence /sid/ */
- virtual
- TKN const*
- sntStart(size_t sid) const = 0;
+ virtual
+ TKN const*
+ sntStart(size_t sid) const = 0;
/** @return end point of sentence /sid/ */
- virtual
- TKN const*
- sntEnd(size_t sid) const = 0;
+ virtual
+ TKN const*
+ sntEnd(size_t sid) const = 0;
TKN const*
getToken(Position const& p) const;
template<typename T>
- T const*
- getTokenAs(Position const& p) const
+ T const*
+ getTokenAs(Position const& p) const
{ return reinterpret_cast<T const*>(getToken(p)); }
template<typename T>
T const*
- sntStartAs(id_type sid) const
+ sntStartAs(id_type sid) const
{ return reinterpret_cast<T const*>(sntStart(sid)); }
template<typename T>
T const*
- sntEndAs(id_type sid) const
+ sntEndAs(id_type sid) const
{ return reinterpret_cast<T const*>(sntEnd(sid)); }
/** @return length of sentence /sid/ */
size_t sntLen(size_t sid) const { return sntEnd(sid) - sntStart(sid); }
- size_t
+ size_t
startPos(id_type sid) const { return sntStart(sid)-sntStart(0); }
-
- size_t
+
+ size_t
endPos(id_type sid) const { return sntEnd(sid)-sntStart(0); }
/** Don't use this unless you want a copy of the sentence */
- vector<TKN>
- operator[](id_type sid) const
- {
- return vector<TKN>(sntStart(sid),sntEnd(sid));
+ vector<TKN>
+ operator[](id_type sid) const
+ {
+ return vector<TKN>(sntStart(sid),sntEnd(sid));
}
/** @return size of corpus in number of sentences */
- virtual size_t size() const = 0;
+ virtual size_t size() const = 0;
/** @return size of corpus in number of words/tokens */
- virtual size_t numTokens() const = 0;
+ virtual size_t numTokens() const = 0;
- /** @return string representation of sentence /sid/
+ /** @return string representation of sentence /sid/
* Currently only defined for Ttrack<id_type> */
string str(id_type sid, TokenIndex const& T) const;
string pid2str(TokenIndex const* V, uint64_t pid) const;
- // /** @return string representation of sentence /sid/
+ // /** @return string representation of sentence /sid/
// * Currently only defined for Ttrack<id_type> */
// string str(id_type sid, Vocab const& V) const;
-
- /** counts the tokens in the corpus; used for example in the construction of
+
+ /** counts the tokens in the corpus; used for example in the construction of
* token sequence arrays */
count_type count_tokens(vector<count_type>& cnt, bdBitset const* filter,
int lengthCutoff=0, ostream* log=NULL) const;
@@ -102,7 +130,7 @@ namespace ugdiss
// static id_type toID(TKN const& t);
int cmp(Position const& A, Position const& B, int keyLength) const;
- int cmp(Position const& A, TKN const* keyStart, int keyLength=-1,
+ int cmp(Position const& A, TKN const* keyStart, int keyLength=-1,
int depth=0) const;
virtual id_type findSid(TKN const* t) const = 0; // find the sentence id of a given token
@@ -111,18 +139,18 @@ namespace ugdiss
// the following three functions are currently not used by any program ... (deprecate?)
TKN const*
- find_next_within_sentence(TKN const* startKey,
- int keyLength,
+ find_next_within_sentence(TKN const* startKey,
+ int keyLength,
Position startHere) const;
Position
- find_first(TKN const* startKey, int keyLength,
+ find_first(TKN const* startKey, int keyLength,
bdBitset const* filter=NULL) const;
Position
- find_next(TKN const* startKey, int keyLength, Position startAfter,
+ find_next(TKN const* startKey, int keyLength, Position startAfter,
bdBitset const* filter=NULL) const;
-
+
virtual size_t offset(TKN const* t) const { return t-sntStart(0); }
};
@@ -143,11 +171,11 @@ namespace ugdiss
template<typename TKN>
count_type
Ttrack<TKN>::
- count_tokens(vector<count_type>& cnt, bdBitset const* filter,
+ count_tokens(vector<count_type>& cnt, bdBitset const* filter,
int lengthCutoff, ostream* log) const
{
- bdBitset filter2;
- if (!filter)
+ bdBitset filter2;
+ if (!filter)
{
filter2.resize(this->size());
filter2.set();
@@ -156,21 +184,21 @@ namespace ugdiss
cnt.clear();
cnt.reserve(500000);
count_type totalCount=0;
-
+
int64_t expectedTotal=0;
for (size_t sid = 0; sid < this->size(); ++sid)
expectedTotal += this->sntLen(sid);
-
+
for (size_t sid = filter->find_first();
sid < filter->size();
sid = filter->find_next(sid))
{
TKN const* k = sntStart(sid);
TKN const* const stop = sntEnd(sid);
- if (lengthCutoff && stop-k >= lengthCutoff)
+ if (lengthCutoff && stop-k >= lengthCutoff)
{
- if (log)
- *log << "WARNING: skipping sentence #" << sid
+ if (log)
+ *log << "WARNING: skipping sentence #" << sid
<< " with more than 65536 tokens" << endl;
expectedTotal -= stop-k;
}
@@ -189,7 +217,7 @@ namespace ugdiss
if (this->size() == filter->count())
{
if (totalCount != expectedTotal)
- cerr << "OOPS: expected " << expectedTotal
+ cerr << "OOPS: expected " << expectedTotal
<< " tokens but counted " << totalCount << endl;
assert(totalCount == expectedTotal);
}
@@ -228,16 +256,16 @@ namespace ugdiss
a = next(a);
b = next(b);
// cerr << keyLength << "b. " << (a ? a->lemma : 0) << " " << (b ? b->lemma : 0) << endl;
- if (--keyLength==0 || b < bosB || b >= eosB)
- {
+ if (--keyLength==0 || b < bosB || b >= eosB)
+ {
ret = (a < bosA || a >= eosA) ? 0 : 1;
break;
}
}
// cerr << "RETURNING " << ret << endl;
- return ret;
+ return ret;
}
-
+
template<typename TKN>
int
Ttrack<TKN>::
@@ -259,17 +287,17 @@ namespace ugdiss
if (*x > *key) return 2;
key = key->next();
x = x->next();
- if (--keyLength==0) // || !key)
+ if (--keyLength==0) // || !key)
return (x == stopx) ? 0 : 1;
assert(key);
}
- return -1;
+ return -1;
}
template<typename TKN>
- TKN const*
+ TKN const*
Ttrack<TKN>::
- find_next_within_sentence(TKN const* startKey, int keyLength,
+ find_next_within_sentence(TKN const* startKey, int keyLength,
Position startHere) const
{
for (TKN const* t = getToken(startHere); t; t = getToken(startHere))
@@ -280,12 +308,12 @@ namespace ugdiss
{
TKN const* k = startKey->next();
TKN const* t2 = t->next();
- if (t2)
+ if (t2)
{
- cout << t2->lemma << "." << int(t2->minpos) << " "
+ cout << t2->lemma << "." << int(t2->minpos) << " "
<< k->lemma << "." << int(k->minpos) << " "
<< t2->cmp(*k) << endl;
- }
+ }
}
#endif
int x = cmp(startHere,startKey,keyLength,0);
@@ -302,8 +330,8 @@ namespace ugdiss
{
if (filter)
{
- for (size_t sid = filter->find_first();
- sid < filter->size();
+ for (size_t sid = filter->find_first();
+ sid < filter->size();
sid = filter->find_next(sid))
{
TKN const* x = find_next_within_sentence(startKey,keyLength,Position(sid,0));
@@ -320,7 +348,7 @@ namespace ugdiss
}
return Position(this->size(),0);
}
-
+
template<typename TKN>
typename Ttrack<TKN>::Position
Ttrack<TKN>::
@@ -383,6 +411,6 @@ namespace ugdiss
}
return buf.str();
}
-
+
}
#endif
diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_position.h b/moses/TranslationModel/UG/mm/ug_ttrack_position.h
index 64fab3afb..6d473f263 100644
--- a/moses/TranslationModel/UG/mm/ug_ttrack_position.h
+++ b/moses/TranslationModel/UG/mm/ug_ttrack_position.h
@@ -6,7 +6,7 @@
#include "ug_typedefs.h"
// A token position in a Ttrack, with a LESS functor for comparing token
-// positions in whatever sorting order the underlying token type implies.
+// positions in whatever sorting order the underlying token type implies.
//
// (c) 2007-2010 Ulrich Germann. All rights reserved.
@@ -26,19 +26,19 @@ namespace ugdiss
Position(id_type _sid, ushort _off);
template<typename TTRACK_TYPE> class LESS; // probably abandoned
}; // end of deklaration of Position
-
-#if 1
+
+#if 1
template<typename TTRACK_TYPE>
- class
+ class
Position::
LESS
{
TTRACK_TYPE const* c;
public:
typedef typename TTRACK_TYPE::Token Token;
-
+
LESS(TTRACK_TYPE const* crp) : c(crp) {};
-
+
bool operator()(Position const& A, Position const& B) const
{
Token const* a = c->getToken(A); assert(a);
@@ -48,30 +48,30 @@ namespace ugdiss
Token const* bosA = c->sntStart(A.sid);
Token const* eosA = c->sntEnd(A.sid);
-
+
Token const* bosB = c->sntStart(B.sid);
Token const* eosB = c->sntEnd(B.sid);
-
+
#if 0
- Token const* z = a;
+ Token const* z = a;
cout << "A: " << z->id();
for (z = next(z); z >= bosA && z < eosA; z = next(z))
- cout << "-" << z->id();
+ cout << "-" << z->id();
cout << endl;
-
- z = b;
+
+ z = b;
cout << "B: " << z->id();
for (z = next(z); z >= bosB && z < eosB; z = next(z))
- cout << "-" << z->id();
+ cout << "-" << z->id();
cout << endl;
#endif
while (*a == *b)
{
a = next(a);
b = next(b);
- if (a < bosA || a >= eosA)
+ if (a < bosA || a >= eosA)
return (b >= bosB && b < eosB);
- if (b < bosB || b >= eosB)
+ if (b < bosB || b >= eosB)
return false;
}
int x = a->cmp(*b);
@@ -86,4 +86,4 @@ namespace ugdiss
} // end of namespace ttrack
} // end of namespace ugdiss
#endif
-
+
diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h
index 83c8684e0..0181bef9e 100644
--- a/moses/TranslationModel/UG/mm/ug_typedefs.h
+++ b/moses/TranslationModel/UG/mm/ug_typedefs.h
@@ -24,7 +24,7 @@ namespace ugdiss
typedef vector<vector<short> > short_2d_table;
typedef vector<short_2d_table> short_3d_table;
typedef vector<short_3d_table> short_4d_table;
-
+
typedef vector<vector<int> > int_2d_table;
typedef vector<int_2d_table> int_3d_table;
typedef vector<int_3d_table> int_4d_table;
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index d24f571b3..6e680bbc5 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -1,9 +1,15 @@
+#ifdef HAVE_CURLPP
+#include <curlpp/Options.hpp>
+#include <curlpp/cURLpp.hpp>
+#include <curlpp/Easy.hpp>
+#endif
+
#include "mmsapt.h"
#include <boost/foreach.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/tokenizer.hpp>
+#include <boost/thread/locks.hpp>
#include <algorithm>
-#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "util/exception.hh"
#include <set>
@@ -13,7 +19,7 @@ namespace Moses
using namespace std;
using namespace boost;
- void
+ void
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
TokenIndex const& V, vector<id_type>& dest)
{
@@ -24,9 +30,8 @@ namespace Moses
dest[i] = V[f->ToString()];
}
}
-
- void
+ void
parseLine(string const& line, map<string,string> & param)
{
char_separator<char> sep("; ");
@@ -50,7 +55,8 @@ namespace Moses
#if 0
Mmsapt::
Mmsapt(string const& description, string const& line)
- : PhraseDictionary(description,line), ofactor(1,0)
+ : PhraseDictionary(description,line), ofactor(1,0), m_bias_log(NULL)
+ , m_bias_loglevel(0)
{
this->init(line);
}
@@ -65,16 +71,23 @@ namespace Moses
Mmsapt::
Mmsapt(string const& line)
- : PhraseDictionary(line)
- // , ofactor(1,0)
- , m_tpc_ctr(0)
+ : PhraseDictionary(line, false)
+ , m_bias_log(NULL)
+ , m_bias_loglevel(0)
+ , m_lr_func(NULL)
+ , cache_key(((char*)this)+2)
+ , context_key(((char*)this)+1)
+ // , m_tpc_ctr(0)
+ , ofactor(1,0)
{
- this->init(line);
+ init(line);
+ setup_local_feature_functions();
+ Register();
}
- void
+ void
Mmsapt::
- read_config_file(string fname,map<string,string>& param)
+ read_config_file(string fname, map<string,string>& param)
{
string line;
ifstream config(fname.c_str());
@@ -86,9 +99,9 @@ namespace Moses
tokenizer<char_separator<char> >::const_iterator t = tokens.begin();
if (t == tokens.end()) continue;
string& foo = param[*t++];
- if (t == tokens.end() || foo.size()) continue;
+ if (t == tokens.end() || foo.size()) continue;
// second condition: do not overwrite settings from the line in moses.ini
- UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(),
+ UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(),
"Syntax error in Mmsapt config file '" << fname << "'.");
for (foo = *t++; t != tokens.end(); foo += " " + *t++);
}
@@ -108,23 +121,16 @@ namespace Moses
}
}
- bool
- Mmsapt::
- isLogVal(int i) const { return m_is_logval.at(i); }
-
- bool
- Mmsapt::
- isInteger(int i) const { return m_is_integer.at(i); }
+ bool Mmsapt::isLogVal(int i) const { return m_is_logval.at(i); }
+ bool Mmsapt::isInteger(int i) const { return m_is_integer.at(i); }
- void
- Mmsapt::
- init(string const& line)
+ void Mmsapt::init(string const& line)
{
map<string,string>::const_iterator m;
parseLine(line,this->param);
this->m_numScoreComponents = atoi(param["num-features"].c_str());
-
+
m = param.find("config");
if (m != param.end())
read_config_file(m->second,param);
@@ -132,18 +138,18 @@ namespace Moses
m = param.find("base");
if (m != param.end())
{
- bname = m->second;
+ m_bname = m->second;
m = param.find("path");
- UTIL_THROW_IF2((m != param.end() && m->second != bname),
- "Conflicting aliases for path:\n"
+ UTIL_THROW_IF2((m != param.end() && m->second != m_bname),
+ "Conflicting aliases for path:\n"
<< "path=" << string(m->second) << "\n"
- << "base=" << bname.c_str() );
+ << "base=" << m_bname.c_str() );
}
- else bname = param["path"];
+ else m_bname = param["path"];
L1 = param["L1"];
L2 = param["L2"];
-
- UTIL_THROW_IF2(bname.size() == 0, "Missing corpus base name at " << HERE);
+
+ UTIL_THROW_IF2(m_bname.size() == 0, "Missing corpus base name at " << HERE);
UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE);
UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE);
@@ -151,11 +157,11 @@ namespace Moses
pair<string,string> dflt("input-factor","0");
input_factor = atoi(param.insert(dflt).first->second.c_str());
// shouldn't that be a string?
-
+
dflt = pair<string,string> ("output-factor","0");
output_factor = atoi(param.insert(dflt).first->second.c_str());
ofactor.assign(1,output_factor);
-
+
dflt = pair<string,string> ("smooth",".01");
m_lbop_conf = atof(param.insert(dflt).first->second.c_str());
@@ -169,32 +175,56 @@ namespace Moses
m_workers = atoi(param.insert(dflt).first->second.c_str());
m_workers = min(m_workers,24UL);
-
+ dflt = pair<string,string>("bias-loglevel","0");
+ m_bias_loglevel = atoi(param.insert(dflt).first->second.c_str());
+
dflt = pair<string,string>("table-limit","20");
m_tableLimit = atoi(param.insert(dflt).first->second.c_str());
dflt = pair<string,string>("cache","10000");
- size_t hsize = max(1000,atoi(param.insert(dflt).first->second.c_str()));
- m_history.reserve(hsize);
+ m_cache_size = max(1000,atoi(param.insert(dflt).first->second.c_str()));
+ m_cache.reset(new TPCollCache(m_cache_size));
+ // m_history.reserve(hsize);
// in plain language: cache size is at least 1000, and 10,000 by default
- // this cache keeps track of the most frequently used target phrase collections
- // even when not actively in use
+ // this cache keeps track of the most frequently used target
+ // phrase collections even when not actively in use
// Feature functions are initialized in function Load();
- param.insert(pair<string,string>("pfwd", "g"));
- param.insert(pair<string,string>("pbwd", "g"));
- param.insert(pair<string,string>("logcnt", "0"));
- param.insert(pair<string,string>("coh", "0"));
- param.insert(pair<string,string>("rare", "1"));
- param.insert(pair<string,string>("prov", "1"));
-
+ param.insert(pair<string,string>("pfwd", "g"));
+ param.insert(pair<string,string>("pbwd", "g"));
+ param.insert(pair<string,string>("logcnt", "0"));
+ param.insert(pair<string,string>("coh", "0"));
+ param.insert(pair<string,string>("rare", "1"));
+ param.insert(pair<string,string>("prov", "1"));
+
poolCounts = true;
-
- if ((m = param.find("bias")) != param.end())
- bias_file = m->second;
- if ((m = param.find("extra")) != param.end())
- extra_data = m->second;
+ // this is for pre-comuted sentence-level bias; DEPRECATED!
+ if ((m = param.find("bias")) != param.end())
+ m_bias_file = m->second;
+
+ if ((m = param.find("bias-server")) != param.end())
+ m_bias_server = m->second;
+
+ if ((m = param.find("bias-logfile")) != param.end())
+ {
+ m_bias_logfile = m->second;
+ if (m_bias_logfile == "/dev/stderr")
+ m_bias_log = &std::cerr;
+ else if (m_bias_logfile == "/dev/stdout")
+ m_bias_log = &std::cout;
+ else
+ {
+ m_bias_logger.reset(new ofstream(m_bias_logfile.c_str()));
+ m_bias_log = m_bias_logger.get();
+ }
+ }
+
+ if ((m = param.find("lr-func")) != param.end())
+ m_lr_func_name = m->second;
+
+ if ((m = param.find("extra")) != param.end())
+ m_extra_data = m->second;
dflt = pair<string,string>("tuneable","true");
m_tuneable = Scan<bool>(param.insert(dflt).first->second.c_str());
@@ -209,9 +239,13 @@ namespace Moses
known_parameters.push_back("L1");
known_parameters.push_back("L2");
known_parameters.push_back("Mmsapt");
- known_parameters.push_back("PhraseDictionaryBitextSampling"); // alias for Mmsapt
+ known_parameters.push_back("PhraseDictionaryBitextSampling");
+ // alias for Mmsapt
known_parameters.push_back("base"); // alias for path
known_parameters.push_back("bias");
+ known_parameters.push_back("bias-server");
+ known_parameters.push_back("bias-logfile");
+ known_parameters.push_back("bias-loglevel");
known_parameters.push_back("cache");
known_parameters.push_back("coh");
known_parameters.push_back("config");
@@ -221,10 +255,11 @@ namespace Moses
known_parameters.push_back("lexalpha");
// known_parameters.push_back("limit"); // replaced by "table-limit"
known_parameters.push_back("logcnt");
+ known_parameters.push_back("lr-func"); // associated lexical reordering function
known_parameters.push_back("name");
known_parameters.push_back("num-features");
known_parameters.push_back("output-factor");
- known_parameters.push_back("path");
+ known_parameters.push_back("path");
known_parameters.push_back("pbwd");
known_parameters.push_back("pfwd");
known_parameters.push_back("prov");
@@ -240,12 +275,12 @@ namespace Moses
{
UTIL_THROW_IF2(!binary_search(known_parameters.begin(),
known_parameters.end(), m->first),
- HERE << ": Unknown parameter specification for Mmsapt: "
+ HERE << ": Unknown parameter specification for Mmsapt: "
<< m->first);
}
}
- void
+ void
Mmsapt::
load_bias(string const fname)
{
@@ -256,13 +291,14 @@ namespace Moses
Mmsapt::
load_extra_data(string bname, bool locking = true)
{
+ using namespace boost;
// TO DO: ADD CHECKS FOR ROBUSTNESS
// - file existence?
// - same number of lines?
// - sane word alignment?
vector<string> text1,text2,symal;
string line;
- filtering_istream in1,in2,ina;
+ filtering_istream in1,in2,ina;
open_input_stream(bname+L1+".txt.gz",in1);
open_input_stream(bname+L2+".txt.gz",in2);
@@ -272,8 +308,8 @@ namespace Moses
while(getline(in2,line)) text2.push_back(line);
while(getline(ina,line)) symal.push_back(line);
- boost::scoped_ptr<boost::lock_guard<boost::mutex> > guard;
- if (locking) guard.reset(new boost::lock_guard<boost::mutex>(this->lock));
+ scoped_ptr<boost::unique_lock<shared_mutex> > guard;
+ if (locking) guard.reset(new boost::unique_lock<shared_mutex>(m_lock));
btdyn = btdyn->add(text1,text2,symal);
assert(btdyn);
cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
@@ -298,7 +334,7 @@ namespace Moses
ff.reset(new fftype(spec));
register_ff(ff, m_active_ff_dyn);
}
- else
+ else
{
sptr<fftype> ff(new fftype(spec));
register_ff(ff, m_active_ff_common);
@@ -308,7 +344,8 @@ namespace Moses
template<typename fftype>
void
Mmsapt::
- check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry)
+ check_ff(string const ffname, float const xtra,
+ vector<sptr<pscorer> >* registry)
{
string const& spec = param[ffname];
if (spec == "" || spec == "0") return;
@@ -324,7 +361,7 @@ namespace Moses
ff.reset(new fftype(xtra,spec));
register_ff(ff, m_active_ff_dyn);
}
- else
+ else
{
sptr<fftype> ff(new fftype(xtra,spec));
register_ff(ff, m_active_ff_common);
@@ -347,41 +384,38 @@ namespace Moses
}
void
- Mmsapt::
- Load(bool with_checks)
+ Mmsapt
+ ::setup_local_feature_functions()
{
- boost::lock_guard<boost::mutex> guard(this->lock);
-
- // can load only once
- // UTIL_THROW_IF2(shards.size(),"Mmsapt is already loaded at " << HERE);
-
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
// load feature sets
BOOST_FOREACH(string const& fsname, m_feature_set_names)
{
// standard (default) feature set
if (fsname == "standard")
{
- // lexical scores
- string lexfile = bname + L1 + "-" + L2 + ".lex";
- sptr<PScoreLex1<Token> > ff(new PScoreLex1<Token>(param["lex_alpha"],lexfile));
+ // lexical scores
+ string lexfile = m_bname + L1 + "-" + L2 + ".lex";
+ sptr<PScoreLex1<Token> >
+ ff(new PScoreLex1<Token>(param["lex_alpha"],lexfile));
register_ff(ff,m_active_ff_common);
-
+
// these are always computed on pooled data
check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common);
check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common);
check_ff<PScoreCoherence<Token> >("coh", &m_active_ff_common);
-
- // for these ones either way is possible (specification ends with '+'
- // if corpus-specific
+
+ // for these ones either way is possible (specification ends with '+'
+ // if corpus-specific
check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf);
check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf);
check_ff<PScoreLogCnt<Token> >("logcnt");
-
+
// These are always corpus-specific
check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix);
check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn);
}
-
+
// data source features (copies of phrase and word count specific to
// this translation model)
else if (fsname == "datasource")
@@ -393,7 +427,19 @@ namespace Moses
}
}
// cerr << "Features: " << Join("|",m_feature_names) << endl;
-
+ this->m_numScoreComponents = this->m_feature_names.size();
+ this->m_numTuneableComponents = this->m_numScoreComponents;
+ }
+
+ void
+ Mmsapt::
+ Load(bool with_checks)
+ {
+ // load feature functions (i.e., load underlying data bases, if any)
+ BOOST_FOREACH(sptr<pscorer>& ff, m_active_ff_fix) ff->load();
+ BOOST_FOREACH(sptr<pscorer>& ff, m_active_ff_dyn) ff->load();
+ BOOST_FOREACH(sptr<pscorer>& ff, m_active_ff_common) ff->load();
+#if 0
if (with_checks)
{
UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents,
@@ -402,22 +448,22 @@ namespace Moses
<< ") does not match number specified in Moses config file ("
<< this->m_numScoreComponents << ")!\n";);
}
+#endif
// Load corpora. For the time being, we can have one memory-mapped static
// corpus and one in-memory dynamic corpus
- // sptr<mmbitext> btfix(new mmbitext());
- btfix.num_workers = this->m_workers;
- btfix.open(bname, L1, L2);
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
+
+ btfix.m_num_workers = this->m_workers;
+ btfix.open(m_bname, L1, L2);
btfix.setDefaultSampleSize(m_default_sample_size);
- // shards.push_back(btfix);
-
- btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size));
- btdyn->num_workers = this->m_workers;
- if (bias_file.size())
- load_bias(bias_file);
-
- if (extra_data.size())
- load_extra_data(extra_data,false);
-
+
+ btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size, m_workers));
+ if (m_bias_file.size())
+ load_bias(m_bias_file);
+
+ if (m_extra_data.size())
+ load_extra_data(m_extra_data, false);
+
#if 0
// currently not used
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
@@ -426,7 +472,7 @@ namespace Moses
for (size_t r = 0; r < COOC.numRows; ++r)
for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c)
wlex21[c->id].push_back(r);
- COOCraw.open(bname + L1 + "-" + L2 + ".coc");
+ COOCraw.open(m_bname + L1 + "-" + L2 + ".coc");
#endif
assert(btdyn);
// cerr << "LOADED " << HERE << endl;
@@ -439,23 +485,23 @@ namespace Moses
vector<string> S1(1,s1);
vector<string> S2(1,s2);
vector<string> ALN(1,a);
- boost::lock_guard<boost::mutex> guard(this->lock);
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
btdyn = btdyn->add(S1,S2,ALN);
}
- TargetPhrase*
+ TargetPhrase*
Mmsapt::
mkTPhrase(Phrase const& src,
- PhrasePair<Token>* fix,
- PhrasePair<Token>* dyn,
+ PhrasePair<Token>* fix,
+ PhrasePair<Token>* dyn,
sptr<Bitext<Token> > const& dynbt) const
{
- UTIL_THROW_IF2(!fix && !dyn, HERE <<
+ UTIL_THROW_IF2(!fix && !dyn, HERE <<
": Can't create target phrase from nothing.");
vector<float> fvals(this->m_numScoreComponents);
PhrasePair<Token> pool = fix ? *fix : *dyn;
- if (fix)
+ if (fix)
{
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
(*ff)(btfix, *fix, &fvals);
@@ -465,7 +511,7 @@ namespace Moses
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
(*ff)(*dynbt, *dyn, &fvals);
}
-
+
if (fix && dyn) { pool += *dyn; }
else if (fix)
{
@@ -487,7 +533,7 @@ namespace Moses
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
}
- if (fix)
+ if (fix)
{
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
(*ff)(btfix, pool, &fvals);
@@ -509,38 +555,64 @@ namespace Moses
tp->SetAlignTerm(pool.aln);
tp->GetScoreBreakdown().Assign(this, fvals);
tp->EvaluateInIsolation(src);
+
+ if (m_lr_func)
+ {
+ LRModel::ModelType mdl = m_lr_func->GetModel().GetModelType();
+ LRModel::Direction dir = m_lr_func->GetModel().GetDirection();
+ sptr<Scores> scores(new Scores());
+ pool.fill_lr_vec(dir, mdl, *scores);
+ tp->SetExtraScores(m_lr_func, scores);
+ }
+
return tp;
}
+ void
Mmsapt::
- TargetPhraseCollectionWrapper::
- TargetPhraseCollectionWrapper(size_t r, ::uint64_t k)
- : revision(r), key(k), refCount(0), idx(-1)
- { }
+ GetTargetPhraseCollectionBatch(ttasksptr const& ttask,
+ const InputPathList &inputPathQueue) const
+ {
+ InputPathList::const_iterator iter;
+ for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter)
+ {
+ InputPath &inputPath = **iter;
+ const Phrase &phrase = inputPath.GetPhrase();
+ PrefixExists(ttask, phrase); // launches parallel lookup
+ }
+ for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter)
+ {
+ InputPath &inputPath = **iter;
+ const Phrase &phrase = inputPath.GetPhrase();
+ const TargetPhraseCollection *targetPhrases
+ = this->GetTargetPhraseCollectionLEGACY(ttask,phrase);
+ inputPath.SetTargetPhrases(*this, targetPhrases, NULL);
+ }
+ }
+ TargetPhraseCollection const*
Mmsapt::
- TargetPhraseCollectionWrapper::
- ~TargetPhraseCollectionWrapper()
+ GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
- assert(this->refCount == 0);
+ UTIL_THROW2("Don't call me without the translation task.");
}
-
- // This is not the most efficient way of phrase lookup!
- TargetPhraseCollection const*
+
+ // This is not the most efficient way of phrase lookup!
+ TargetPhraseCollection const*
Mmsapt::
- GetTargetPhraseCollectionLEGACY(const Phrase& src) const
+ GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const
{
// map from Moses Phrase to internal id sequence
- vector<id_type> sphrase;
+ vector<id_type> sphrase;
fillIdSeq(src,input_factor,*(btfix.V1),sphrase);
if (sphrase.size() == 0) return NULL;
-
+
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
sptr<imBitext<Token> > dyn;
{ // braces are needed for scoping mutex lock guard!
- boost::lock_guard<boost::mutex> guard(this->lock);
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
assert(btdyn);
dyn = btdyn;
}
@@ -559,54 +631,52 @@ namespace Moses
<< mdyn.size() << " " << mdyn.getPid() << endl;
#endif
- if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size())
+ if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size())
return NULL; // phrase not found in either bitext
- // cache lookup:
- ::uint64_t phrasekey = (mfix.size() == sphrase.size() ? (mfix.getPid()<<1)
- : (mdyn.getPid()<<1)+1);
- size_t revision = dyn->revision();
- {
- boost::lock_guard<boost::mutex> guard(this->lock);
- tpc_cache_t::iterator c = m_cache.find(phrasekey);
- // TO DO: we should revise the revision mechanism: we take the length
- // of the dynamic bitext (in sentences) at the time the PT entry
- // was stored as the time stamp. For each word in the
- // vocabulary, we also store its most recent occurrence in the
- // bitext. Only if the timestamp of each word in the phrase is
- // newer than the timestamp of the phrase itself we must update
- // the entry.
- if (c != m_cache.end() && c->second->revision == revision)
- return encache(c->second);
- }
-
- // OK: pt entry not found or not up to date
- // lookup and expansion could be done in parallel threds,
+ // do we have cached results for this phrase?
+ uint64_t phrasekey = (mfix.size() == sphrase.size()
+ ? (mfix.getPid()<<1) : (mdyn.getPid()<<1)+1);
+
+ // get context-specific cache of items previously looked up
+ sptr<ContextScope> const& scope = ttask->GetScope();
+ sptr<TPCollCache> cache = scope->get<TPCollCache>(cache_key);
+ TPCollWrapper* ret = cache->get(phrasekey, dyn->revision());
+ // TO DO: we should revise the revision mechanism: we take the length
+ // of the dynamic bitext (in sentences) at the time the PT entry
+ // was stored as the time stamp. For each word in the
+ // vocabulary, we also store its most recent occurrence in the
+ // bitext. Only if the timestamp of each word in the phrase is
+ // newer than the timestamp of the phrase itself we must update
+ // the entry.
+
+ if (ret) return ret; // yes, was cached => DONE
+
+ // OK: pt entry NOT found or NOT up to date
+ // lookup and expansion could be done in parallel threads,
// but ppdyn is probably small anyway
// TO DO: have Bitexts return lists of PhrasePairs instead of pstats
- // no need to expand pstats at every single lookup again, especially
+ // no need to expand pstats at every single lookup again, especially
// for btfix.
sptr<pstats> sfix,sdyn;
- if (mfix.size() == sphrase.size())
- sfix = btfix.lookup(mfix);
- if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn);
+
+ if (mfix.size() == sphrase.size()) sfix = btfix.lookup(ttask, mfix);
+ if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(ttask, mdyn);
vector<PhrasePair<Token> > ppfix,ppdyn;
PhrasePair<Token>::SortByTargetIdSeq sort_by_tgt_id;
- if (sfix)
+ if (sfix)
{
- expand(mfix, btfix, *sfix, ppfix);
+ expand(mfix, btfix, *sfix, ppfix, m_bias_log);
sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id);
}
if (sdyn)
{
- expand(mdyn, *dyn, *sdyn, ppdyn);
+ expand(mdyn, *dyn, *sdyn, ppdyn, m_bias_log);
sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id);
}
-
// now we have two lists of Phrase Pairs, let's merge them
- TargetPhraseCollectionWrapper* ret;
- ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
+ ret = new TPCollWrapper(dyn->revision(), phrasekey);
PhrasePair<Token>::SortByTargetIdSeq sorter;
size_t i = 0; size_t k = 0;
while (i < ppfix.size() && k < ppdyn.size())
@@ -620,9 +690,24 @@ namespace Moses
while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
if (m_tableLimit) ret->Prune(true, m_tableLimit);
else ret->Prune(true,ret->GetSize());
+
+#if 1
+ if (m_bias_log && m_lr_func)
+ {
+ typename PhrasePair<Token>::SortDescendingByJointCount sorter;
+ sort(ppfix.begin(), ppfix.end(),sorter);
+ BOOST_FOREACH(PhrasePair<Token> const& pp, ppfix)
+ {
+ if (&pp != &ppfix.front() && pp.joint <= 1) break;
+ pp.print(*m_bias_log,*btfix.V1, *btfix.V2, m_lr_func->GetModel());
+ }
+ }
+#endif
+
+
#if 0
- if (combine_pstats(src,
- mfix.getPid(), sfix.get(), btfix,
+ if (combine_pstats(src,
+ mfix.getPid(), sfix.get(), btfix,
mdyn.getPid(), sdyn.get(), *dyn, ret))
{
#if 0
@@ -643,12 +728,12 @@ namespace Moses
#endif
// put the result in the cache and return
- boost::lock_guard<boost::mutex> guard(this->lock);
- m_cache[phrasekey] = ret;
- return encache(ret);
+
+ cache->add(phrasekey, ret);
+ return ret;
}
- size_t
+ size_t
Mmsapt::
SetTableLimit(size_t limit)
{
@@ -658,7 +743,7 @@ namespace Moses
void
Mmsapt::
- CleanUpAfterSentenceProcessing(const InputType& source)
+ CleanUpAfterSentenceProcessing(ttasksptr const& ttask)
{ }
@@ -677,155 +762,79 @@ namespace Moses
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
- void
+ void
Mmsapt::
- InitializeForInput(InputType const& source)
- {
- // assert(0);
- }
-
-#if defined(timespec)
- bool operator<(timespec const& a, timespec const& b)
- {
- if (a.tv_sec != b.tv_sec) return a.tv_sec < b.tv_sec;
- return (a.tv_nsec < b.tv_nsec);
- }
-
- bool operator>=(timespec const& a, timespec const& b)
- {
- if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec;
- return (a.tv_nsec >= b.tv_nsec);
- }
-#endif
-
- bool operator<(timeval const& a, timeval const& b)
+ InitializeForInput(ttasksptr const& ttask)
{
- if (a.tv_sec != b.tv_sec) return a.tv_sec < b.tv_sec;
- return (a.tv_usec < b.tv_usec);
- }
-
- bool operator>=(timeval const& a, timeval const& b)
- {
- if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec;
- return (a.tv_usec >= b.tv_usec);
- }
-
- void
- bubble_up(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
- {
- if (k >= v.size()) return;
- for (;k && (v[k]->tstamp < v[k/2]->tstamp); k /=2)
- {
- std::swap(v[k],v[k/2]);
- std::swap(v[k]->idx,v[k/2]->idx);
+ sptr<ContextScope> const& scope = ttask->GetScope();
+ sptr<ContextForQuery> context
+ = scope->get<ContextForQuery>(&btfix, true);
+ if (m_bias_server.size() && context->bias == NULL)
+ { // we need to create the bias
+ boost::unique_lock<boost::shared_mutex> lock(context->lock);
+ string const& context_words = ttask->GetContextString();
+ if (context_words.size())
+ {
+ if (m_bias_log)
+ {
+ *m_bias_log << HERE << endl
+ << "BIAS LOOKUP CONTEXT: "
+ << context_words << endl;
+ context->bias_log = m_bias_log;
+ }
+ context->bias
+ = btfix.SetupDocumentBias(m_bias_server, context_words, m_bias_log);
+ context->bias->loglevel = m_bias_loglevel;
+ context->bias->log = m_bias_log;
+ }
+ if (!context->cache1) context->cache1.reset(new pstats::cache_t);
+ if (!context->cache2) context->cache2.reset(new pstats::cache_t);
}
- }
-
- void
- bubble_down(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
- {
- for (size_t j = 2*(k+1); j <= v.size(); j = 2*((k=j)+1))
+ boost::unique_lock<boost::shared_mutex> mylock(m_lock);
+ sptr<TPCollCache> localcache = scope->get<TPCollCache>(cache_key);
+ if (!localcache)
{
- if (j == v.size() || (v[j-1]->tstamp < v[j]->tstamp)) --j;
- if (v[j]->tstamp >= v[k]->tstamp) break;
- std::swap(v[k],v[j]);
- v[k]->idx = k;
- v[j]->idx = j;
+ if (context->bias) localcache.reset(new TPCollCache(m_cache_size));
+ else localcache = m_cache;
+ scope->set<TPCollCache>(cache_key, localcache);
}
- }
-
- void
- Mmsapt::
- decache(TargetPhraseCollectionWrapper* ptr) const
- {
- if (ptr->refCount || ptr->idx >= 0) return;
- // if (t.tv_nsec < v[0]->tstamp.tv_nsec)
-#if 0
- timespec t; clock_gettime(CLOCK_MONOTONIC,&t);
- timespec r; clock_getres(CLOCK_MONOTONIC,&r);
- float delta = t.tv_sec - ptr->tstamp.tv_sec;
- cerr << "deleting old cache entry after "
- << delta << " seconds."
- << " clock resolution is " << r.tv_sec << ":" << r.tv_nsec
- << " at " << __FILE__ << ":" << __LINE__ << endl;
-#endif
- tpc_cache_t::iterator m = m_cache.find(ptr->key);
- if (m != m_cache.end())
- if (m->second == ptr)
- m_cache.erase(m);
- delete ptr;
- --m_tpc_ctr;
- }
-
- Mmsapt::
- TargetPhraseCollectionWrapper*
- Mmsapt::
- encache(TargetPhraseCollectionWrapper* ptr) const
- {
- // Calling process must lock for thread safety!!
- if (!ptr) return NULL;
- ++ptr->refCount;
- ++m_tpc_ctr;
-#if defined(timespec)
- clock_gettime(CLOCK_MONOTONIC, &ptr->tstamp);
-#else
- gettimeofday(&ptr->tstamp, NULL);
-#endif
- // update history
- if (m_history.capacity() > 1)
- {
- vector<TargetPhraseCollectionWrapper*>& v = m_history;
- if (ptr->idx >= 0) // ptr is already in history
- {
- assert(ptr == v[ptr->idx]);
- size_t k = 2 * (ptr->idx + 1);
- if (k < v.size()) bubble_up(v,k--);
- if (k < v.size()) bubble_up(v,k);
- }
- else if (v.size() < v.capacity())
- {
- size_t k = ptr->idx = v.size();
- v.push_back(ptr);
- bubble_up(v,k);
- }
- else
- {
- v[0]->idx = -1;
- decache(v[0]);
- v[0] = ptr;
- bubble_down(v,0);
- }
+ if (m_lr_func_name.size() && m_lr_func == NULL)
+ {
+ FeatureFunction* lr = &FeatureFunction::FindFeatureFunction(m_lr_func_name);
+ m_lr_func = dynamic_cast<LexicalReordering*>(lr);
+ UTIL_THROW_IF2(lr == NULL, "FF " << m_lr_func_name
+ << " does not seem to be a lexical reordering function!");
+ // todo: verify that lr_func implements a hierarchical reordering model
}
- return ptr;
}
- bool
- Mmsapt::
- PrefixExists(Moses::Phrase const& phrase) const
- {
- return PrefixExists(phrase,NULL);
- }
+ // bool
+ // Mmsapt::
+ // PrefixExists(Moses::Phrase const& phrase) const
+ // {
+ // return PrefixExists(phrase,NULL);
+ // }
bool
Mmsapt::
- PrefixExists(Moses::Phrase const& phrase, SamplingBias const* const bias) const
+ PrefixExists(ttasksptr const& ttask, Moses::Phrase const& phrase) const
{
if (phrase.GetSize() == 0) return false;
- vector<id_type> myphrase;
+ vector<id_type> myphrase;
fillIdSeq(phrase,input_factor,*btfix.V1,myphrase);
-
+
TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size());
- if (mfix.size() == myphrase.size())
+ if (mfix.size() == myphrase.size())
{
- btfix.prep(mfix,bias);
+ btfix.prep(ttask, mfix);
// cerr << phrase << " " << mfix.approxOccurrenceCount() << endl;
return true;
}
sptr<imBitext<Token> > dyn;
- { // braces are needed for scoping mutex lock guard!
- boost::lock_guard<boost::mutex> guard(this->lock);
+ { // braces are needed for scoping lock!
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
dyn = btdyn;
}
assert(dyn);
@@ -835,46 +844,37 @@ namespace Moses
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
// let's assume a uniform bias over the foreground corpus
- if (mdyn.size() == myphrase.size()) dyn->prep(mdyn,NULL);
+ if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn);
}
return mdyn.size() == myphrase.size();
}
void
- Mmsapt::
- Release(TargetPhraseCollection const* tpc) const
+ Mmsapt
+ ::Release(ttasksptr const& ttask, TargetPhraseCollection*& tpc) const
{
- if (!tpc) return;
- boost::lock_guard<boost::mutex> guard(this->lock);
- TargetPhraseCollectionWrapper* ptr
- = (reinterpret_cast<TargetPhraseCollectionWrapper*>
- (const_cast<TargetPhraseCollection*>(tpc)));
- if (--ptr->refCount == 0 && ptr->idx < 0)
- decache(ptr);
-#if 0
- cerr << ptr->refCount << " references at "
- << __FILE__ << ":" << __LINE__
- << "; " << m_tpc_ctr << " TPC references still in circulation; "
- << m_history.size() << " instances in history."
- << endl;
-#endif
+ sptr<TPCollCache> cache = ttask->GetScope()->get<TPCollCache>(cache_key);
+ TPCollWrapper* foo = static_cast<TPCollWrapper*>(tpc);
+ if (cache) cache->release(foo);
+ tpc = NULL;
}
- bool
- Mmsapt::
- ProvidesPrefixCheck() const
- { return true; }
+ bool Mmsapt
+ ::ProvidesPrefixCheck() const { return true; }
- string const&
- Mmsapt::
- GetName() const
- { return m_name; }
+ string const& Mmsapt
+ ::GetName() const { return m_name; }
- sptr<DocumentBias>
- Mmsapt::
- setupDocumentBias(map<string,float> const& bias) const
- {
- return btfix.setupDocumentBias(bias);
- }
+ // sptr<DocumentBias>
+ // Mmsapt
+ // ::setupDocumentBias(map<string,float> const& bias) const
+ // {
+ // return btfix.SetupDocumentBias(bias);
+ // }
+
+ vector<float>
+ Mmsapt
+ ::DefaultWeights() const
+ { return vector<float>(this->GetNumScoreComponents(), 1.); }
}
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index 23258475a..5f688cfd8 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -3,7 +3,6 @@
// Design and code by Ulrich Germann.
#pragma once
-#include <ctime>
#include <boost/thread.hpp>
#include <boost/scoped_ptr.hpp>
@@ -19,9 +18,12 @@
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
-#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
+#include "moses/TranslationModel/UG/TargetPhraseCollectionCache.h"
+
+#include "moses/FF/LexicalReordering/LexicalReordering.h"
+
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
@@ -36,23 +38,23 @@
// TO DO:
// - make lexical phrase scorer take addition to the "dynamic overlay" into account
// - switch to pool of sapts, where each sapt has its own provenance feature
-// RESEARCH QUESTION: is this more effective than having multiple phrase tables,
+// RESEARCH QUESTION: is this more effective than having multiple phrase tables,
// each with its own set of features?
-using namespace std;
namespace Moses
{
using namespace bitext;
- class Mmsapt
+ class Mmsapt
#ifndef NO_MOSES
: public PhraseDictionary
#endif
{
+ // using namespace std;
+ class TPCOllCache;
friend class Alignment;
- map<string,string> param;
- sptr<SamplingBias> m_bias;
- string m_name;
- public:
+ std::map<std::string,std::string> param;
+ std::string m_name;
+ public:
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
typedef imBitext<Token> imbitext;
@@ -61,235 +63,185 @@ namespace Moses
typedef PhraseScorer<Token> pscorer;
private:
// vector<sptr<bitext> > shards;
- mmbitext btfix;
- sptr<imbitext> btdyn;
- string bname,extra_data,bias_file;
- string L1;
- string L2;
+ mmbitext btfix;
+ sptr<imbitext> btdyn;
+ std::string m_bname, m_extra_data, m_bias_file,m_bias_server;
+ std::string L1;
+ std::string L2;
float m_lbop_conf; // confidence level for lbop smoothing
float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing
// alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
- // must be > 0 if dynamic
+ // must be > 0 if dynamic
size_t m_default_sample_size;
size_t m_workers; // number of worker threads for sampling the bitexts
- vector<string> m_feature_set_names; // one or more of: standard, datasource
-
-
-
-
- // // deprecated!
- // char m_pfwd_denom; // denominator for computation of fwd phrase score:
- // // 'r' - divide by raw count
- // // 's' - divide by sample count
- // // 'g' - devide by number of "good" (i.e. coherent) samples
- // // size_t num_features;
-
- size_t input_factor;
+ std::vector<std::string> m_feature_set_names; // one or more of: standard, datasource
+ std::string m_bias_logfile;
+ boost::scoped_ptr<ofstream> m_bias_logger; // for logging to a file
+ ostream* m_bias_log;
+ int m_bias_loglevel;
+ LexicalReordering* m_lr_func; // associated lexical reordering function
+ string m_lr_func_name; // name of associated lexical reordering function
+ public:
+ void* const cache_key; // for getting cache from ttask
+ void* const context_key; // for context scope from ttask
+ private:
+ boost::shared_ptr<SamplingBias> m_bias; // for global default bias
+ boost::shared_ptr<TPCollCache> m_cache; // for global default bias
+ size_t m_cache_size; //
+ size_t input_factor; //
size_t output_factor; // we can actually return entire Tokens!
- // bool withLogCountFeatures; // add logs of counts as features?
- // bool withCoherence;
- // string m_pfwd_features; // which pfwd functions to use
- // string m_pbwd_features; // which pbwd functions to use
-
// for display for human inspection (ttable dumps):
- vector<string> m_feature_names; // names of features activated
- vector<bool> m_is_logval; // keeps track of which features are log valued
- vector<bool> m_is_integer; // keeps track of which features are integer valued
+ std::vector<std::string> m_feature_names; // names of features activated
+ std::vector<bool> m_is_logval; // keeps track of which features are log valued
+ std::vector<bool> m_is_integer; // keeps track of which features are integer valued
- vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
- vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
- vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn)
+ std::vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
+ std::vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
+ std::vector<sptr<pscorer > > m_active_ff_common;
+ // activated feature functions (dyn)
void
- register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry);
+ register_ff(sptr<pscorer> const& ff, std::vector<sptr<pscorer> > & registry);
template<typename fftype>
- void
- check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL);
- // add feature function if specified
-
+ void
+ check_ff(std::string const ffname,std::vector<sptr<pscorer> >* registry = NULL);
+ // add feature function if specified
+
template<typename fftype>
- void
- check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL);
+ void
+ check_ff(std::string const ffname, float const xtra,
+ std::vector<sptr<pscorer> >* registry = NULL);
// add feature function if specified
void
- add_corpus_specific_features(vector<sptr<pscorer > >& ffvec);
-
+ add_corpus_specific_features(std::vector<sptr<pscorer > >& ffvec);
+
// built-in feature functions
// PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
// PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
- // PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
- // PScorePC<Token> apply_pp; // apply phrase penalty
+ // PScoreLex<Token> calc_lex;
+ // this one I'd like to see as an external ff eventually
+ // PScorePC<Token> apply_pp; // apply phrase penalty
// PScoreLogCounts<Token> add_logcounts_fix;
// PScoreLogCounts<Token> add_logcounts_dyn;
- void init(string const& line);
- mutable boost::mutex lock;
+ void init(std::string const& line);
+ mutable boost::shared_mutex m_lock;
+ // mutable boost::shared_mutex m_cache_lock;
+ // for more complex operations on the cache
bool withPbwd;
bool poolCounts;
- vector<FactorType> ofactor;
+ std::vector<FactorType> ofactor;
-
- public:
- // typedef boost::unordered_map<uint64_t, sptr<TargetPhraseCollection> > tpcoll_cache_t;
- class TargetPhraseCollectionWrapper
- : public TargetPhraseCollection
- {
- public:
- size_t const revision; // time stamp from dynamic bitext
- ::uint64_t const key; // phrase key
- uint32_t refCount; // reference count
-#if defined(timespec)
- timespec tstamp; // last use
-#else
- timeval tstamp; // last use
-#endif
- int idx; // position in history heap
- TargetPhraseCollectionWrapper(size_t r, ::uint64_t const k);
- ~TargetPhraseCollectionWrapper();
- };
+ void setup_local_feature_functions();
private:
- void read_config_file(string fname, map<string,string>& param);
-
- TargetPhraseCollectionWrapper*
- encache(TargetPhraseCollectionWrapper* const ptr) const;
-
- void
- decache(TargetPhraseCollectionWrapper* ptr) const;
+ void read_config_file(std::string fname, std::map<std::string,std::string>& param);
- typedef map<typename ::uint64_t, TargetPhraseCollectionWrapper*> tpc_cache_t;
- mutable tpc_cache_t m_cache;
- mutable vector<TargetPhraseCollectionWrapper*> m_history;
// phrase table feature weights for alignment:
- vector<float> feature_weights;
+ std::vector<float> feature_weights;
- vector<vector<id_type> > wlex21;
+ std::vector<std::vector<id_type> > wlex21;
// word translation lexicon (without counts, get these from calc_lex.COOC)
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> mm2dtable_t;
mm2dtable_t COOCraw;
- TargetPhrase*
- mkTPhrase(Phrase const& src,
- Moses::bitext::PhrasePair<Token>* fix,
- Moses::bitext::PhrasePair<Token>* dyn,
+ TargetPhrase*
+ mkTPhrase(Phrase const& src,
+ Moses::bitext::PhrasePair<Token>* fix,
+ Moses::bitext::PhrasePair<Token>* dyn,
sptr<Bitext<Token> > const& dynbt) const;
- // template<typename Token>
- // void
- // expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt,
- // pstats const& pstats, vector<PhrasePair<Token> >& dest);
-
-#if 0
- TargetPhrase*
- mkTPhrase
- (Phrase const& src,
- Bitext<Token> const& bt,
- Moses::bitext::PhrasePair const& pp
- ) const;
-#endif
void
process_pstats
(Phrase const& src,
- ::uint64_t const pid1,
- pstats const& stats,
- Bitext<Token> const & bt,
+ uint64_t const pid1,
+ pstats const& stats,
+ Bitext<Token> const & bt,
TargetPhraseCollection* tpcoll
) const;
bool
pool_pstats
(Phrase const& src,
- ::uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
- ::uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
+ uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
+ uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll) const;
-
+
bool
combine_pstats
- (Phrase const& src,
- ::uint64_t const pid1a, pstats* statsa, Bitext<Token> const & bta,
- ::uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
+ (Phrase const& src,
+ uint64_t const pid1a, pstats* statsa, Bitext<Token> const & bta,
+ uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll) const;
- void
- load_extra_data(string bname, bool locking);
-
- void
- load_bias(string bname);
+ void load_extra_data(std::string bname, bool locking);
+ void load_bias(std::string bname);
- mutable size_t m_tpc_ctr;
public:
- // Mmsapt(string const& description, string const& line);
- Mmsapt(string const& line);
+ // Mmsapt(std::string const& description, std::string const& line);
+ Mmsapt(std::string const& line);
- void
- Load();
-
- void
- Load(bool with_checks);
-
- // returns the prior table limit
- size_t SetTableLimit(size_t limit);
-
- string const&
- GetName() const;
+ void Load();
+ void Load(bool with_checks);
+ size_t SetTableLimit(size_t limit); // returns the prior table limit
+ std::string const& GetName() const;
#ifndef NO_MOSES
- TargetPhraseCollection const*
+ TargetPhraseCollection const*
+ GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const;
+
+ TargetPhraseCollection const*
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
+
+ void
+ GetTargetPhraseCollectionBatch(ttasksptr const& ttask,
+ const InputPathList &inputPathQueue) const;
+
//! Create a sentence-specific manager for SCFG rule lookup.
ChartRuleLookupManager*
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
ChartRuleLookupManager*
- CreateRuleLookupManager
- (const ChartParser &, const ChartCellCollectionBase &, std::size_t);
+ CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &,
+ std::size_t);
#endif
- void add(string const& s1, string const& s2, string const& a);
+ void add(std::string const& s1, std::string const& s2, std::string const& a);
+ // add a new sentence pair to the dynamic bitext
- // align two new sentences
- sptr<vector<int> >
- align(string const& src, string const& trg) const;
+ void setWeights(std::vector<float> const& w);
- void setWeights(vector<float> const& w);
- void
- CleanUpAfterSentenceProcessing(const InputType& source);
+ void Release(ttasksptr const& ttask, TargetPhraseCollection*& tpc) const;
+ // some consumer lets me know that *tpc isn't needed any more
- void
- InitializeForInput(InputType const& source);
- void
- Release(TargetPhraseCollection const* tpc) const;
+ bool ProvidesPrefixCheck() const; // return true if prefix /phrase/ check exists
+ // bool PrefixExists(Phrase const& phrase, SamplingBias const* const bias) const;
+ bool PrefixExists(ttasksptr const& ttask, Phrase const& phrase) const;
- bool
- ProvidesPrefixCheck() const;
-
- /// return true if prefix /phrase/ exists
- bool
- PrefixExists(Phrase const& phrase, SamplingBias const* const bias) const;
+ bool isLogVal(int i) const;
+ bool isInteger(int i) const;
- bool
- PrefixExists(Phrase const& phrase) const;
+ // task setup and takedown functions
+ void InitializeForInput(ttasksptr const& ttask);
+ // void CleanUpAfterSentenceProcessing(const InputType& source);
+ void CleanUpAfterSentenceProcessing(ttasksptr const& ttask);
- vector<string> const&
- GetFeatureNames() const;
-
- // void
- // ScorePPfix(bitext::PhrasePair& pp) const;
+ // align two new sentences
+ sptr<std::vector<int> >
+ align(std::string const& src, std::string const& trg) const;
- bool
- isLogVal(int i) const;
-
- bool
- isInteger(int i) const;
+ std::vector<std::string> const&
+ GetFeatureNames() const;
sptr<DocumentBias>
- setupDocumentBias(map<string,float> const& bias) const;
- private:
+ setupDocumentBias(std::map<std::string,float> const& bias) const;
+
+ vector<float> DefaultWeights() const;
};
} // end namespace
diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc
index 65cf979e1..13d8387d2 100644
--- a/moses/TranslationModel/UG/mmsapt_align.cc
+++ b/moses/TranslationModel/UG/mmsapt_align.cc
@@ -6,7 +6,7 @@
// using namespace bitext;
// using namespace std;
// using namespace boost;
-
+
// struct PPgreater
// {
// bool operator()(PhrasePair const& a, PhrasePair const& b)
@@ -28,7 +28,7 @@
// PhrasePair pp;
// ushort s1,e1,s2,e2; // start and end positions
// int prev; // preceding alignment hypothesis
-// float score;
+// float score;
// bitvector scov; // source coverage
// PhraseAlnHyp(PhrasePair const& ppx, int slen,
// pair<uint32_t,uint32_t> const& sspan,
@@ -37,7 +37,7 @@
// {
// s1 = sspan.first; e1 = sspan.second;
// s2 = tspan.first; e2 = tspan.second;
-// for (size_t i = s1; i < e1; ++i)
+// for (size_t i = s1; i < e1; ++i)
// scov.set(i);
// }
@@ -78,13 +78,13 @@
// return po_other;
// }
-// float
+// float
// dprob_fwd(PhraseAlnHyp const& next)
// {
// return pp.dfwd[po_fwd(&next)];
// }
-// float
+// float
// dprob_bwd(PhraseAlnHyp const& prev)
// {
// return pp.dbwd[po_bwd(&prev)];
@@ -102,15 +102,15 @@
// typedef pstats::trg_map_t jStatsTable;
// Mmsapt const& PT;
-// vector<id_type> s,t;
+// vector<id_type> s,t;
// pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
// pid2span_t spid2span,tpid2span;
// vector<vector<sptr<pstats> > > spstats;
-// vector<PhrasePair> PP;
+// vector<PhrasePair> PP;
// // position-independent phrase pair info
// public:
-// vector<PhraseAlnHyp> PAH;
+// vector<PhraseAlnHyp> PAH;
// vector<vector<int> > tpos2ahyp;
// // maps from target start positions to PhraseAlnHyps starting at
// // that position
@@ -120,8 +120,8 @@
// void fill_sspan_maps();
// public:
// Alignment(Mmsapt const& pt, string const& src, string const& trg);
-// void show(ostream& out);
-// void show(ostream& out, PhraseAlnHyp const& ah);
+// void show(ostream& out);
+// void show(ostream& out, PhraseAlnHyp const& ah);
// };
// void
@@ -129,11 +129,11 @@
// show(ostream& out, PhraseAlnHyp const& ah)
// {
// #if 0
-// LexicalPhraseScorer2<Token>::table_t const&
+// LexicalPhraseScorer2<Token>::table_t const&
// COOCjnt = PT.calc_lex.scorer.COOC;
// out << setw(10) << exp(ah.score) << " "
-// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
+// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
// << " <=> "
// << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
// vector<uchar> const& a = ah.pp.aln;
@@ -168,7 +168,7 @@
// // << "]" << endl;
// #endif
// }
-
+
// void
// Alignment::
// show(ostream& out)
@@ -192,7 +192,7 @@
// return spstats[sspan.first][k];
// else return sptr<pstats>();
// }
-
+
// void
// Alignment::
// fill_tspan_maps()
@@ -207,7 +207,7 @@
// tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
// tspan2pid[i][k] = pid;
// }
-// }
+// }
// }
// void
@@ -230,11 +230,11 @@
// int y = p->second[0].second-1;
// spstats[i].push_back(spstats[x][y-x]);
// }
-// else
+// else
// {
// spstats[i].push_back(PT.btfix.lookup(m));
// cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
-// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
+// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
// << endl;
// }
// spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
@@ -262,14 +262,14 @@
// // size_t m2 = COOC.m2(i);
// // if (j*1000 > m1 && j*1000 > m2)
// // cout << " " << (*PT.btfix.V1)[k];
-// // }
+// // }
// // }
// // cout << endl;
// // }
-
+
// fill_tspan_maps();
// fill_sspan_maps();
-// tpos2ahyp.resize(t.size());
+// tpos2ahyp.resize(t.size());
// // now fill the association score table
// PAH.reserve(1000000);
// typedef pid2span_t::iterator psiter;
@@ -301,12 +301,12 @@
// }
// }
-
+
// int
// extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
// {
-// if ((PAH[edge].scov & PAH[next].scov).count())
+// if ((PAH[edge].scov & PAH[next].scov).count())
// return -1;
// int ret = PAH.size();
// PAH.push_back(PAH[next]);
diff --git a/moses/TranslationModel/UG/ptable-describe-features.cc b/moses/TranslationModel/UG/ptable-describe-features.cc
index dbd5accb9..c9dd3abd1 100644
--- a/moses/TranslationModel/UG/ptable-describe-features.cc
+++ b/moses/TranslationModel/UG/ptable-describe-features.cc
@@ -19,7 +19,7 @@ int main()
{
if (line.empty()) continue;
size_t k = line.find_first_not_of(" ");
- if (line.find("Mmsapt") != k &&
+ if (line.find("Mmsapt") != k &&
line.find("PhraseDictionaryBitextSampling") != k)
continue;
Mmsapt PT(line);
@@ -32,6 +32,6 @@ int main()
}
exit(0);
}
-
-
+
+
diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc
index 2cbf89b16..94627a02c 100644
--- a/moses/TranslationModel/UG/ptable-lookup.cc
+++ b/moses/TranslationModel/UG/ptable-lookup.cc
@@ -19,13 +19,13 @@ class SimplePhrase : public Moses::Phrase
vector<FactorType> const m_fo; // factor order
public:
SimplePhrase(): m_fo(1,FactorType(0)) {}
-
- void init(string const& s)
+
+ void init(string const& s)
{
istringstream buf(s); string w;
- while (buf >> w)
+ while (buf >> w)
{
- Word wrd;
+ Word wrd;
this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false);
}
}
@@ -63,15 +63,15 @@ int main(int argc, char* argv[])
cerr << "Phrase table implementation not supported by this utility." << endl;
exit(1);
}
-
+
string line;
while (true)
{
Sentence phrase;
if (!phrase.Read(cin,ifo)) break;
- if (pdta)
+ if (pdta)
{
- pdta->InitializeForInput(phrase);
+ pdta->InitializeForInput(phrase);
// do we also need to call CleanupAfterSentenceProcessing at the end?
}
Phrase& p = phrase;
@@ -79,13 +79,13 @@ int main(int argc, char* argv[])
cout << p << endl;
TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p);
if (!trg) continue;
- vector<size_t> order(trg->GetSize());
+ vector<size_t> order(trg->GetSize());
for (size_t i = 0; i < order.size(); ++i) order[i] = i;
sort(order.begin(),order.end(),TargetPhraseIndexSorter(*trg));
size_t k = 0;
- // size_t precision =
+ // size_t precision =
cout.precision(2);
-
+
vector<string> fname;
if (mmsapt)
{
@@ -101,12 +101,13 @@ int main(int argc, char* argv[])
Phrase const& phr = static_cast<Phrase const&>(*(*trg)[i]);
cout << setw(3) << ++k << " " << phr << endl;
ScoreComponentCollection const& scc = (*trg)[i]->GetScoreBreakdown();
- ScoreComponentCollection::IndexPair idx = scc.GetIndexes(PT);
+ size_t start = PT->GetIndex();
+ size_t stop = start + PT->GetNumScoreComponents();
FVector const& scores = scc.GetScoresVector();
cout << " ";
- for (size_t k = idx.first; k < idx.second; ++k)
+ for (size_t k = start; k < stop; ++k)
{
- size_t j = k-idx.first;
+ size_t j = k-start;
float f = (mmsapt ? mmsapt->isLogVal(j) ? exp(scores[k]) : scores[k]
: scores[k] < 0 ? exp(scores[k]) : scores[k]);
string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
@@ -118,6 +119,6 @@ int main(int argc, char* argv[])
}
exit(0);
}
-
-
+
+
diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h
index e1ecf1573..0caf11e43 100644
--- a/moses/TranslationModel/UG/sapt_phrase_key.h
+++ b/moses/TranslationModel/UG/sapt_phrase_key.h
@@ -8,6 +8,6 @@ namespace sapt
using namespace Moses;
using namespace std;
-
+
}
diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h
index 9870ed7f0..ace907d73 100644
--- a/moses/TranslationModel/UG/sapt_phrase_scorers.h
+++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h
@@ -1,6 +1,6 @@
// -*- c++ -*-
// Phrase scoring functions for suffix array-based phrase tables
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "sapt_pscore_unaligned.h" // count # of unaligned words
#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus
diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h
index 68a491145..388c83d9b 100644
--- a/moses/TranslationModel/UG/sapt_pscore_base.h
+++ b/moses/TranslationModel/UG/sapt_pscore_base.h
@@ -1,9 +1,8 @@
// -*- c++ -*-
// Base classes for suffix array-based phrase scorers
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
-#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "util/exception.hh"
#include "boost/format.hpp"
@@ -22,68 +21,72 @@ namespace Moses {
string m_tag;
vector<string> m_feature_names;
public:
-
- virtual
- void
- operator()(Bitext<Token> const& pt,
- PhrasePair<Token>& pp,
- vector<float> * dest=NULL)
+
+ virtual
+ void
+ operator()(Bitext<Token> const& pt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest=NULL)
const = 0;
void
setIndex(int const i) { m_index = i; }
-
+
int
getIndex() const { return m_index; }
- int
+ int
fcnt() const { return m_num_feats; }
-
+
vector<string> const &
fnames() const { return m_feature_names; }
string const &
fname(int i) const
- {
+ {
if (i < 0) i += m_num_feats;
UTIL_THROW_IF2(i < 0 || i >= m_num_feats,
"Feature name index out of range at " << HERE);
- return m_feature_names.at(i);
+ return m_feature_names.at(i);
}
virtual
bool
- isLogVal(int i) const { return true; };
- // is this feature log valued?
-
+ isLogVal(int i) const { return true; };
+ // is this feature log valued?
+
virtual
bool
- isIntegerValued(int i) const { return false; };
- // is this feature integer valued (e.g., count features)?
+ isIntegerValued(int i) const { return false; };
+ // is this feature integer valued (e.g., count features)?
virtual
bool
allowPooling() const { return true; }
- // does this feature function allow pooling of counts if
+ // does this feature function allow pooling of counts if
// there are no occurrences in the respective corpus?
-
+
+ virtual
+ void
+ load() { }
+
};
- // base class for 'families' of phrase scorers that have a single
+ // base class for 'families' of phrase scorers that have a single
template<typename Token>
class
- SingleRealValuedParameterPhraseScorerFamily
+ SingleRealValuedParameterPhraseScorerFamily
: public PhraseScorer<Token>
{
protected:
vector<float> m_x;
- virtual
- void
- init(string const specs)
- {
+ virtual
+ void
+ init(string const specs)
+ {
using namespace boost;
- UTIL_THROW_IF2(this->m_tag.size() == 0,
+ UTIL_THROW_IF2(this->m_tag.size() == 0,
"m_tag must be initialized in constructor");
UTIL_THROW_IF2(specs.size() == 0,"empty specification string!");
UTIL_THROW_IF2(this->m_feature_names.size(),
diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h
index a3211df54..c201c9651 100644
--- a/moses/TranslationModel/UG/sapt_pscore_coherence.h
+++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h
@@ -1,5 +1,5 @@
// -*- c++ -*-
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
@@ -13,16 +13,16 @@ namespace Moses {
PScoreCoherence : public PhraseScorer<Token>
{
public:
- PScoreCoherence(string const dummy)
- {
+ PScoreCoherence(string const dummy)
+ {
this->m_index = -1;
this->m_num_feats = 1;
this->m_feature_names.push_back(string("coherence"));
}
-
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h
index be994b0d3..76ca2a9a4 100644
--- a/moses/TranslationModel/UG/sapt_pscore_lex1.h
+++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h
@@ -1,6 +1,6 @@
// -*- c++ -*-
// Phrase scorer that counts the number of unaligend words in the phrase
-// written by Ulrich Germann
+// written by Ulrich Germann
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "sapt_pscore_base.h"
@@ -13,24 +13,31 @@ namespace Moses {
class
PScoreLex1 : public PhraseScorer<Token>
{
- float m_alpha;
+ float m_alpha;
+ string m_lexfile;
public:
LexicalPhraseScorer2<Token> scorer;
-
- PScoreLex1(string const& alpaspec, string const& lexfile)
- {
+
+ PScoreLex1(string const& alphaspec, string const& lexfile)
+ {
this->m_index = -1;
- this->m_num_feats = 2;
+ this->m_num_feats = 2;
this->m_feature_names.reserve(2);
this->m_feature_names.push_back("lexfwd");
this->m_feature_names.push_back("lexbwd");
- m_alpha = atof(alpaspec.c_str());
- scorer.open(lexfile);
+ m_alpha = atof(alphaspec.c_str());
+ m_lexfile = lexfile;
+ }
+
+ void
+ load()
+ {
+ scorer.open(m_lexfile);
}
-
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
@@ -41,27 +48,27 @@ namespace Moses {
cout << len1 << " " << len2 << endl;
Token const* t1 = bt.T1->sntStart(sid1);
for (size_t i = off1; i < off1 + len1; ++i)
- cout << (*bt.V1)[t1[i].id()] << " ";
+ cout << (*bt.V1)[t1[i].id()] << " ";
cout << __FILE__ << ":" << __LINE__ << endl;
-
+
Token const* t2 = bt.T2->sntStart(sid2);
for (size_t i = off2; i < off2 + len2; ++i)
- cout << (*bt.V2)[t2[i].id()] << " ";
+ cout << (*bt.V2)[t2[i].id()] << " ";
cout << __FILE__ << ":" << __LINE__ << endl;
-
+
BOOST_FOREACH (int a, pp.aln)
cout << a << " " ;
cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
-
+
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
bt.T2->sntStart(sid2)+off2,0,len2,
pp.aln, m_alpha,
(*dest)[this->m_index],
(*dest)[this->m_index+1]);
#endif
- scorer.score(pp.start1,0, pp.len1,
- pp.start2,0, pp.len2, pp.aln, m_alpha,
- (*dest)[this->m_index],
+ scorer.score(pp.start1,0, pp.len1,
+ pp.start2,0, pp.len2, pp.aln, m_alpha,
+ (*dest)[this->m_index],
(*dest)[this->m_index+1]);
}
};
diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
index 2790323ed..9dc5ac7ba 100644
--- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h
+++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
@@ -2,7 +2,7 @@
// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
// with the asymptotic function x/(j+x) where x > 0 is a function
// parameter that determines the steepness of the rewards curve
-// written by Ulrich Germann
+// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
@@ -10,15 +10,15 @@
using namespace std;
namespace Moses {
namespace bitext {
-
+
template<typename Token>
class
PScoreLogCnt : public PhraseScorer<Token>
{
string m_specs;
public:
- PScoreLogCnt(string const specs)
- {
+ PScoreLogCnt(string const specs)
+ {
this->m_index = -1;
this->m_specs = specs;
if (specs.find("r1") != string::npos) // raw source phrase counts
@@ -35,11 +35,11 @@ namespace Moses {
}
bool
- isIntegerValued(int i) const { return true; }
+ isIntegerValued(int i) const { return true; }
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
@@ -49,15 +49,15 @@ namespace Moses {
assert(pp.joint);
assert(pp.raw2);
size_t i = this->m_index;
- if (m_specs.find("r1") != string::npos)
+ if (m_specs.find("r1") != string::npos)
(*dest)[i++] = log(pp.raw1);
- if (m_specs.find("s1") != string::npos)
+ if (m_specs.find("s1") != string::npos)
(*dest)[i++] = log(pp.sample1);
- if (m_specs.find("g1") != string::npos)
+ if (m_specs.find("g1") != string::npos)
(*dest)[i++] = log(pp.good1);
- if (m_specs.find("j") != string::npos)
+ if (m_specs.find("j") != string::npos)
(*dest)[i++] = log(pp.joint);
- if (m_specs.find("r2") != string::npos)
+ if (m_specs.find("r2") != string::npos)
(*dest)[++i] = log(pp.raw2);
}
};
diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
index f7b4686d7..9366777ef 100644
--- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h
+++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
@@ -1,5 +1,5 @@
//-*- c++ -*-
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
@@ -15,12 +15,12 @@ namespace Moses {
{
float conf;
string denom;
-
+
public:
- PScorePbwd(float const c, string d)
- {
+ PScorePbwd(float const c, string d)
+ {
this->m_index = -1;
- conf = c;
+ conf = c;
denom = d;
size_t checksum = d.size();
BOOST_FOREACH(char const& x, denom)
@@ -36,13 +36,13 @@ namespace Moses {
<< d << "' for Pbwd phrase scorer at " << HERE);
}
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
- // we use the denominator specification to scale the raw counts on the
+ // we use the denominator specification to scale the raw counts on the
// target side; the clean way would be to counter-sample
size_t i = this->m_index;
BOOST_FOREACH(char const& x, denom)
diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
index ed48a93d2..c5de210a1 100644
--- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h
+++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
@@ -1,5 +1,5 @@
// -*- c++ -*-
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
@@ -18,10 +18,10 @@ namespace Moses {
public:
- PScorePfwd(float const c, string d)
- {
+ PScorePfwd(float const c, string d)
+ {
this->m_index = -1;
- conf = c;
+ conf = c;
denom = d;
size_t checksum = d.size();
BOOST_FOREACH(char const& x, denom)
@@ -32,17 +32,17 @@ namespace Moses {
this->m_feature_names.push_back(s);
}
this->m_num_feats = this->m_feature_names.size();
- UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
- "Unknown parameter in specification '"
+ UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
+ "Unknown parameter in specification '"
<< d << "' for Pfwd phrase scorer at " << HERE);
}
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
+
+ void
+ operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
- if (pp.joint > pp.good1)
+ if (pp.joint > pp.good1)
{
pp.joint = pp.good1;
// cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
@@ -53,18 +53,18 @@ namespace Moses {
{
switch (c)
{
- case 'g':
- (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf));
+ case 'g':
+ (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf));
break;
- case 's':
- (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf));
+ case 's':
+ (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf));
break;
case 'r':
- (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf));
+ (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf));
}
}
}
};
}
}
-
+
diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
index e0a6eb48b..e0ce40117 100644
--- a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
+++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
@@ -1,5 +1,5 @@
// -*- c++ -*-
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
@@ -20,15 +20,15 @@ namespace Moses {
this->m_num_feats = 1;
this->m_feature_names.push_back(string("phrasecount"));
}
-
- void
+
+ void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = 1;
- }
+ }
};
}
}
diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h
index c33b98fe7..ee7b08bda 100644
--- a/moses/TranslationModel/UG/sapt_pscore_provenance.h
+++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h
@@ -2,7 +2,7 @@
// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
// with the asymptotic function j/(j+x) where x > 0 is a function
// parameter that determines the steepness of the rewards curve
-// written by Ulrich Germann
+// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
@@ -10,7 +10,7 @@
using namespace std;
namespace Moses {
namespace bitext {
-
+
// asymptotic provenance feature n/(n+x)
template<typename Token>
class
@@ -18,18 +18,18 @@ namespace Moses {
{
public:
- PScoreProvenance(string const& spec)
+ PScoreProvenance(string const& spec)
{
this->m_tag = "prov";
this->init(spec);
}
-
+
bool
- isLogVal(int i) const { return false; }
+ isLogVal(int i) const { return false; }
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
@@ -39,7 +39,7 @@ namespace Moses {
}
bool
- allowPooling() const
+ allowPooling() const
{ return false; }
};
diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h
index 58f204c88..34979243c 100644
--- a/moses/TranslationModel/UG/sapt_pscore_rareness.h
+++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h
@@ -2,7 +2,7 @@
// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
// with the asymptotic function x/(j+x) where x > 0 is a function
// parameter that determines the steepness of the rewards curve
-// written by Ulrich Germann
+// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
@@ -10,25 +10,25 @@
using namespace std;
namespace Moses {
namespace bitext {
-
+
// rareness penalty: x/(n+x)
template<typename Token>
class
PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily<Token>
{
public:
- PScoreRareness(string const spec)
+ PScoreRareness(string const spec)
{
this->m_tag = "rare";
this->init(spec);
}
bool
- isLogVal(int i) const { return false; }
+ isLogVal(int i) const { return false; }
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
index dafc1e129..8dceb1ad0 100644
--- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h
+++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
@@ -1,6 +1,6 @@
// -*- c++ -*-
// Phrase scorer that counts the number of unaligend words in the phrase
-// written by Ulrich Germann
+// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
@@ -14,7 +14,7 @@ namespace Moses {
{
typedef boost::dynamic_bitset<typename ::uint64_t> bitvector;
public:
- PScoreUnaligned(string const spec)
+ PScoreUnaligned(string const spec)
{
this->m_index = -1;
int f = this->m_num_feats = atoi(spec.c_str());
@@ -28,16 +28,16 @@ namespace Moses {
this->m_feature_names[1] = "unal-t";
}
}
-
+
bool
- isLogVal(int i) const { return false; }
-
+ isLogVal(int i) const { return false; }
+
bool
- isIntegerValued(int i) const { return true; }
+ isIntegerValued(int i) const { return true; }
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
@@ -46,9 +46,9 @@ namespace Moses {
// parse_pid(pp.p2, sid2, off2, len2);
bitvector check1(pp.len1),check2(pp.len2);
for (size_t i = 0; i < pp.aln.size(); )
- {
- check1.set(pp.aln[i++]);
- check2.set(pp.aln.at(i++));
+ {
+ check1.set(pp.aln[i++]);
+ check2.set(pp.aln.at(i++));
}
if (this->m_num_feats == 1)
diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
index 3227bb6ba..a5000be37 100644
--- a/moses/TranslationModel/UG/sapt_pscore_wordcount.h
+++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
@@ -1,5 +1,5 @@
// -*- c++ -*-
-// written by Ulrich Germann
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
@@ -13,7 +13,7 @@ namespace Moses {
class
PScoreWC : public PhraseScorer<Token>
{
- public:
+ public:
PScoreWC(string const dummy)
{
this->m_index = -1;
@@ -21,14 +21,14 @@ namespace Moses {
this->m_feature_names.push_back(string("wordcount"));
}
- void
+ void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
+ PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = pp.len2;
- }
+ }
};
}
}
diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc
index 460d66c1f..00a705936 100644
--- a/moses/TranslationModel/UG/sim-pe.cc
+++ b/moses/TranslationModel/UG/sim-pe.cc
@@ -15,7 +15,7 @@ using namespace boost;
vector<FactorType> fo(1,FactorType(0));
-ostream&
+ostream&
operator<<(ostream& out, Hypothesis const* x)
{
vector<const Hypothesis*> H;
@@ -24,7 +24,7 @@ operator<<(ostream& out, Hypothesis const* x)
for (; H.size(); H.pop_back())
{
Phrase const& p = H.back()->GetCurrTargetPhrase();
- for (size_t pos = 0 ; pos < p.GetSize() ; pos++)
+ for (size_t pos = 0 ; pos < p.GetSize() ; pos++)
out << *p.GetFactor(pos, 0) << (H.size() ? " " : "");
}
return out;
@@ -33,19 +33,19 @@ operator<<(ostream& out, Hypothesis const* x)
vector<FactorType> ifo;
size_t lineNumber;
-string
+string
translate(string const& source)
{
StaticData const& global = StaticData::Instance();
- Sentence sentence;
- istringstream ibuf(source+"\n");
+ Sentence sentence;
+ istringstream ibuf(source+"\n");
sentence.Read(ibuf,ifo);
// Manager manager(lineNumber, sentence, global.GetSearchAlgorithm());
Manager manager(sentence, global.GetSearchAlgorithm());
manager.ProcessSentence();
-
+
ostringstream obuf;
const Hypothesis* h = manager.GetBestHypothesis();
obuf << h;
@@ -58,7 +58,7 @@ int main(int argc, char* argv[])
Parameter params;
if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
exit(1);
-
+
StaticData const& global = StaticData::Instance();
global.SetVerboseLevel(0);
ifo = global.GetInputFactorOrder();
@@ -79,6 +79,6 @@ int main(int argc, char* argv[])
}
exit(0);
}
-
-
+
+
diff --git a/moses/TranslationModel/UG/spe-check-coverage.cc b/moses/TranslationModel/UG/spe-check-coverage.cc
index 4ab2d749a..378dd800f 100644
--- a/moses/TranslationModel/UG/spe-check-coverage.cc
+++ b/moses/TranslationModel/UG/spe-check-coverage.cc
@@ -1,3 +1,5 @@
+#if 0
+// temporarily disabled; needs to be adapted to changes in the API
#include "mmsapt.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
@@ -22,13 +24,13 @@ class SimplePhrase : public Moses::Phrase
vector<FactorType> const m_fo; // factor order
public:
SimplePhrase(): m_fo(1,FactorType(0)) {}
-
- void init(string const& s)
+
+ void init(string const& s)
{
istringstream buf(s); string w;
- while (buf >> w)
+ while (buf >> w)
{
- Word wrd;
+ Word wrd;
this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false);
}
}
@@ -43,7 +45,7 @@ public:
bool operator()(size_t a, size_t b) const
{
// return cmp(*my_tpc[a], *my_tpc[b]);
- return (my_tpc[a]->GetScoreBreakdown().GetWeightedScore() >
+ return (my_tpc[a]->GetScoreBreakdown().GetWeightedScore() >
my_tpc[b]->GetScoreBreakdown().GetWeightedScore());
}
};
@@ -57,7 +59,7 @@ int main(int argc, char* argv[])
argfilter[1] = std::make_pair(string("--spe-trg"),1);
argfilter[2] = std::make_pair(string("--spe-aln"),1);
argfilter[3] = std::make_pair(string("--spe-show"),1);
-
+
char** my_args; int my_acnt;
char** mo_args; int mo_acnt;
filter_arguments(argc, argv, mo_acnt, &mo_args, my_acnt, &my_args, argfilter);
@@ -75,9 +77,9 @@ int main(int argc, char* argv[])
else if (!strcmp(my_args[i],"--spe-show"))
vlevel = my_args[i+1];
}
-
+
Parameter params;
- if (!params.LoadParam(mo_acnt,mo_args) ||
+ if (!params.LoadParam(mo_acnt,mo_args) ||
!StaticData::LoadDataStatic(&params, mo_args[0]))
exit(1);
@@ -93,15 +95,15 @@ int main(int argc, char* argv[])
exit(1);
}
mmsapt->SetTableLimit(0);
-
+
string srcline,trgline,alnline;
cout.precision(2);
vector<string> fname = mmsapt->GetFeatureNames();
while (getline(spe_src,srcline))
{
- UTIL_THROW_IF2(!getline(spe_trg,trgline), HERE
+ UTIL_THROW_IF2(!getline(spe_trg,trgline), HERE
<< ": missing data for online updates.");
- UTIL_THROW_IF2(!getline(spe_aln,alnline), HERE
+ UTIL_THROW_IF2(!getline(spe_aln,alnline), HERE
<< ": missing data for online updates.");
cout << string(80,'-') << "\n" << srcline << "\n" << trgline << "\n" << endl;
@@ -125,29 +127,29 @@ int main(int argc, char* argv[])
if (!mmsapt->PrefixExists(p)) break;
TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p);
if (!trg || !trg->GetSize()) continue;
-
+
bool header_done = false;
bool has_dynamic_match = vlevel == "all" || vlevel == "ALL";
- vector<size_t> order; order.reserve(trg->GetSize());
+ vector<size_t> order; order.reserve(trg->GetSize());
size_t stop = trg->GetSize();
vector<size_t> o2(trg->GetSize());
for (size_t i = 0; i < stop; ++i) o2[i] = i;
sort(o2.begin(),o2.end(),TargetPhraseIndexSorter(*trg));
-
+
for (size_t r = 0; r < stop; ++r) // r for rank
{
if (vlevel != "ALL")
{
Phrase const& phr = static_cast<Phrase const&>(*(*trg)[o2[r]]);
- ostringstream buf; buf << phr;
- string tphrase = buf.str();
+ ostringstream buf; buf << phr;
+ string tphrase = buf.str();
tphrase.erase(tphrase.size()-1);
size_t s = trgline.find(tphrase);
if (s == string::npos) continue;
size_t e = s + tphrase.size();
if ((s && trgline[s-1] != ' ') || (e < trgline.size() && trgline[e] != ' '))
- continue;
+ continue;
}
order.push_back(r);
if (!has_dynamic_match)
@@ -168,7 +170,7 @@ int main(int argc, char* argv[])
ScoreComponentCollection::IndexPair idx = scc.GetIndexes(PT);
FVector const& scores = scc.GetScoresVector();
float wscore = scc.GetWeightedScore();
- if (vlevel == "new" && scores[idx.first + dynprovidx] == 0)
+ if (vlevel == "new" && scores[idx.first + dynprovidx] == 0)
continue;
if (!header_done)
{
@@ -199,7 +201,7 @@ int main(int argc, char* argv[])
}
cout << " " << format(fmt) % (mmsapt->isInteger(j) ? round(f) : f);
}
- cout << " " << format("%10.3e") % exp(wscore)
+ cout << " " << format("%10.3e") % exp(wscore)
<< " " << format("%10.3e") % exp((*trg)[o2[r]]->GetFutureScore()) << endl;
}
mmsapt->Release(trg);
@@ -211,6 +213,6 @@ int main(int argc, char* argv[])
// }
exit(0);
}
-
-
+#endif
+
diff --git a/moses/TranslationModel/UG/spe-check-coverage2.cc b/moses/TranslationModel/UG/spe-check-coverage2.cc
index fa9ce1c85..3b4f559d2 100644
--- a/moses/TranslationModel/UG/spe-check-coverage2.cc
+++ b/moses/TranslationModel/UG/spe-check-coverage2.cc
@@ -20,7 +20,7 @@ typedef Bitext<Token>::iter iter;
mmbitext bg;
-void
+void
show(ostream& out, iter& f)
{
iter b(bg.I2.get(),f.getToken(0),f.size());
@@ -29,11 +29,11 @@ show(ostream& out, iter& f)
else
out << string(12,' ');
out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " ";
- out << f.str(bg.V1.get()) << endl;
+ out << f.str(bg.V1.get()) << endl;
}
-void
+void
dump(ostream& out, iter& f)
{
float cnt = f.size() ? f.approxOccurrenceCount() : 0;
@@ -44,12 +44,12 @@ dump(ostream& out, iter& f)
while (f.over());
f.up();
}
- if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
+ if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
show(out,f);
}
-void
+void
read_data(string fname, vector<string>& dest)
{
ifstream in(fname.c_str());
@@ -71,6 +71,6 @@ int main(int argc, char* argv[])
dump(cout,mfg);
exit(0);
}
-
-
+
+
diff --git a/moses/TranslationModel/UG/spe-check-coverage3.cc b/moses/TranslationModel/UG/spe-check-coverage3.cc
index ea8c85e99..a62daa7b8 100644
--- a/moses/TranslationModel/UG/spe-check-coverage3.cc
+++ b/moses/TranslationModel/UG/spe-check-coverage3.cc
@@ -22,7 +22,7 @@ typedef Bitext<Token>::iter iter;
mmbitext bg;
vector<string> src,trg,aln;
-void
+void
show(ostream& out, iter& f)
{
iter b(bg.I2.get(),f.getToken(0),f.size());
@@ -31,11 +31,11 @@ show(ostream& out, iter& f)
else
out << string(12,' ');
out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " ";
- out << f.str(bg.V1.get()) << endl;
+ out << f.str(bg.V1.get()) << endl;
}
-void
+void
dump(ostream& out, iter& f)
{
float cnt = f.size() ? f.approxOccurrenceCount() : 0;
@@ -46,12 +46,12 @@ dump(ostream& out, iter& f)
while (f.over());
f.up();
}
- if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
+ if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
show(out,f);
}
-void
+void
read_data(string fname, vector<string>& dest)
{
ifstream in(fname.c_str());
@@ -60,14 +60,14 @@ read_data(string fname, vector<string>& dest)
in.close();
}
-void
-show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt,
+void
+show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt,
vector<vector<int> > const& a)
{
for (size_t i = 0; i < snt.size(); ++i)
{
cout << format("%d:%s[") % i % V[snt[i].id()];
- for (size_t k = 0; k < a[i].size(); ++k)
+ for (size_t k = 0; k < a[i].size(); ++k)
cout << (k?",":"") << a[i][k];
cout << "] ";
}
@@ -77,7 +77,7 @@ show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt,
void show_pair(size_t const sid)
{
- vector<Token> s,t;
+ vector<Token> s,t;
fill_token_seq(*bg.V1,src[sid],s);
fill_token_seq(*bg.V2,trg[sid],t);
vector<vector<int> > a1(s.size()),a2(t.size());
@@ -97,11 +97,11 @@ void show_pair(size_t const sid)
int main(int argc, char* argv[])
{
- if (argc < 5)
+ if (argc < 5)
{
- cerr << "usage: " << argv[0]
- << " <bg base name> <L1> <L2> <fg base name>"
- << endl;
+ cerr << "usage: " << argv[0]
+ << " <bg base name> <L1> <L2> <fg base name>"
+ << endl;
exit(1);
}
bg.open(argv[1],argv[2],argv[3]);
@@ -122,10 +122,10 @@ int main(int argc, char* argv[])
bias[sid] = 0;
// cout << src[sid] << endl << trg[sid] << endl;
// show_pair(sid);
- vector<Token> snt;
+ vector<Token> snt;
fill_token_seq(*bg.V1,src[sid],snt);
vector<vector<sptr<vector<PhrasePair<Token> > > > > FG,BG;
- fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true);
+ fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true);
bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true);
set<sptr<vector<PhrasePair<Token> > > > seen;
for (size_t i = 0; i < snt.size(); ++i)
@@ -136,7 +136,7 @@ int main(int argc, char* argv[])
{
if (!m0.extend(snt[i+k].id())) break;
if (k && m0.approxOccurrenceCount() < 2) break;
- if (m1.size() == k && (!m1.extend(snt[i+k].id()) ||
+ if (m1.size() == k && (!m1.extend(snt[i+k].id()) ||
m1.approxOccurrenceCount() < 25))
{
cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " "
@@ -156,8 +156,8 @@ int main(int argc, char* argv[])
sptr<pstats> bgstats;
jstats const* bgjstats = NULL;
Bitext<Token>::iter m2(bg.I2.get(), pp.start2, pp.len2);
- if (m1.approxOccurrenceCount() > 5000 ||
- m2.approxOccurrenceCount() > 5000)
+ if (m1.approxOccurrenceCount() > 5000 ||
+ m2.approxOccurrenceCount() > 5000)
continue;
if (m1.size() == pp.len1 && m2.size() == pp.len2)
{
@@ -173,9 +173,9 @@ int main(int argc, char* argv[])
cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: "
<< toString(*fg->V2, pp.start2, pp.len2) << " "
<< format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2;
- if (bgjstats)
- cout << " " << (format("[%u/%u/%u]")
- % bgstats->good % bgjstats->rcnt()
+ if (bgjstats)
+ cout << " " << (format("[%u/%u/%u]")
+ % bgstats->good % bgjstats->rcnt()
% (bgjstats->cnt2() * bgstats->good
/ bgstats->raw_cnt));
else if (m1.size() == pp.len1)
@@ -189,6 +189,6 @@ int main(int argc, char* argv[])
}
exit(0);
}
-
-
+
+
diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc
index daafec545..60eabb9e7 100644
--- a/moses/TranslationModel/UG/try-align.cc
+++ b/moses/TranslationModel/UG/try-align.cc
@@ -17,7 +17,7 @@ float lbop_level = .05;
namespace stats
{
using namespace Moses::bitext;
- float
+ float
pmi(size_t j,size_t m1, size_t m2, size_t N)
{
#if smooth
@@ -29,8 +29,8 @@ namespace stats
return log(j) + log(N) - log(m1) - log(m2);
#endif
}
-
- float
+
+ float
npmi(size_t j,size_t m1, size_t m2, size_t N)
{
#if smooth
@@ -39,11 +39,11 @@ namespace stats
float p12 = lbop(N,j,lbop_level);
return (log(p12) - log(p1) - log(p2)) / -log(p12);
#else
- return pmi(j,m1,m2,N) / (log(N) - log(j));
+ return pmi(j,m1,m2,N) / (log(N) - log(j));
#endif
}
- float
+ float
mi(size_t j,size_t m1, size_t m2, size_t N)
{
float ret = 0;
@@ -79,7 +79,7 @@ struct PhrasePair
float mi; // mutual information
float score;
- void
+ void
set(vector<ttrack::Position> const& o1,
vector<ttrack::Position> const& o2,
size_t const N)
@@ -90,7 +90,7 @@ struct PhrasePair
{
if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; }
if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; }
-
+
if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; }
else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; }
else { ++i2; ++m2; }
@@ -114,19 +114,19 @@ struct PhrasePair
this->score = npmi; // npmi; // hmean; // /sqrt(z);
}
} stats;
-
+
PhrasePair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0)
: s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { }
- bool
+ bool
operator<(PhrasePair const& other) const
- {
- return (this->stats.score == other.stats.score
+ {
+ return (this->stats.score == other.stats.score
? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2)
- : (this->stats.score > other.stats.score));
+ : (this->stats.score > other.stats.score));
}
-
+
size_t len1() const { return e1 - s1; }
size_t len2() const { return e2 - s2; }
bool includes(PhrasePair const& o) const
@@ -142,8 +142,8 @@ PhrasePair::stats_t::cache_t ppcache;
struct SortByPositionInCorpus
{
- bool
- operator()(ttrack::Position const& a,
+ bool
+ operator()(ttrack::Position const& a,
ttrack::Position const& b) const
{
return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset;
@@ -151,8 +151,8 @@ struct SortByPositionInCorpus
};
-void
-getoccs(tsa_t::tree_iterator const& m,
+void
+getoccs(tsa_t::tree_iterator const& m,
vector<ttrack::Position>& occs)
{
occs.clear();
@@ -166,9 +166,9 @@ getoccs(tsa_t::tree_iterator const& m,
sort(occs.begin(),occs.end(),SortByPositionInCorpus());
}
-void
-lookup_phrases(vector<id_type> const& snt,
- TokenIndex& V, ttrack_t const& T,
+void
+lookup_phrases(vector<id_type> const& snt,
+ TokenIndex& V, ttrack_t const& T,
tsa_t const& I, SinglePhrase::cache_t& cache,
vector<vector<sptr<SinglePhrase> > >& dest)
{
@@ -182,7 +182,7 @@ lookup_phrases(vector<id_type> const& snt,
if (m.approxOccurrenceCount() < 3) break;
// if (k - i > 0) break;
sptr<SinglePhrase>& o = cache[m.getPid()];
- if (!o)
+ if (!o)
{
o.reset(new SinglePhrase());
o->pid = m.getPid();
@@ -193,7 +193,7 @@ lookup_phrases(vector<id_type> const& snt,
}
}
-struct
+struct
RowIndexSorter
{
vector<vector<float> > const& M;
@@ -202,14 +202,14 @@ RowIndexSorter
: M(m), my_col(c) { }
template<typename T>
- bool
- operator()(T const& a, T const& b) const
- {
+ bool
+ operator()(T const& a, T const& b) const
+ {
return M.at(a).at(my_col) > M.at(b).at(my_col);
}
};
-struct
+struct
ColIndexSorter
{
vector<vector<float> > const& M;
@@ -218,9 +218,9 @@ ColIndexSorter
: M(m), my_row(r) { }
template<typename T>
- bool
- operator()(T const& a, T const& b) const
- {
+ bool
+ operator()(T const& a, T const& b) const
+ {
return M.at(my_row).at(a) > M[my_row].at(b);
}
@@ -234,7 +234,7 @@ int main(int argc, char* argv[])
T1.reset(new ttrack_t());
T2.reset(new ttrack_t());
-
+
V1.open(base + L1 + ".tdx");
T1->open(base + L1 + ".mct");
I1.open(base + L1 + ".sfa", T1);
@@ -259,7 +259,7 @@ int main(int argc, char* argv[])
vector<PhrasePair> pp_all,pp_good;
vector<int> a1(snt1.size(),-1);
vector<int> a2(snt2.size(),-1);
-
+
vector<vector<int> > z1(snt1.size(),vector<int>(snt1.size(),-1));
vector<vector<int> > z2(snt2.size(),vector<int>(snt2.size(),-1));
vector<vector<vector<PhrasePair> > >ppm1(M1.size()),ppm2(M2.size());
@@ -282,9 +282,9 @@ int main(int argc, char* argv[])
for (size_t k2 = 0; k2 < M2[i2].size(); ++k2)
{
pp.e2 = i2 + k2 + 1;
- sptr<PhrasePair::stats_t> & s
+ sptr<PhrasePair::stats_t> & s
= ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)];
- if (!s)
+ if (!s)
{
s.reset(new PhrasePair::stats_t());
s->set(M1[i1][k1]->occs,M2[i2][k2]->occs,T1->size());
@@ -294,8 +294,8 @@ int main(int argc, char* argv[])
// ppm1[i1][k1].push_back(pp);
// ppm2[i2][k2].push_back(pp);
size_t J = pp.stats.j * 100;
- if (pp.stats.score > 0
- && J >= pp.stats.m1
+ if (pp.stats.score > 0
+ && J >= pp.stats.m1
&& J > pp.stats.m2)
{ pp_all.push_back(pp); }
}
@@ -310,7 +310,7 @@ int main(int argc, char* argv[])
for (size_t r = pp.s1; r < pp.e1; ++r)
for (size_t c = pp.s2; c < pp.e2; ++c)
{
- // M[r][c] += log(1-pp.stats.npmi);
+ // M[r][c] += log(1-pp.stats.npmi);
M[r][c] += log(1-pp.stats.mi);
}
}
@@ -342,11 +342,11 @@ int main(int argc, char* argv[])
}
cout << endl;
}
-#endif
+#endif
#if 0
for (size_t k = 1; k < pp_all.size(); ++k)
for (size_t i = k; i--;)
- if (pp_all[i].s1 >= pp_all[k].s1 &&
+ if (pp_all[i].s1 >= pp_all[k].s1 &&
pp_all[i].e1 <= pp_all[k].e1 &&
pp_all[i].s2 >= pp_all[k].s2 &&
pp_all[i].e2 <= pp_all[k].e2)
@@ -360,35 +360,35 @@ int main(int argc, char* argv[])
{
PhrasePair const& x = pp_all[p];
// if (x.stats.npmi < .7) break;
- // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0)
+ // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0)
// continue;
- for (size_t i = x.s1; i < x.e1; ++i)
+ for (size_t i = x.s1; i < x.e1; ++i)
{
- if (assoc1[i] < 0)
+ if (assoc1[i] < 0)
assoc1[i] = p;
else
{
// PhrasePair& y = pp_all[assoc1[i]];
- // if (y.includes(x))
+ // if (y.includes(x))
// assoc1[i] = p;
}
}
- for (size_t i = x.s2; i < x.e2; ++i)
+ for (size_t i = x.s2; i < x.e2; ++i)
{
- if (assoc2[i] < 0)
+ if (assoc2[i] < 0)
assoc2[i] = p;
else
{
// PhrasePair& y = pp_all[assoc2[i]];
- // if (y.includes(x))
+ // if (y.includes(x))
// assoc2[i] = p;
}
}
z1[x.s1][x.e1-1] = p;
z2[x.s2][x.e2-1] = p;
continue;
- cout << (boost::format("%.4f %.8f %.4f")
- % x.stats.score
+ cout << (boost::format("%.4f %.8f %.4f")
+ % x.stats.score
% x.stats.mi
% x.stats.npmi);
for (size_t z = x.s1; z < x.e1; ++z)
@@ -396,8 +396,8 @@ int main(int argc, char* argv[])
cout << " :::";
for (size_t z = x.s2; z < x.e2; ++z)
cout << " " << V2[snt2[z]];
- cout << " ["
- << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2
+ cout << " ["
+ << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2
<< "]" << endl;
}
vector<bool> done(pp_all.size(),false);
@@ -415,8 +415,8 @@ int main(int argc, char* argv[])
cout << " ::: ";
for (size_t j = p.s2; j < p.e2; ++j)
cout << j << ":" << V2[snt2[j]] << " ";
- cout << "["
- << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
+ cout << "["
+ << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
<< "] "<< p.stats.score << endl;
// break;
}
@@ -433,20 +433,20 @@ int main(int argc, char* argv[])
cout << " ::: ";
for (size_t j = p.s2; j < p.e2; ++j)
cout << j << ":" << V2[snt2[j]] << " ";
- cout << "["
- << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
+ cout << "["
+ << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
<< "] "<< p.stats.score << endl;
}
-#endif
+#endif
// sort(pp_all.begin(),pp_all.end());
// BOOST_FOREACH(PhrasePair const& pp, pp_all)
// {
- // while (ppm1[pp.s1].size() < pp.e1 - pp.s1)
+ // while (ppm1[pp.s1].size() < pp.e1 - pp.s1)
// ppm1[pp.s1].push_back(vector<PhrasePair>());
// vector<PhrasePair>& v1 = ppm1[pp.s1][pp.e1-pp.s1-1];
// if (v1.size() && v1[0].stats.score > pp.stats.score)
// continue;
- // while (ppm2[pp.s2].size() < pp.e2 - pp.s2)
+ // while (ppm2[pp.s2].size() < pp.e2 - pp.s2)
// ppm2[pp.s2].push_back(vector<PhrasePair>());
// vector<PhrasePair>& v2 = ppm2[pp.s2][pp.e2-pp.s2-1];
// if (v2.size() && v2[0].stats.score > pp.stats.score)
@@ -455,12 +455,12 @@ int main(int argc, char* argv[])
// v2.push_back(pp);
// }
-
+
// BOOST_FOREACH(vector<vector<PhrasePair> >& vv, ppm1)
- // {
- // BOOST_FOREACH(vector<PhrasePair>& v, vv)
- // {
- // sort(v.begin(),v.end());
+ // {
+ // BOOST_FOREACH(vector<PhrasePair>& v, vv)
+ // {
+ // sort(v.begin(),v.end());
// if (v.size() > 1 && v[0].stats.score == v[1].stats.score)
// v.clear();
// }
@@ -468,19 +468,19 @@ int main(int argc, char* argv[])
// for (size_t i2 = 0; i2 < ppm2.size(); ++i2)
// {
// for (size_t k2 = 0; k2 < ppm2[i2].size(); ++k2)
- // {
+ // {
// vector<PhrasePair>& v2 = ppm2[i2][k2];
// sort(v2.begin(),v2.end());
- // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score)
+ // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score)
// {
// v2.clear();
// continue;
// }
// ushort i1 = v2[0].s1;
// ushort k1 = v2[0].e1 - i1 -1;
-
- // if (ppm1[i1][k1].size() == 0 ||
- // ppm1[i1][k1][0].s2 != i2 ||
+
+ // if (ppm1[i1][k1].size() == 0 ||
+ // ppm1[i1][k1][0].s2 != i2 ||
// ppm1[i1][k1][0].e2 != i2 + k2 + 1)
// { v2.clear(); }
// else pp_good.push_back(ppm2[i2][k2][0]);
@@ -508,7 +508,7 @@ int main(int argc, char* argv[])
// // cout << V2[snt2[z]] << " ";
// // cout << pp.m1 << "/" << pp.j << "/" << pp.m2 << endl;
// // }
-
+
}
}
diff --git a/moses/TranslationModel/UG/try-align2.cc b/moses/TranslationModel/UG/try-align2.cc
index 57cf25035..a18ce8d92 100644
--- a/moses/TranslationModel/UG/try-align2.cc
+++ b/moses/TranslationModel/UG/try-align2.cc
@@ -29,7 +29,7 @@ float lbop_level = .05;
namespace stats
{
using namespace Moses::bitext;
- float
+ float
pmi(size_t j,size_t m1, size_t m2, size_t N)
{
#if smooth
@@ -41,8 +41,8 @@ namespace stats
return log(j) + log(N) - log(m1) - log(m2);
#endif
}
-
- float
+
+ float
npmi(size_t j,size_t m1, size_t m2, size_t N)
{
#if smooth
@@ -52,11 +52,11 @@ namespace stats
float p12 = lbop(N,j,lbop_level);
return (log(p12) - log(p1) - log(p2)) / -log(p12);
#else
- return pmi(j,m1,m2,N) / (log(N) - log(j));
+ return pmi(j,m1,m2,N) / (log(N) - log(j));
#endif
}
- float
+ float
mi(size_t j,size_t m1, size_t m2, size_t N)
{
float ret = 0;
@@ -92,7 +92,7 @@ struct PhrasePair2
float mi; // mutual information
float score;
- void
+ void
set(vector<ttrack::Position> const& o1,
vector<ttrack::Position> const& o2,
size_t const N)
@@ -103,7 +103,7 @@ struct PhrasePair2
{
if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; }
if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; }
-
+
if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; }
else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; }
else { ++i2; ++m2; }
@@ -127,19 +127,19 @@ struct PhrasePair2
this->score = npmi; // npmi; // hmean; // /sqrt(z);
}
} stats;
-
+
PhrasePair2(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0)
: s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { }
- bool
+ bool
operator<(PhrasePair2 const& other) const
- {
- return (this->stats.score == other.stats.score
+ {
+ return (this->stats.score == other.stats.score
? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2)
- : (this->stats.score > other.stats.score));
+ : (this->stats.score > other.stats.score));
}
-
+
size_t len1() const { return e1 - s1; }
size_t len2() const { return e2 - s2; }
bool includes(PhrasePair2 const& o) const
@@ -155,8 +155,8 @@ PhrasePair2::stats_t::cache_t ppcache;
struct SortByPositionInCorpus
{
- bool
- operator()(ttrack::Position const& a,
+ bool
+ operator()(ttrack::Position const& a,
ttrack::Position const& b) const
{
return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset;
@@ -164,8 +164,8 @@ struct SortByPositionInCorpus
};
-void
-getoccs(tsa_t::tree_iterator const& m,
+void
+getoccs(tsa_t::tree_iterator const& m,
vector<ttrack::Position>& occs)
{
occs.clear();
@@ -179,9 +179,9 @@ getoccs(tsa_t::tree_iterator const& m,
sort(occs.begin(),occs.end(),SortByPositionInCorpus());
}
-void
-lookup_phrases(vector<id_type> const& snt,
- TokenIndex& V, ttrack_t const& T,
+void
+lookup_phrases(vector<id_type> const& snt,
+ TokenIndex& V, ttrack_t const& T,
tsa_t const& I, SinglePhrase::cache_t& cache,
vector<vector<sptr<SinglePhrase> > >& dest)
{
@@ -195,7 +195,7 @@ lookup_phrases(vector<id_type> const& snt,
if (m.approxOccurrenceCount() < 3) break;
// if (k - i > 0) break;
sptr<SinglePhrase>& o = cache[m.getPid()];
- if (!o)
+ if (!o)
{
o.reset(new SinglePhrase());
o->pid = m.getPid();
@@ -207,7 +207,7 @@ lookup_phrases(vector<id_type> const& snt,
}
-struct
+struct
RowIndexSorter
{
vector<vector<float> > const& M;
@@ -216,14 +216,14 @@ RowIndexSorter
: M(m), my_col(c) { }
template<typename T>
- bool
- operator()(T const& a, T const& b) const
- {
+ bool
+ operator()(T const& a, T const& b) const
+ {
return M.at(a).at(my_col) > M.at(b).at(my_col);
}
};
-struct
+struct
ColIndexSorter
{
vector<vector<float> > const& M;
@@ -232,9 +232,9 @@ ColIndexSorter
: M(m), my_row(r) { }
template<typename T>
- bool
- operator()(T const& a, T const& b) const
- {
+ bool
+ operator()(T const& a, T const& b) const
+ {
return M.at(my_row).at(a) > M[my_row].at(b);
}
@@ -249,7 +249,7 @@ public:
{
#if 0
cout << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " "
- << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " "
+ << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " "
<< pp.joint << " " << __FILE__ << ":" << __LINE__ << endl;
#endif
pp.good2 = ceil(pp.raw2 * float(pp.good1)/pp.raw1);
@@ -266,7 +266,7 @@ class Alnhyp
};
-size_t
+size_t
lcs(string const a, string const b)
{
using namespace stringdist;
@@ -279,10 +279,10 @@ lcs(string const a, string const b)
{
StringDiff::Segment const& s = diff[i];
if (s.match != StringDiff::same && s.match != StringDiff::cap)
- {
+ {
if (len > ret) ret = len;
- len = 0;
- continue;
+ len = 0;
+ continue;
}
len += s.end_a - s.start_a;
}
@@ -290,9 +290,9 @@ lcs(string const a, string const b)
return ret;
}
-size_t
-mapstring(string const& utf8,
- UnicodeString& U,
+size_t
+mapstring(string const& utf8,
+ UnicodeString& U,
vector<int>& c2w,
vector<int>* wlen=NULL)
{
@@ -338,10 +338,10 @@ align_letters(UnicodeString const& A, vector<int> const& a2p,
// }
}
-void
+void
map_back(vector<vector<int> > const& W,
vector<vector<int> > & X,
- vector<uchar> const & aln)
+ vector<uchar> const & aln)
{
for (size_t i = 0; i < aln.size(); i += 2)
{
@@ -354,7 +354,7 @@ map_back(vector<vector<int> > const& W,
}
-void trymatch3(vector<PhrasePair<Token> > const& tcands,
+void trymatch3(vector<PhrasePair<Token> > const& tcands,
UnicodeString const& T, size_t const tlen,
vector<int> const& t2p,
TokenIndex const& V2, vector<vector<int> >&X)
@@ -374,8 +374,8 @@ void trymatch3(vector<PhrasePair<Token> > const& tcands,
cout << slen << " " << tlen << endl;
cout << "W: " << W.size() << " rows; " << W[0].size() << " cols" << endl;
cout << "X: " << X.size() << " rows; " << X[0].size() << " cols" << endl;
- cout << "aln: ";
- for (size_t a = 0; a < pp.aln.size(); a +=2)
+ cout << "aln: ";
+ for (size_t a = 0; a < pp.aln.size(); a +=2)
cout << int(pp.aln[a]) << "-" << int(pp.aln[a+1]) << " ";
cout << endl;
#endif
@@ -383,7 +383,7 @@ void trymatch3(vector<PhrasePair<Token> > const& tcands,
}
}
-void minmatch_filter(vector<vector<int> > & X,
+void minmatch_filter(vector<vector<int> > & X,
vector<int> const& len1,
vector<int> const& len2)
{
@@ -437,20 +437,20 @@ trymatch2(TokenIndex& V1, // source language vocab
TokenIndex& V2, // target language vocab
string const& source, // source phrase
string const& target, // observed target candidate
- vector<PhrasePair<Token> > const* const tcands,
+ vector<PhrasePair<Token> > const* const tcands,
vector<vector<int> >& X) // destination alignment matrix
// tcands: translations for source
{
- UnicodeString S,T;
+ UnicodeString S,T;
vector<int> t2p, s2p; // maps from character position in string to word pos.
vector<int> wlen_t, wlen_s; // individual word lengths
size_t slen = mapstring(source, S, s2p, &wlen_s);
size_t tlen = mapstring(target, T, t2p, &wlen_t);
-
+
X.assign(slen,vector<int>(tlen,0));
- if (slen == 1 && tlen ==1 && S == T)
+ if (slen == 1 && tlen ==1 && S == T)
X[0][0] = S.length();
- else
+ else
{
align_letters(S,s2p,T,t2p,X);
if (tcands) trymatch3(*tcands, T, tlen, t2p, V2, X);
@@ -475,7 +475,7 @@ trymatch2(TokenIndex& V1, // source language vocab
// float
-// trymatch(string const a, string const b,
+// trymatch(string const a, string const b,
// vector<PhrasePair<Token> > const* atrans,
// vector<PhrasePair<Token> > const* btrans)
// {
@@ -501,11 +501,11 @@ trymatch2(TokenIndex& V1, // source language vocab
// // float bar = float(lcs(foo,b))/min(foo.size(),b.size());
// float bar = float(lcs(foo,b));
-// if (bar > .5)
+// if (bar > .5)
// {
// // score = max(pp.score * bar,score);
// score = max(bar,score);
-// // cout << "[" << bar << "] " << foo << " ::: " << b
+// // cout << "[" << bar << "] " << foo << " ::: " << b
// // << " (" << a << ") " << pp.score << endl;
// }
// }
@@ -525,10 +525,10 @@ trymatch2(TokenIndex& V1, // source language vocab
// string foo = toString(*BT.V1,pp.start2,pp.len2);
// // float bar = float(lcs(a,foo))/min(a.size(),foo.size());
// float bar = float(lcs(a,foo));
-// if (bar > .5)
+// if (bar > .5)
// {
// score = max(bar,score);
-// // cout << "[" << bar<< "] " << a << " ::: " << foo
+// // cout << "[" << bar<< "] " << a << " ::: " << foo
// // << " (" << b << ") " << pp.score << endl;
// }
// }
@@ -547,8 +547,8 @@ struct ahyp
struct AlnPoint
{
enum status { no = 0, yes = 1, maybe = -1, undef = -7 };
- float score;
- status state;
+ float score;
+ status state;
AlnPoint() : score(0), state(undef) {}
};
@@ -562,14 +562,14 @@ class AlnMatrix
vector<bitvector> A1,A2; // final alignment matrix
vector<bitvector> S1,S2; // shadow alignment matrix
public:
- vector<bitvector*> m1,m2; // margins
+ vector<bitvector*> m1,m2; // margins
AlnMatrix(size_t const rows, size_t const cols);
- bitvector const&
+ bitvector const&
operator[](size_t const r) const
{ return A1.at(r); }
bool
- incorporate(span_t const& rspan, span_t const& cspan,
+ incorporate(span_t const& rspan, span_t const& cspan,
vector<uchar> const& aln, bool const flip);
size_t size() const { return A1.size(); }
@@ -588,9 +588,9 @@ AlnMatrix(size_t const rows, size_t const cols)
bool
AlnMatrix::
-incorporate(span_t const& rspan,
- span_t const& cspan,
- vector<uchar> const& aln,
+incorporate(span_t const& rspan,
+ span_t const& cspan,
+ vector<uchar> const& aln,
bool const flip)
{
for (size_t r = rspan.first; r < rspan.second; ++r)
@@ -622,7 +622,7 @@ incorporate(span_t const& rspan,
if (m1[r] && (*m1[r]) != S1[r]) return false;
for (size_t c = cspan.first; c < cspan.second; ++c)
if (m2[c] && (*m2[c]) != S2[c]) return false;
-
+
// all good, add new points
for (size_t r = rspan.first; r < rspan.second; ++r)
if (!m1[r]) { A1[r] = S1[r]; m1[r] = &A1[r]; }
@@ -632,9 +632,9 @@ incorporate(span_t const& rspan,
return true;
}
-struct alink
-{
- size_t r,c,m;
+struct alink
+{
+ size_t r,c,m;
bool operator<(alink const& o) const { return m < o.m; }
bool operator>(alink const& o) const { return m > o.m; }
};
@@ -659,9 +659,9 @@ int main(int argc, char* argv[])
vector<vector<uint64_t> > pm1,pm2;
BT.lookup(snt1,*BT.I1,pt1,&pm1,&scorer);
BT.lookup(snt2,*BT.I2,pt2,&pm2,&scorer);
-
+
// build map from phrases to positions
- typedef boost::unordered_map<uint64_t, vector<span_t> >
+ typedef boost::unordered_map<uint64_t, vector<span_t> >
p2s_map_t;
typedef p2s_map_t::iterator p2s_iter;
p2s_map_t p2s1,p2s2;
@@ -684,7 +684,7 @@ int main(int argc, char* argv[])
BOOST_FOREACH(PhrasePair<Token> const& pp, *pt1[i][k])
{
if (pp.score < 0) break;
- if (p2s2.find(pp.p2) != p2s2.end())
+ if (p2s2.find(pp.p2) != p2s2.end())
pp_all.push_back(pp);
}
}
@@ -704,10 +704,10 @@ int main(int argc, char* argv[])
{
PhrasePair<Token> const& pp = pp_all[p];
#if 0
- cout << (boost::format("%30s ::: %-30s ")
+ cout << (boost::format("%30s ::: %-30s ")
% BT.toString(pp.p1,0).c_str()
% BT.toString(pp.p2,1).c_str());
- cout << (boost::format("%.4f [%d/%d/%d]")
+ cout << (boost::format("%.4f [%d/%d/%d]")
% pp.score % pp.good1 % pp.joint % pp.good2);
for (size_t a = 0; a < pp.aln.size(); a += 2)
cout << " " << int(pp.aln[a]) << "-" << int(pp.aln[a+1]);
@@ -720,7 +720,7 @@ int main(int argc, char* argv[])
for (size_t i = v1[0].first; i < v1[0].second; ++i)
if (a1[i] < 0) a1[i] = p;
if (v2.size() == 1)
- for (size_t i = v2[0].first; i < v2[0].second; ++i)
+ for (size_t i = v2[0].first; i < v2[0].second; ++i)
if (a2[i] < 0) a2[i] = p;
if (v1.size() == 1 && v2.size() == 1)
@@ -740,11 +740,11 @@ int main(int argc, char* argv[])
vector<PhrasePair<Token> > const* atrans, *btrans;
ahyp h;
vector<ahyp> hyps;
- vector<vector<int> > L(snt1.size(),vector<int>(snt2.size(),0));
+ vector<vector<int> > L(snt1.size(),vector<int>(snt2.size(),0));
// L: matches by letter overlap
for (h.s1 = 0; h.s1 < a1.size(); ++h.s1)
- {
+ {
if (a1[h.s1] >= 0) continue;
ostringstream buf1;
for (h.e1 = h.s1; h.e1 < a1.size() && a1[h.e1] < 0; ++h.e1)
@@ -762,23 +762,23 @@ int main(int argc, char* argv[])
if (a2[h.s2] >= 0) continue;
for (h.e2 = h.s2; h.e2 < a2.size() && a2[h.e2] < 0; ++h.e2)
{
- if (h.e2 > h.s2)
+ if (h.e2 > h.s2)
{
if (pt2[h.s2].size() + h.s2 <= h.e2) break;
buf2 << " ";
}
buf2 << (*BT.V2)[snt2[h.e2].id()];
- btrans = (pt2[h.s2].size()
- ? pt2[h.s2].at(h.e2-h.s2).get()
+ btrans = (pt2[h.s2].size()
+ ? pt2[h.s2].at(h.e2-h.s2).get()
: NULL);
vector<vector<int> > aln;
- trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(),
+ trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(),
atrans,aln);
for (size_t i = 0; i < aln.size(); ++i)
for (size_t k = 0; k < aln[i].size(); ++k)
L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[i][k]);
- trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(),
+ trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(),
btrans,aln);
for (size_t i = 0; i < aln[0].size(); ++i)
for (size_t k = 0; k < aln.size(); ++k)
@@ -795,7 +795,7 @@ int main(int argc, char* argv[])
alink x;
for (x.r = 0; x.r < L.size(); ++x.r)
{
-
+
for (x.c = 0; x.c < L[x.r].size(); ++x.c)
{
x.m = L[x.r][x.c];
@@ -807,22 +807,22 @@ int main(int argc, char* argv[])
BOOST_FOREACH(alink& x, links)
{
- if (L[x.r][x.c])
+ if (L[x.r][x.c])
{
cout << (*BT.V1)[snt1[x.r].id()] << " ::: "
<< (*BT.V2)[snt2[x.c].id()] << " ::: "
<< L[x.r][x.c] << endl;
}
- }
+ }
// sort(hyps.begin(),hyps.end(),greater<ahyp>());
// BOOST_FOREACH(ahyp const& h, hyps)
// {
// if (h.score < .5) break;
- // for (size_t i = h.s1; i <= h.e1; ++i)
+ // for (size_t i = h.s1; i <= h.e1; ++i)
// cout << i << ":" << (*BT.V1)[snt1[i].id()] << " ";
// cout << " ::: ";
- // for (size_t i = h.s2; i <= h.e2; ++i)
+ // for (size_t i = h.s2; i <= h.e2; ++i)
// cout << i << ":" << (*BT.V2)[snt2[i].id()] << " ";
// cout << h.score << endl;
// }
@@ -854,15 +854,15 @@ int main(int argc, char* argv[])
// #if 0
// if (match)
// {
-// if (first)
+// if (first)
// {
// cout << BT.toString(pm1[i][k],0) << endl;
// first = false;
// }
-// cout << boost::format("%.4f") % pt.score << " "
+// cout << boost::format("%.4f") % pt.score << " "
// << setw(5) << d1 << " " << (match ? "* " : " ")
// << toString(*BT.V2, pt.start2, pt.len2) << " ["
-// << pt.good1 << "/" << pt.joint << "/"
+// << pt.good1 << "/" << pt.joint << "/"
// << pt.good2 << "]";
// for (size_t a = 0; a < pt.aln.size(); a += 2)
// cout << " " << int(pt.aln[a]) << "-" << int(pt.aln[a+1]);
@@ -879,7 +879,7 @@ int main(int argc, char* argv[])
// pp_all.push_back(pt);
// // pp_all.back().m1 -= d1;
// }
-
+
// }
// if (!first) cout << endl;
// }
diff --git a/moses/TranslationModel/UG/util/ibm1-align.cc b/moses/TranslationModel/UG/util/ibm1-align.cc
index 08ac1f89b..3c43743d0 100644
--- a/moses/TranslationModel/UG/util/ibm1-align.cc
+++ b/moses/TranslationModel/UG/util/ibm1-align.cc
@@ -1,7 +1,7 @@
// -*- c++ -*-
// Parallel text alignment via IBM1 / raw counts of word alignments
// aiming at high precision (to seed Yawat alignments)
-// This program is tailored for use with Yawat.
+// This program is tailored for use with Yawat.
// Written by Ulrich Germann.
#include <string>
@@ -29,20 +29,20 @@ public:
table_t COOC;
TokenIndex V1,V2;
- void
+ void
align(string const& s1, string const& s2, vector<int>& aln) const;
- void
- align(vector<id_type> const& x1,
- vector<id_type> const& x2,
+ void
+ align(vector<id_type> const& x1,
+ vector<id_type> const& x2,
vector<int>& aln) const;
-
- void
- fill_amatrix(vector<id_type> const& x1,
- vector<id_type> const& x2,
+
+ void
+ fill_amatrix(vector<id_type> const& x1,
+ vector<id_type> const& x2,
vector<vector<int> >& aln) const;
- void
+ void
open(string const base, string const L1, string const L2);
};
@@ -75,10 +75,10 @@ u(StringPiece str, size_t start, size_t stop)
return ret;
}
-void
+void
IBM1::
-fill_amatrix(vector<id_type> const& x1,
- vector<id_type> const& x2,
+fill_amatrix(vector<id_type> const& x1,
+ vector<id_type> const& x2,
vector<vector<int> >& aln) const
{
aln.assign(x1.size(),vector<int>(x2.size()));
@@ -108,8 +108,8 @@ fill_amatrix(vector<id_type> const& x1,
void
IBM1::
-align(vector<id_type> const& x1,
- vector<id_type> const& x2,
+align(vector<id_type> const& x1,
+ vector<id_type> const& x2,
vector<int>& aln) const
{
vector<vector<int> > M;
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
// cout << line1 << endl;
// cout << line2 << endl;
// for (size_t i = 0; i < a.size(); i += 2)
- // cout << ibm1.V1[s1[a[i]]] << " - "
+ // cout << ibm1.V1[s1[a[i]]] << " - "
// << ibm1.V2[s2[a[i+1]]] << endl;
}
// cout << endl;
diff --git a/moses/TranslationModel/UG/util/tokenindex.dump.cc b/moses/TranslationModel/UG/util/tokenindex.dump.cc
index 8ab68579d..0e885630f 100644
--- a/moses/TranslationModel/UG/util/tokenindex.dump.cc
+++ b/moses/TranslationModel/UG/util/tokenindex.dump.cc
@@ -13,7 +13,7 @@
using namespace std;
using namespace ugdiss;
-int
+int
main(int argc,char* argv[])
{
if (argc > 1 && !strcmp(argv[1], "-h")) {
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index 26dce03d0..8cc2e3f57 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -716,7 +716,7 @@ void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
}
unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
string best_path = "";
- int best_match = -1;
+ //int best_match = -1;
// go through all corpus sentences
for(unsigned int s=0; s<source.size(); s++) {
@@ -739,7 +739,7 @@ void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
if (cost < best_cost) {
best_cost = cost;
best_path = path;
- best_match = s;
+ //best_match = s;
}
}
//cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
index ab1439a29..b70eb98ca 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
@@ -1,71 +1,71 @@
-// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
-#include "Vocabulary.h"
-#ifdef WITH_THREADS
-#include <boost/thread/locks.hpp>
-#endif
-
-using namespace std;
-
-namespace tmmt
-{
-
-// as in beamdecoder/tables.cpp
-vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
-{
- vector< WORD_ID > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- } else if (isSpace && !betweenWords) {
- token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
- return token;
-}
-
-WORD_ID Vocabulary::StoreIfNew( const WORD& word )
-{
-
- {
- // read=lock scope
-#ifdef WITH_THREADS
- boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
-#endif
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
-
- if( i != lookup.end() )
- return i->second;
- }
-
-#ifdef WITH_THREADS
- boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
-#endif
- WORD_ID id = vocab.size();
- vocab.push_back( word );
- lookup[ word ] = id;
- return id;
-}
-
-WORD_ID Vocabulary::GetWordID( const WORD &word )
-{
-#ifdef WITH_THREADS
- boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
-#endif
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
- if( i == lookup.end() )
- return 0;
- WORD_ID w= (WORD_ID) i->second;
- return w;
-}
-
-}
-
+// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
+#include "Vocabulary.h"
+#ifdef WITH_THREADS
+#include <boost/thread/locks.hpp>
+#endif
+
+using namespace std;
+
+namespace tmmt
+{
+
+// as in beamdecoder/tables.cpp
+vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
+{
+ vector< WORD_ID > token;
+ bool betweenWords = true;
+ int start=0;
+ int i=0;
+ for(; input[i] != '\0'; i++) {
+ bool isSpace = (input[i] == ' ' || input[i] == '\t');
+
+ if (!isSpace && betweenWords) {
+ start = i;
+ betweenWords = false;
+ } else if (isSpace && !betweenWords) {
+ token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
+ betweenWords = true;
+ }
+ }
+ if (!betweenWords)
+ token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
+ return token;
+}
+
+WORD_ID Vocabulary::StoreIfNew( const WORD& word )
+{
+
+ {
+ // read=lock scope
+#ifdef WITH_THREADS
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
+
+ if( i != lookup.end() )
+ return i->second;
+ }
+
+#ifdef WITH_THREADS
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
+ WORD_ID id = vocab.size();
+ vocab.push_back( word );
+ lookup[ word ] = id;
+ return id;
+}
+
+WORD_ID Vocabulary::GetWordID( const WORD &word )
+{
+#ifdef WITH_THREADS
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
+ if( i == lookup.end() )
+ return 0;
+ WORD_ID w= (WORD_ID) i->second;
+ return w;
+}
+
+}
+
diff --git a/moses/TranslationOption.cpp b/moses/TranslationOption.cpp
index 610df63f7..52bf49fb2 100644
--- a/moses/TranslationOption.cpp
+++ b/moses/TranslationOption.cpp
@@ -66,7 +66,7 @@ bool TranslationOption::Overlap(const Hypothesis &hypothesis) const
return bitmap.Overlap(GetSourceWordsRange());
}
-void
+void
TranslationOption::
CacheLexReorderingScores(const LexicalReordering &producer, const Scores &score)
{
@@ -112,7 +112,7 @@ ostream& operator<<(ostream& out, const TranslationOption& possibleTranslation)
/** returns cached scores */
const Scores*
TranslationOption::
-GetLexReorderingScores(LexicalReordering const* scoreProducer) const
+GetLexReorderingScores(LexicalReordering const* scoreProducer) const
{
return m_targetPhrase.GetExtraScores(scoreProducer);
// _ScoreCacheMap::const_iterator it;
diff --git a/moses/TranslationOption.h b/moses/TranslationOption.h
index e99ff5d13..4bf545f7d 100644
--- a/moses/TranslationOption.h
+++ b/moses/TranslationOption.h
@@ -71,8 +71,8 @@ protected:
float m_futureScore; /*< estimate of total cost when using this translation option, includes language model probabilities */
// typedef std::map<const LexicalReordering*, Scores> _ScoreCacheMap;
- // _ScoreCacheMap m_lexReorderingScores;
- // m_lexReorderingScores was moved to TargetPhrase.h so that phrase tables
+ // _ScoreCacheMap m_lexReorderingScores;
+ // m_lexReorderingScores was moved to TargetPhrase.h so that phrase tables
// can add information (such as lexical reordering scores) to target phrases
// during lookup.
@@ -156,14 +156,14 @@ public:
}
/** returns cached scores */
- // inline
+ // inline
const Scores*
GetLexReorderingScores(const LexicalReordering *scoreProducer) const;
// {
// return m_targetPhrase.GetExtraScores(scoreProducer);
// }
- void CacheLexReorderingScores(const LexicalReordering &scoreProducer,
+ void CacheLexReorderingScores(const LexicalReordering &scoreProducer,
const Scores &score);
TO_STRING();
diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp
index d599a7ddb..1e3ef9045 100644
--- a/moses/TranslationOptionCollection.cpp
+++ b/moses/TranslationOptionCollection.cpp
@@ -56,10 +56,12 @@ namespace Moses
* corresponding data structure is initialized here This fn should be
* called by inherited classe */
TranslationOptionCollection::
-TranslationOptionCollection(InputType const& src,
+TranslationOptionCollection(ttasksptr const& ttask,
+ InputType const& src,
size_t maxNoTransOptPerCoverage,
float translationOptionThreshold)
- : m_source(src)
+ : m_ttask(ttask)
+ , m_source(src)
, m_futureScore(src.GetSize())
, m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
, m_translationOptionThreshold(translationOptionThreshold)
@@ -624,12 +626,12 @@ CacheLexReordering()
{
size_t const stop = m_source.GetSize();
typedef StatefulFeatureFunction sfFF;
- BOOST_FOREACH(sfFF const* ff, sfFF::GetStatefulFeatureFunctions())
+ BOOST_FOREACH(sfFF const* ff, sfFF::GetStatefulFeatureFunctions())
{
if (typeid(*ff) != typeid(LexicalReordering)) continue;
LexicalReordering const& lr = static_cast<const LexicalReordering&>(*ff);
- for (size_t s = 0 ; s < stop ; s++)
- BOOST_FOREACH(TranslationOptionList& tol, m_collection[s])
+ for (size_t s = 0 ; s < stop ; s++)
+ BOOST_FOREACH(TranslationOptionList& tol, m_collection[s])
lr.SetCache(tol);
}
}
@@ -667,7 +669,7 @@ GetTargetPhraseCollectionBatch()
const Tstep* tstep = dynamic_cast<const Tstep *>(*i);
if (tstep) {
const PhraseDictionary &pdict = *tstep->GetPhraseDictionaryFeature();
- pdict.GetTargetPhraseCollectionBatch(m_inputPathQueue);
+ pdict.GetTargetPhraseCollectionBatch(m_ttask.lock(), m_inputPathQueue);
}
}
}
diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h
index 6ec18bf4f..4c0a6bdc6 100644
--- a/moses/TranslationOptionCollection.h
+++ b/moses/TranslationOptionCollection.h
@@ -65,6 +65,7 @@ class TranslationOptionCollection
friend std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& coll);
TranslationOptionCollection(const TranslationOptionCollection&); /*< no copy constructor */
protected:
+ ttaskwptr m_ttask; // that is and must be a weak pointer!
std::vector< std::vector< TranslationOptionList > > m_collection; /*< contains translation options */
InputType const &m_source; /*< reference to the input */
SquareMatrix m_futureScore; /*< matrix of future costs for contiguous parts (span) of the input */
@@ -73,7 +74,8 @@ protected:
std::vector<const Phrase*> m_unksrcs;
InputPathList m_inputPathQueue;
- TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage,
+ TranslationOptionCollection(ttasksptr const& ttask,
+ InputType const& src, size_t maxNoTransOptPerCoverage,
float translationOptionThreshold);
void CalcFutureScore();
@@ -175,7 +177,10 @@ public:
return m_inputPathQueue;
}
-
+ ttasksptr GetTranslationTask() const
+ {
+ return m_ttask.lock();
+ }
TO_STRING();
};
diff --git a/moses/TranslationOptionCollectionConfusionNet.cpp b/moses/TranslationOptionCollectionConfusionNet.cpp
index d7579f0e4..387821102 100644
--- a/moses/TranslationOptionCollectionConfusionNet.cpp
+++ b/moses/TranslationOptionCollectionConfusionNet.cpp
@@ -20,10 +20,11 @@ namespace Moses
/** constructor; just initialize the base class */
TranslationOptionCollectionConfusionNet::
-TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
+TranslationOptionCollectionConfusionNet(ttasksptr const& ttask,
+ const ConfusionNet &input,
size_t maxNoTransOptPerCoverage,
float translationOptionThreshold)
- : TranslationOptionCollection(input, maxNoTransOptPerCoverage,
+ : TranslationOptionCollection(ttask,input, maxNoTransOptPerCoverage,
translationOptionThreshold)
{
// Prefix checkers are phrase dictionaries that provide a prefix check
@@ -105,7 +106,7 @@ TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
bool OK = prefixCheckers.size() == 0;
for (size_t k = 0; !OK && k < prefixCheckers.size(); ++k)
- OK = prefixCheckers[k]->PrefixExists(subphrase);
+ OK = prefixCheckers[k]->PrefixExists(m_ttask.lock(), subphrase);
if (!OK) continue;
const ScorePair &scores = col[i].second;
@@ -161,7 +162,9 @@ void TranslationOptionCollectionConfusionNet::ProcessUnknownWord(size_t sourcePo
}
-void TranslationOptionCollectionConfusionNet::CreateTranslationOptions()
+void
+TranslationOptionCollectionConfusionNet
+::CreateTranslationOptions()
{
if (!StaticData::Instance().GetUseLegacyPT()) {
GetTargetPhraseCollectionBatch();
@@ -198,8 +201,9 @@ CreateTranslationOptionsForRange(const DecodeGraph &decodeGraph,
bool
TranslationOptionCollectionConfusionNet::
-CreateTranslationOptionsForRangeNew(const DecodeGraph &decodeGraph, size_t startPos,
- size_t endPos, bool adhereTableLimit, size_t graphInd)
+CreateTranslationOptionsForRangeNew
+( const DecodeGraph &decodeGraph, size_t startPos, size_t endPos,
+ bool adhereTableLimit, size_t graphInd)
{
InputPathList &inputPathList = GetInputPathList(startPos, endPos);
if (inputPathList.size() == 0) return false; // no input path matches!
diff --git a/moses/TranslationOptionCollectionConfusionNet.h b/moses/TranslationOptionCollectionConfusionNet.h
index 03f21767e..c13e94620 100644
--- a/moses/TranslationOptionCollectionConfusionNet.h
+++ b/moses/TranslationOptionCollectionConfusionNet.h
@@ -35,7 +35,7 @@ protected:
, size_t graphInd);
public:
- TranslationOptionCollectionConfusionNet(const ConfusionNet &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
+ TranslationOptionCollectionConfusionNet(ttasksptr const& ttask, const ConfusionNet &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
void ProcessUnknownWord(size_t sourcePos);
void CreateTranslationOptions();
diff --git a/moses/TranslationOptionCollectionLattice.cpp b/moses/TranslationOptionCollectionLattice.cpp
index 9af62dc81..e2d9e996a 100644
--- a/moses/TranslationOptionCollectionLattice.cpp
+++ b/moses/TranslationOptionCollectionLattice.cpp
@@ -18,10 +18,12 @@ namespace Moses
{
/** constructor; just initialize the base class */
-TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
- const WordLattice &input
- , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
- : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
+TranslationOptionCollectionLattice
+::TranslationOptionCollectionLattice
+( ttasksptr const& ttask, const WordLattice &input,
+ size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
+ : TranslationOptionCollection(ttask, input, maxNoTransOptPerCoverage,
+ translationOptionThreshold)
{
UTIL_THROW_IF2(StaticData::Instance().GetUseLegacyPT(),
"Not for models using the legqacy binary phrase table");
diff --git a/moses/TranslationOptionCollectionLattice.h b/moses/TranslationOptionCollectionLattice.h
index ba7e70bcc..0b03157ea 100644
--- a/moses/TranslationOptionCollectionLattice.h
+++ b/moses/TranslationOptionCollectionLattice.h
@@ -22,7 +22,7 @@ protected:
void ProcessUnknownWord(size_t sourcePos); // do not implement
public:
- TranslationOptionCollectionLattice(const WordLattice &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
+ TranslationOptionCollectionLattice(ttasksptr const& ttask, const WordLattice &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
void CreateTranslationOptions();
diff --git a/moses/TranslationOptionCollectionText.cpp b/moses/TranslationOptionCollectionText.cpp
index 51cefdfe8..354d09f47 100644
--- a/moses/TranslationOptionCollectionText.cpp
+++ b/moses/TranslationOptionCollectionText.cpp
@@ -32,8 +32,8 @@ using namespace std;
namespace Moses
{
/** constructor; just initialize the base class */
-TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
- : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
+TranslationOptionCollectionText::TranslationOptionCollectionText(ttasksptr const& ttask, Sentence const &input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
+ : TranslationOptionCollection(ttask,input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
size_t size = input.GetSize();
m_inputPathMatrix.resize(size);
diff --git a/moses/TranslationOptionCollectionText.h b/moses/TranslationOptionCollectionText.h
index cdb35963e..deffd952e 100644
--- a/moses/TranslationOptionCollectionText.h
+++ b/moses/TranslationOptionCollectionText.h
@@ -48,7 +48,7 @@ protected:
public:
void ProcessUnknownWord(size_t sourcePos);
- TranslationOptionCollectionText(Sentence const& input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
+ TranslationOptionCollectionText(ttasksptr const& ttask, Sentence const& input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const;
bool ViolatesXmlOptionsConstraint(size_t startPosition, size_t endPosition, TranslationOption *transOpt) const;
diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp
index 2b0d47baa..764ca998a 100644
--- a/moses/TranslationTask.cpp
+++ b/moses/TranslationTask.cpp
@@ -23,18 +23,46 @@ using namespace std;
namespace Moses
{
-boost::shared_ptr<TranslationTask>
+std::string const&
TranslationTask
-::create(boost::shared_ptr<InputType> const& source,
+::GetContextString() const
+{
+ return m_context_string;
+}
+
+void
+TranslationTask
+::SetContextString(std::string const& context)
+{
+ m_context_string = context;
+}
+
+
+
+boost::shared_ptr<TranslationTask>
+TranslationTask
+::create(boost::shared_ptr<InputType> const& source)
+{
+ boost::shared_ptr<IOWrapper> nix;
+ boost::shared_ptr<TranslationTask> ret(new TranslationTask(source, nix));
+ ret->m_self = ret;
+ ret->m_scope.reset(new ContextScope);
+ return ret;
+}
+
+boost::shared_ptr<TranslationTask>
+TranslationTask
+::create(boost::shared_ptr<InputType> const& source,
boost::shared_ptr<IOWrapper> const& ioWrapper)
{
boost::shared_ptr<TranslationTask> ret(new TranslationTask(source, ioWrapper));
ret->m_self = ret;
+ ret->m_scope.reset(new ContextScope);
return ret;
}
TranslationTask
-::TranslationTask(boost::shared_ptr<InputType> const& source,
+::TranslationTask(boost::shared_ptr<InputType> const& source,
boost::shared_ptr<IOWrapper> const& ioWrapper)
: m_source(source) , m_ioWrapper(ioWrapper)
{ }
@@ -42,6 +70,59 @@ TranslationTask
TranslationTask::~TranslationTask()
{ }
+
+boost::shared_ptr<BaseManager>
+TranslationTask
+::SetupManager(SearchAlgorithm algo)
+{
+ boost::shared_ptr<BaseManager> manager;
+ StaticData const& staticData = StaticData::Instance();
+ if (algo == DefaultSearchAlgorithm) algo = staticData.GetSearchAlgorithm();
+
+ if (!staticData.IsSyntax(algo))
+ manager.reset(new Manager(this->self())); // phrase-based
+
+ else if (algo == SyntaxF2S || algo == SyntaxT2S)
+ { // STSG-based tree-to-string / forest-to-string decoding (ask Phil Williams)
+ typedef Syntax::F2S::RuleMatcherCallback Callback;
+ typedef Syntax::F2S::RuleMatcherHyperTree<Callback> RuleMatcher;
+ manager.reset(new Syntax::F2S::Manager<RuleMatcher>(this->self()));
+ }
+
+ else if (algo == SyntaxS2T)
+ { // new-style string-to-tree decoding (ask Phil Williams)
+ S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
+ if (algorithm == RecursiveCYKPlus)
+ {
+ typedef Syntax::S2T::EagerParserCallback Callback;
+ typedef Syntax::S2T::RecursiveCYKPlusParser<Callback> Parser;
+ manager.reset(new Syntax::S2T::Manager<Parser>(this->self()));
+ }
+ else if (algorithm == Scope3)
+ {
+ typedef Syntax::S2T::StandardParserCallback Callback;
+ typedef Syntax::S2T::Scope3Parser<Callback> Parser;
+ manager.reset(new Syntax::S2T::Manager<Parser>(this->self()));
+ }
+ else UTIL_THROW2("ERROR: unhandled S2T parsing algorithm");
+ }
+
+ else if (algo == SyntaxT2S_SCFG)
+ { // SCFG-based tree-to-string decoding (ask Phil Williams)
+ typedef Syntax::F2S::RuleMatcherCallback Callback;
+ typedef Syntax::T2S::RuleMatcherSCFG<Callback> RuleMatcher;
+ manager.reset(new Syntax::T2S::Manager<RuleMatcher>(this->self()));
+ }
+
+ else if (algo == ChartIncremental) // Ken's incremental decoding
+ manager.reset(new Incremental::Manager(this->self()));
+
+ else // original SCFG manager
+ manager.reset(new ChartManager(this->self()));
+
+ return manager;
+}
+
void TranslationTask::Run()
{
UTIL_THROW_IF2(!m_source || !m_ioWrapper,
@@ -50,7 +131,6 @@ void TranslationTask::Run()
// shorthand for "global data"
- const StaticData &staticData = StaticData::Instance();
const size_t translationId = m_source->GetTranslationId();
// report wall time spent on translation
@@ -69,52 +149,22 @@ void TranslationTask::Run()
Timer initTime;
initTime.start();
- // which manager
- boost::scoped_ptr<BaseManager> manager;
-
- if (!staticData.IsSyntax()) {
- // phrase-based
- manager.reset(new Manager(*m_source));
- } else if (staticData.GetSearchAlgorithm() == SyntaxF2S ||
- staticData.GetSearchAlgorithm() == SyntaxT2S) {
- // STSG-based tree-to-string / forest-to-string decoding (ask Phil Williams)
- typedef Syntax::F2S::RuleMatcherCallback Callback;
- typedef Syntax::F2S::RuleMatcherHyperTree<Callback> RuleMatcher;
- manager.reset(new Syntax::F2S::Manager<RuleMatcher>(*m_source));
- } else if (staticData.GetSearchAlgorithm() == SyntaxS2T) {
- // new-style string-to-tree decoding (ask Phil Williams)
- S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
- if (algorithm == RecursiveCYKPlus) {
- typedef Syntax::S2T::EagerParserCallback Callback;
- typedef Syntax::S2T::RecursiveCYKPlusParser<Callback> Parser;
- manager.reset(new Syntax::S2T::Manager<Parser>(*m_source));
- } else if (algorithm == Scope3) {
- typedef Syntax::S2T::StandardParserCallback Callback;
- typedef Syntax::S2T::Scope3Parser<Callback> Parser;
- manager.reset(new Syntax::S2T::Manager<Parser>(*m_source));
- } else {
- UTIL_THROW2("ERROR: unhandled S2T parsing algorithm");
- }
- } else if (staticData.GetSearchAlgorithm() == SyntaxT2S_SCFG) {
- // SCFG-based tree-to-string decoding (ask Phil Williams)
- typedef Syntax::F2S::RuleMatcherCallback Callback;
- typedef Syntax::T2S::RuleMatcherSCFG<Callback> RuleMatcher;
- manager.reset(new Syntax::T2S::Manager<RuleMatcher>(*m_source));
- } else if (staticData.GetSearchAlgorithm() == ChartIncremental) {
- // Ken's incremental decoding
- manager.reset(new Incremental::Manager(*m_source));
- } else {
- // original SCFG manager
- manager.reset(new ChartManager(*m_source));
- }
+ boost::shared_ptr<BaseManager> manager = SetupManager();
- VERBOSE(1, "Line " << translationId << ": Initialize search took "
+ VERBOSE(1, "Line " << translationId << ": Initialize search took "
<< initTime << " seconds total" << endl);
manager->Decode();
- OutputCollector* ocoll;
+ // new: stop here if m_ioWrapper is NULL. This means that the
+ // owner of the TranslationTask will take care of the output
+ // oh, and by the way, all the output should be handled by the
+ // output wrapper along the lines of *m_iwWrapper << *manager;
+ // Just sayin' ...
+ if (m_ioWrapper == NULL) return;
+
// we are done with search, let's look what we got
+ OutputCollector* ocoll;
Timer additionalReportingTime;
additionalReportingTime.start();
@@ -132,7 +182,7 @@ void TranslationTask::Run()
// Output search graph in hypergraph format for Kenneth Heafield's
// lazy hypergraph decoder; writes to stderr
- manager->OutputSearchGraphHypergraph();
+ manager->OutputSearchGraphHypergraph();
additionalReportingTime.stop();
@@ -158,9 +208,9 @@ void TranslationTask::Run()
// report additional statistics
manager->CalcDecoderStatistics();
- VERBOSE(1, "Line " << translationId << ": Additional reporting took "
+ VERBOSE(1, "Line " << translationId << ": Additional reporting took "
<< additionalReportingTime << " seconds total" << endl);
- VERBOSE(1, "Line " << translationId << ": Translation took "
+ VERBOSE(1, "Line " << translationId << ": Translation took "
<< translationTime << " seconds total" << endl);
IFVERBOSE(2) {
PrintUserTime("Sentence Decoding Time:");
diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h
index acd446abe..df1cf9f48 100644
--- a/moses/TranslationTask.h
+++ b/moses/TranslationTask.h
@@ -8,6 +8,7 @@
#include "moses/IOWrapper.h"
#include "moses/Manager.h"
#include "moses/ChartManager.h"
+#include "moses/ContextScope.h"
#include "moses/Syntax/F2S/Manager.h"
#include "moses/Syntax/S2T/Manager.h"
@@ -17,6 +18,11 @@
#include <boost/weak_ptr.hpp>
#include <boost/make_shared.hpp>
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#include <boost/thread/locks.hpp>
+#endif
+
namespace Moses
{
class InputType;
@@ -30,19 +36,20 @@ class OutputCollector;
**/
class TranslationTask : public Moses::Task
{
- // no copying, no assignment
+ // no copying, no assignment
TranslationTask(TranslationTask const& other) { }
- TranslationTask const&
+ TranslationTask const&
operator=(TranslationTask const& other) { return *this; }
protected:
boost::weak_ptr<TranslationTask> m_self; // weak ptr to myself
-
+ boost::shared_ptr<ContextScope> m_scope; // sores local info
+ // pointer to ContextScope, which stores context-specific information
TranslationTask() { } ;
- TranslationTask(boost::shared_ptr<Moses::InputType> const& source,
+ TranslationTask(boost::shared_ptr<Moses::InputType> const& source,
boost::shared_ptr<Moses::IOWrapper> const& ioWrapper);
- // Yes, the constructor is protected.
+ // Yes, the constructor is protected.
//
// TranslationTasks can only be created through the creator
// functions create(...). The creator functions set m_self to a
@@ -57,30 +64,52 @@ protected:
// task is still live or not, or maintain a shared_ptr to ensure the
// task stays alive till it's done with it.
+ std::string m_context_string;
public:
- virtual
- boost::shared_ptr<TranslationTask>
+ boost::shared_ptr<TranslationTask>
self() { return m_self.lock(); }
virtual
- boost::shared_ptr<TranslationTask const>
+ boost::shared_ptr<TranslationTask const>
self() const { return m_self.lock(); }
// creator functions
- static boost::shared_ptr<TranslationTask> create();
+ static boost::shared_ptr<TranslationTask> create();
- static boost::shared_ptr<TranslationTask>
- create(boost::shared_ptr<Moses::InputType> const& source,
+ static
+ boost::shared_ptr<TranslationTask>
+ create(boost::shared_ptr<Moses::InputType> const& source);
+
+ static
+ boost::shared_ptr<TranslationTask>
+ create(boost::shared_ptr<Moses::InputType> const& source,
boost::shared_ptr<Moses::IOWrapper> const& ioWrapper);
-
+
~TranslationTask();
/** Translate one sentence
* gets called by main function implemented at end of this source file */
virtual void Run();
-private:
- boost::shared_ptr<Moses::InputType> m_source;
+ boost::shared_ptr<Moses::InputType>
+ GetSource() const { return m_source; }
+
+ boost::shared_ptr<BaseManager>
+ SetupManager(SearchAlgorithm algo = DefaultSearchAlgorithm);
+
+
+ boost::shared_ptr<ContextScope> const&
+ GetScope() const
+ {
+ UTIL_THROW_IF2(m_scope == NULL, "No context scope!");
+ return m_scope;
+ }
+
+ std::string const& GetContextString() const;
+ void SetContextString(std::string const& context);
+
+protected:
+ boost::shared_ptr<Moses::InputType> m_source;
boost::shared_ptr<Moses::IOWrapper> m_ioWrapper;
};
diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp
index e76adc2db..36397e006 100644
--- a/moses/TrellisPath.cpp
+++ b/moses/TrellisPath.cpp
@@ -31,7 +31,6 @@ namespace Moses
TrellisPath::TrellisPath(const Hypothesis *hypo)
: m_prevEdgeChanged(NOT_FOUND)
{
- m_scoreBreakdown = hypo->GetScoreBreakdown();
m_totalScore = hypo->GetTotalScore();
// enumerate path using prevHypo
@@ -41,10 +40,9 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
}
}
-void TrellisPath::InitScore()
+void TrellisPath::InitTotalScore()
{
m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore();
- m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();
//calc score
size_t sizePath = m_path.size();
@@ -53,12 +51,8 @@ void TrellisPath::InitScore()
const Hypothesis *winningHypo = hypo->GetWinningHypo();
if (hypo != winningHypo) {
m_totalScore = m_totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
- m_scoreBreakdown.MinusEquals(winningHypo->GetScoreBreakdown());
- m_scoreBreakdown.PlusEquals(hypo->GetScoreBreakdown());
}
}
-
-
}
TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc)
@@ -80,7 +74,7 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
prevHypo = prevHypo->GetPrevHypo();
}
- InitScore();
+ InitTotalScore();
}
TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
@@ -88,9 +82,7 @@ TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
{
m_path.resize(edges.size());
copy(edges.rbegin(),edges.rend(),m_path.begin());
- InitScore();
-
-
+ InitTotalScore();
}
@@ -172,6 +164,32 @@ void TrellisPath::CreateDeviantPaths(TrellisPathList &pathColl) const
}
}
+const boost::shared_ptr<ScoreComponentCollection> TrellisPath::GetScoreBreakdown() const
+{
+ if (!m_scoreBreakdown) {
+ float totalScore = m_path[0]->GetWinningHypo()->GetTotalScore(); // calculated for sanity check only
+
+ m_scoreBreakdown = boost::shared_ptr<ScoreComponentCollection>(new ScoreComponentCollection());
+ m_scoreBreakdown->PlusEquals(ScoreComponentCollection(m_path[0]->GetWinningHypo()->GetScoreBreakdown()));
+
+ //calc score
+ size_t sizePath = m_path.size();
+ for (size_t pos = 0 ; pos < sizePath ; pos++) {
+ const Hypothesis *hypo = m_path[pos];
+ const Hypothesis *winningHypo = hypo->GetWinningHypo();
+ if (hypo != winningHypo) {
+ totalScore = totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
+ m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown());
+ m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown());
+ }
+ }
+
+ assert(totalScore == m_totalScore);
+ }
+
+ return m_scoreBreakdown;
+}
+
Phrase TrellisPath::GetTargetPhrase() const
{
Phrase targetPhrase(ARRAY_SIZE_INCR);
diff --git a/moses/TrellisPath.h b/moses/TrellisPath.h
index def86549b..89efb32e4 100644
--- a/moses/TrellisPath.h
+++ b/moses/TrellisPath.h
@@ -19,14 +19,14 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#ifndef moses_TrellisPath_h
-#define moses_TrellisPath_h
+#pragma once
#include <iostream>
#include <vector>
#include <limits>
#include "Hypothesis.h"
#include "TypeDef.h"
+#include <boost/shared_ptr.hpp>
namespace Moses
{
@@ -50,13 +50,13 @@ protected:
, or NOT_FOUND if this path is the best trans so consist of only hypos
*/
- ScoreComponentCollection m_scoreBreakdown;
float m_totalScore;
+ mutable boost::shared_ptr<ScoreComponentCollection> m_scoreBreakdown;
//Used by Manager::LatticeSample()
explicit TrellisPath(const std::vector<const Hypothesis*> edges);
- void InitScore();
+ void InitTotalScore();
public:
TrellisPath(); // not implemented
@@ -91,9 +91,7 @@ public:
//! create a list of next best paths by wiggling 1 of the node at a time.
void CreateDeviantPaths(TrellisPathList &pathColl) const;
- inline const ScoreComponentCollection &GetScoreBreakdown() const {
- return m_scoreBreakdown;
- }
+ const boost::shared_ptr<ScoreComponentCollection> GetScoreBreakdown() const;
//! get target words range of the hypo within n-best trellis. not necessarily the same as hypo.GetCurrTargetWordsRange()
WordsRange GetTargetWordsRange(const Hypothesis &hypo) const;
@@ -123,4 +121,4 @@ inline std::ostream& operator<<(std::ostream& out, const TrellisPath& path)
}
}
-#endif
+
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index a47a6e2fe..66536909f 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -25,6 +25,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#include <stdint.h>
+#include <boost/shared_ptr.hpp>
+#include <boost/weak_ptr.hpp>
+
//! all the typedefs and enums goes here
@@ -136,9 +139,9 @@ enum DictionaryFind {
// model is phrase-based or syntax-based. If you add a syntax-based search
// algorithm here then you should also update StaticData::IsSyntax().
enum SearchAlgorithm {
- Normal = 0
+ Normal = 0
,CubePruning = 1
- //,CubeGrowing = 2
+ //,CubeGrowing = 2
,CYKPlus = 3
,NormalBatch = 4
,ChartIncremental = 5
@@ -146,6 +149,7 @@ enum SearchAlgorithm {
,SyntaxT2S = 7
,SyntaxT2S_SCFG = 8
,SyntaxF2S = 9
+ ,DefaultSearchAlgorithm = 777 // means: use StaticData.m_searchAlgorithm
};
enum SourceLabelOverlap {
@@ -179,5 +183,8 @@ typedef std::vector<FactorType> FactorList;
typedef std::pair<std::vector<std::string const*>,WordAlignments > StringWordAlignmentCand;
+class TranslationTask;
+typedef boost::shared_ptr<TranslationTask> ttasksptr;
+typedef boost::weak_ptr<TranslationTask> ttaskwptr;
}
diff --git a/moses/Util.h b/moses/Util.h
index 68989721c..5c9b493f2 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -421,7 +421,7 @@ inline float CalcTranslationScore(const std::vector<float> &probVector,
out << *this; \
return out.str(); \
} \
-
+
//! delete and remove every element of a collection object such as set, list etc
template<class COLL>
void RemoveAllInColl(COLL &coll)
diff --git a/moses/WordLattice.cpp b/moses/WordLattice.cpp
index d377c1858..7804c9a58 100644
--- a/moses/WordLattice.cpp
+++ b/moses/WordLattice.cpp
@@ -219,7 +219,8 @@ bool WordLattice::CanIGetFromAToB(size_t start, size_t end) const
}
TranslationOptionCollection*
-WordLattice::CreateTranslationOptionCollection() const
+WordLattice
+::CreateTranslationOptionCollection(ttasksptr const& ttask) const
{
size_t maxNoTransOptPerCoverage = StaticData::Instance().GetMaxNoTransOptPerCoverage();
float translationOptionThreshold = StaticData::Instance().GetTranslationOptionThreshold();
@@ -228,9 +229,9 @@ WordLattice::CreateTranslationOptionCollection() const
//rv = new TranslationOptionCollectionConfusionNet(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
if (StaticData::Instance().GetUseLegacyPT()) {
- rv = new TranslationOptionCollectionConfusionNet(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
+ rv = new TranslationOptionCollectionConfusionNet(ttask, *this, maxNoTransOptPerCoverage, translationOptionThreshold);
} else {
- rv = new TranslationOptionCollectionLattice(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
+ rv = new TranslationOptionCollectionLattice(ttask, *this, maxNoTransOptPerCoverage, translationOptionThreshold);
}
assert(rv);
diff --git a/moses/WordLattice.h b/moses/WordLattice.h
index d8f93a4b4..4dc937858 100644
--- a/moses/WordLattice.h
+++ b/moses/WordLattice.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
#ifndef moses_WordLattice_h
#define moses_WordLattice_h
@@ -9,7 +10,7 @@
namespace Moses
{
-class TranslationTask;
+class TranslationTask;
/** An input to the decoder that represent a word lattice.
* @todo why is this inherited from confusion net?
@@ -53,7 +54,8 @@ public:
return next_nodes[pos];
}
- TranslationOptionCollection *CreateTranslationOptionCollection() const;
+ TranslationOptionCollection*
+ CreateTranslationOptionCollection(ttasksptr const& ttask) const;
};
}
diff --git a/moses/mbr.cpp b/moses/mbr.cpp
index df2313b66..66dac47f7 100644
--- a/moses/mbr.cpp
+++ b/moses/mbr.cpp
@@ -105,13 +105,13 @@ const TrellisPath doMBR(const TrellisPathList& nBestList)
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
const TrellisPath &path = **iter;
float score = StaticData::Instance().GetMBRScale()
- * path.GetScoreBreakdown().GetWeightedScore();
+ * path.GetScoreBreakdown()->GetWeightedScore();
if (maxScore < score) maxScore = score;
}
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
const TrellisPath &path = **iter;
- joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore() - maxScore);
+ joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown()->GetWeightedScore() - maxScore);
marginal += joint_prob;
joint_prob_vec.push_back(joint_prob);
diff --git a/moses/server/Optimizer.cpp b/moses/server/Optimizer.cpp
index d7d5f939c..d28d7f085 100644
--- a/moses/server/Optimizer.cpp
+++ b/moses/server/Optimizer.cpp
@@ -18,47 +18,47 @@ namespace MosesServer
void
Optimizer::
execute(xmlrpc_c::paramList const& paramList,
- xmlrpc_c::value * const retvalP)
+ xmlrpc_c::value * const retvalP)
{
#ifdef WITH_DLIB
const params_t params = paramList.getStruct(0);
params_t::const_iterator si;
- if ((si = params.find("model_name")) == params.end())
+ if ((si = params.find("model_name")) == params.end())
{
string msg = "Missing name of model to be optimized";
msg += " (e.g. PhraseDictionaryMultiModelCounts0)";
throw xmlrpc_c::fault(msg, xmlrpc_c::fault::CODE_PARSE);
}
const string model_name = xmlrpc_c::value_string(si->second);
-
- if ((si = params.find("phrase_pairs")) == params.end())
+
+ if ((si = params.find("phrase_pairs")) == params.end())
{
throw xmlrpc_c::fault("Missing list of phrase pairs",
xmlrpc_c::fault::CODE_PARSE);
}
-
+
vector<pair<string, string> > phrase_pairs;
-
+
xmlrpc_c::value_array pp_array = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> ppValVec(pp_array.vectorValueValue());
- for (size_t i = 0; i < ppValVec.size(); ++i)
+ for (size_t i = 0; i < ppValVec.size(); ++i)
{
- xmlrpc_c::value_array pp_array
+ xmlrpc_c::value_array pp_array
= xmlrpc_c::value_array(ppValVec[i]);
vector<xmlrpc_c::value> pp(pp_array.vectorValueValue());
string L1 = xmlrpc_c::value_string(pp[0]);
string L2 = xmlrpc_c::value_string(pp[1]);
phrase_pairs.push_back(make_pair(L1,L2));
}
-
- // PhraseDictionaryMultiModel* pdmm
+
+ // PhraseDictionaryMultiModel* pdmm
// = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
PhraseDictionaryMultiModel* pdmm = FindPhraseDictionary(model_name);
vector<float> weight_vector = pdmm->MinimizePerplexity(phrase_pairs);
-
+
vector<xmlrpc_c::value> weight_vector_ret;
- for (size_t i=0;i < weight_vector.size();i++)
+ for (size_t i=0;i < weight_vector.size();i++)
weight_vector_ret.push_back(xmlrpc_c::value_double(weight_vector[i]));
*retvalP = xmlrpc_c::value_array(weight_vector_ret);
diff --git a/moses/server/Optimizer.h b/moses/server/Optimizer.h
index 5e2302d09..8911b089f 100644
--- a/moses/server/Optimizer.h
+++ b/moses/server/Optimizer.h
@@ -6,11 +6,11 @@
namespace MosesServer
{
- class
+ class
Optimizer : public xmlrpc_c::method
{
public:
- Optimizer();
+ Optimizer();
void execute(xmlrpc_c::paramList const& paramList,
xmlrpc_c::value * const retvalP);
};
diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp
index 1953a711f..62e3031fa 100644
--- a/moses/server/TranslationRequest.cpp
+++ b/moses/server/TranslationRequest.cpp
@@ -4,9 +4,9 @@
namespace MosesServer
{
using namespace std;
- using Moses::Hypothesis;
- using Moses::StaticData;
- using Moses::WordsRange;
+ using Moses::Hypothesis;
+ using Moses::StaticData;
+ using Moses::WordsRange;
using Moses::ChartHypothesis;
using Moses::Phrase;
using Moses::Manager;
@@ -23,8 +23,8 @@ namespace MosesServer
boost::shared_ptr<TranslationRequest>
TranslationRequest::
- create(xmlrpc_c::paramList const& paramList,
- boost::condition_variable& cond,
+ create(xmlrpc_c::paramList const& paramList,
+ boost::condition_variable& cond,
boost::mutex& mut)
{
boost::shared_ptr<TranslationRequest> ret;
@@ -33,75 +33,75 @@ namespace MosesServer
return ret;
}
- void
+ void
TranslationRequest::
- Run()
+ Run()
{
parse_request(m_paramList.getStruct(0));
-
+
Moses::StaticData const& SD = Moses::StaticData::Instance();
-
+
//Make sure alternative paths are retained, if necessary
- if (m_withGraphInfo || m_nbestSize>0)
+ if (m_withGraphInfo || m_nbestSize>0)
// why on earth is this a global variable? Is this even thread-safe???? UG
(const_cast<Moses::StaticData&>(SD)).SetOutputSearchGraph(true);
-
+
std::stringstream out, graphInfo, transCollOpts;
-
- if (SD.IsSyntax())
+
+ if (SD.IsSyntax())
run_chart_decoder();
- else
+ else
run_phrase_decoder();
-
+
XVERBOSE(1,"Output: " << out.str() << endl);
{
boost::lock_guard<boost::mutex> lock(m_mutex);
m_done = true;
}
m_cond.notify_one();
-
+
}
-
+
/// add phrase alignment information from a Hypothesis
- void
+ void
TranslationRequest::
add_phrase_aln_info(Hypothesis const& h, vector<xmlrpc_c::value>& aInfo) const
{
if (!m_withAlignInfo) return;
WordsRange const& trg = h.GetCurrTargetWordsRange();
WordsRange const& src = h.GetCurrSourceWordsRange();
-
+
std::map<std::string, xmlrpc_c::value> pAlnInfo;
pAlnInfo["tgt-start"] = xmlrpc_c::value_int(trg.GetStartPos());
pAlnInfo["src-start"] = xmlrpc_c::value_int(src.GetStartPos());
pAlnInfo["src-end"] = xmlrpc_c::value_int(src.GetEndPos());
aInfo.push_back(xmlrpc_c::value_struct(pAlnInfo));
}
-
- void
+
+ void
TranslationRequest::
- outputChartHypo(ostream& out, const ChartHypothesis* hypo)
+ outputChartHypo(ostream& out, const ChartHypothesis* hypo)
{
Phrase outPhrase(20);
hypo->GetOutputPhrase(outPhrase);
-
+
// delete 1st & last
assert(outPhrase.GetSize() >= 2);
outPhrase.RemoveWord(0);
outPhrase.RemoveWord(outPhrase.GetSize() - 1);
- for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++)
+ for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++)
out << *outPhrase.GetFactor(pos, 0) << " ";
}
- bool
+ bool
TranslationRequest::
- compareSearchGraphNode(const Moses::SearchGraphNode& a,
- const Moses::SearchGraphNode& b)
+ compareSearchGraphNode(const Moses::SearchGraphNode& a,
+ const Moses::SearchGraphNode& b)
{ return a.hypo->GetId() < b.hypo->GetId(); }
- void
+ void
TranslationRequest::
- insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData)
+ insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData)
{
using xmlrpc_c::value_int;
using xmlrpc_c::value_double;
@@ -119,13 +119,13 @@ namespace MosesServer
const Hypothesis* hypo = n.hypo;
x["hyp"] = value_int(hypo->GetId());
x["stack"] = value_int(hypo->GetWordsBitmap().GetNumWordsCovered());
- if (hypo->GetId() != 0)
+ if (hypo->GetId() != 0)
{
const Hypothesis *prevHypo = hypo->GetPrevHypo();
x["back"] = value_int(prevHypo->GetId());
x["score"] = value_double(hypo->GetScore());
x["transition"] = value_double(hypo->GetScore() - prevHypo->GetScore());
- if (n.recombinationHypo)
+ if (n.recombinationHypo)
x["recombined"] = value_int(n.recombinationHypo->GetId());
x["cover-start"] = value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
x["cover-end"] = value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
@@ -136,26 +136,26 @@ namespace MosesServer
retData["sg"] = xmlrpc_c::value_array(searchGraphXml);
}
- void
+ void
TranslationRequest::
output_phrase(ostream& out, Phrase const& phrase) const
{
- if (!m_reportAllFactors)
+ if (!m_reportAllFactors)
{
- for (size_t i = 0 ; i < phrase.GetSize(); ++i)
+ for (size_t i = 0 ; i < phrase.GetSize(); ++i)
out << *phrase.GetFactor(i, 0) << " ";
}
else out << phrase;
}
-
- void
+
+ void
TranslationRequest::
outputNBest(const Manager& manager, map<string, xmlrpc_c::value>& retData)
{
TrellisPathList nBestList;
vector<xmlrpc_c::value> nBestXml;
manager.CalcNBest(m_nbestSize, nBestList, m_nbestDistinct);
-
+
BOOST_FOREACH(Moses::TrellisPath const* path, nBestList)
{
vector<const Hypothesis *> const& E = path->GetEdges();
@@ -166,30 +166,30 @@ namespace MosesServer
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
- path->GetScoreBreakdown().OutputAllFeatureScores(buf);
+ path->GetScoreBreakdown()->OutputAllFeatureScores(buf);
nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
-
+
// weighted score
nBestXmlItem["totalScore"] = xmlrpc_c::value_double(path->GetTotalScore());
nBestXml.push_back(xmlrpc_c::value_struct(nBestXmlItem));
}
retData["nbest"] = xmlrpc_c::value_array(nBestXml);
}
-
- void
+
+ void
TranslationRequest::
- insertTranslationOptions(Moses::Manager& manager,
- std::map<std::string, xmlrpc_c::value>& retData)
+ insertTranslationOptions(Moses::Manager& manager,
+ std::map<std::string, xmlrpc_c::value>& retData)
{
- const TranslationOptionCollection* toptsColl
+ const TranslationOptionCollection* toptsColl
= manager.getSntTranslationOptions();
vector<xmlrpc_c::value> toptsXml;
size_t const stop = toptsColl->GetSource().GetSize();
TranslationOptionList const* tol;
- for (size_t s = 0 ; s < stop ; ++s)
+ for (size_t s = 0 ; s < stop ; ++s)
{
- for (size_t e = s;
+ for (size_t e = s;
(tol = toptsColl->GetTranslationOptionList(s,e)) != NULL;
++e)
{
@@ -204,11 +204,11 @@ namespace MosesServer
toptXml["start"] = xmlrpc_c::value_int(s);
toptXml["end"] = xmlrpc_c::value_int(e);
vector<xmlrpc_c::value> scoresXml;
- const std::valarray<FValue> &scores
+ const std::valarray<FValue> &scores
= topt->GetScoreBreakdown().getCoreFeatures();
- for (size_t j = 0; j < scores.size(); ++j)
+ for (size_t j = 0; j < scores.size(); ++j)
scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
-
+
toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
}
@@ -217,13 +217,13 @@ namespace MosesServer
retData["topt"] = xmlrpc_c::value_array(toptsXml);
}
- bool
+ bool
check(std::map<std::string, xmlrpc_c::value> const& params, std::string const key)
{
std::map<std::string, xmlrpc_c::value>::const_iterator m;
return (params.find(key) != params.end());
}
-
+
TranslationRequest::
TranslationRequest(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
@@ -236,15 +236,15 @@ namespace MosesServer
{ // parse XMLRPC request
// params_t const params = m_paramList.getStruct(0);
m_paramList.verifyEnd(1); // ??? UG
-
+
// source text must be given, or we don't know what to translate
typedef std::map<std::string, xmlrpc_c::value> params_t;
params_t::const_iterator si = params.find("text");
- if (si == params.end())
+ if (si == params.end())
throw xmlrpc_c::fault("Missing source text", xmlrpc_c::fault::CODE_PARSE);
m_source_string = xmlrpc_c::value_string(si->second);
XVERBOSE(1,"Input: " << m_source_string << endl);
-
+
m_withAlignInfo = check(params, "align");
m_withWordAlignInfo = check(params, "word-align");
m_withGraphInfo = check(params, "sg");
@@ -252,31 +252,31 @@ namespace MosesServer
m_reportAllFactors = check(params, "report-all-factors");
m_nbestDistinct = check(params, "nbest-distinct");
m_withScoreBreakdown = check(params, "add-score-breakdown");
-
+ m_source.reset(new Sentence(0,m_source_string));
si = params.find("lambda");
- if (si != params.end())
+ if (si != params.end())
{
// muMo = multiModel
xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> muMoValVec(muMoArray.vectorValueValue());
vector<float> w(muMoValVec.size());
- for (size_t i = 0; i < muMoValVec.size(); ++i)
+ for (size_t i = 0; i < muMoValVec.size(); ++i)
w[i] = xmlrpc_c::value_double(muMoValVec[i]);
if (w.size() && (si = params.find("model_name")) != params.end())
{
string const model_name = xmlrpc_c::value_string(si->second);
- PhraseDictionaryMultiModel* pdmm
+ PhraseDictionaryMultiModel* pdmm
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
- // Moses::PhraseDictionaryMultiModel* pdmm
+ // Moses::PhraseDictionaryMultiModel* pdmm
// = FindPhraseDictionary(model_name);
pdmm->SetTemporaryMultiModelWeightsVector(w);
}
}
-
+
// // biased sampling for suffix-array-based sampling phrase table?
// if ((si = params.find("bias")) != params.end())
- // {
- // std::vector<xmlrpc_c::value> tmp
+ // {
+ // std::vector<xmlrpc_c::value> tmp
// = xmlrpc_c::value_array(si->second).cvalue();
// for (size_t i = 1; i < tmp.size(); i += 2)
// m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]);
@@ -288,28 +288,28 @@ namespace MosesServer
TranslationRequest::
run_chart_decoder()
{
- Moses::TreeInput tinput;
+ Moses::TreeInput tinput;
istringstream buf(m_source_string + "\n");
tinput.Read(buf, StaticData::Instance().GetInputFactorOrder());
-
- Moses::ChartManager manager(tinput);
+
+ Moses::ChartManager manager(this->self());
manager.Decode();
-
+
const Moses::ChartHypothesis *hypo = manager.GetBestHypothesis();
ostringstream out;
outputChartHypo(out,hypo);
-
+
m_target_string = out.str();
m_retData["text"] = xmlrpc_c::value_string(m_target_string);
-
- if (m_withGraphInfo)
+
+ if (m_withGraphInfo)
{
std::ostringstream sgstream;
manager.OutputSearchGraphMoses(sgstream);
m_retData["sg"] = xmlrpc_c::value_string(sgstream.str());
}
} // end of TranslationRequest::run_chart_decoder()
-
+
void
TranslationRequest::
pack_hypothesis(vector<Hypothesis const* > const& edges, string const& key,
@@ -320,7 +320,7 @@ namespace MosesServer
BOOST_REVERSE_FOREACH(Hypothesis const* e, edges)
output_phrase(target, e->GetCurrTargetPhrase());
dest[key] = xmlrpc_c::value_string(target.str());
-
+
if (m_withAlignInfo)
{ // phrase alignment, if requested
@@ -356,19 +356,19 @@ namespace MosesServer
TranslationRequest::
run_phrase_decoder()
{
- Manager manager(Sentence(0, m_source_string));
+ Manager manager(this->self());
// if (m_bias.size()) manager.SetBias(&m_bias);
manager.Decode();
-
+
pack_hypothesis(manager.GetBestHypothesis(), "text", m_retData);
-
+
if (m_withGraphInfo) insertGraphInfo(manager,m_retData);
if (m_withTopts) insertTranslationOptions(manager,m_retData);
if (m_nbestSize) outputNBest(manager, m_retData);
-
+
(const_cast<StaticData&>(Moses::StaticData::Instance()))
- .SetOutputSearchGraph(false);
+ .SetOutputSearchGraph(false);
// WTF? one more reason not to have this as global variable! --- UG
-
+
}
}
diff --git a/moses/server/TranslationRequest.h b/moses/server/TranslationRequest.h
index 89b3c5097..6c7cd7275 100644
--- a/moses/server/TranslationRequest.h
+++ b/moses/server/TranslationRequest.h
@@ -23,7 +23,7 @@
#include <xmlrpc-c/base.hpp>
namespace MosesServer
{
- class
+ class
TranslationRequest : public virtual Moses::TranslationTask
{
boost::condition_variable& m_cond;
@@ -33,7 +33,7 @@ namespace MosesServer
xmlrpc_c::paramList const& m_paramList;
std::map<std::string, xmlrpc_c::value> m_retData;
std::map<uint32_t,float> m_bias; // for biased sampling
-
+
std::string m_source_string, m_target_string;
bool m_withAlignInfo;
bool m_withWordAlignInfo;
@@ -44,21 +44,21 @@ namespace MosesServer
bool m_withScoreBreakdown;
size_t m_nbestSize;
- void
+ void
parse_request();
void
parse_request(std::map<std::string, xmlrpc_c::value> const& req);
-
+
virtual void
run_chart_decoder();
virtual void
run_phrase_decoder();
-
- void
- pack_hypothesis(std::vector<Moses::Hypothesis const* > const& edges,
- std::string const& key,
+
+ void
+ pack_hypothesis(std::vector<Moses::Hypothesis const* > const& edges,
+ std::string const& key,
std::map<std::string, xmlrpc_c::value> & dest) const;
void
@@ -66,57 +66,57 @@ namespace MosesServer
std::map<std::string, xmlrpc_c::value> & dest) const;
- void
+ void
output_phrase(std::ostream& out, Moses::Phrase const& phrase) const;
- void
- add_phrase_aln_info(Moses::Hypothesis const& h,
+ void
+ add_phrase_aln_info(Moses::Hypothesis const& h,
std::vector<xmlrpc_c::value>& aInfo) const;
- void
+ void
outputChartHypo(std::ostream& out, const Moses::ChartHypothesis* hypo);
- bool
- compareSearchGraphNode(const Moses::SearchGraphNode& a,
+ bool
+ compareSearchGraphNode(const Moses::SearchGraphNode& a,
const Moses::SearchGraphNode& b);
- void
- insertGraphInfo(Moses::Manager& manager,
- std::map<std::string, xmlrpc_c::value>& retData);
- void
- outputNBest(Moses::Manager const& manager,
+ void
+ insertGraphInfo(Moses::Manager& manager,
+ std::map<std::string, xmlrpc_c::value>& retData);
+ void
+ outputNBest(Moses::Manager const& manager,
std::map<std::string, xmlrpc_c::value>& retData);
- void
- insertTranslationOptions(Moses::Manager& manager,
+ void
+ insertTranslationOptions(Moses::Manager& manager,
std::map<std::string, xmlrpc_c::value>& retData);
protected:
- TranslationRequest(xmlrpc_c::paramList const& paramList,
- boost::condition_variable& cond,
+ TranslationRequest(xmlrpc_c::paramList const& paramList,
+ boost::condition_variable& cond,
boost::mutex& mut);
public:
static
boost::shared_ptr<TranslationRequest>
- create(xmlrpc_c::paramList const& paramList,
- boost::condition_variable& cond,
+ create(xmlrpc_c::paramList const& paramList,
+ boost::condition_variable& cond,
boost::mutex& mut);
-
-
- virtual bool
+
+
+ virtual bool
DeleteAfterExecution() { return false; }
-
- bool
+
+ bool
IsDone() const { return m_done; }
-
- std::map<std::string, xmlrpc_c::value> const&
+
+ std::map<std::string, xmlrpc_c::value> const&
GetRetData() { return m_retData; }
-
- void
+
+ void
Run();
-
-
+
+
};
}
diff --git a/moses/server/Translator.cpp b/moses/server/Translator.cpp
index 51f863c4b..d4cff99df 100644
--- a/moses/server/Translator.cpp
+++ b/moses/server/Translator.cpp
@@ -8,8 +8,8 @@ namespace MosesServer
using namespace Moses;
Translator::
- Translator(size_t numThreads)
- : m_threadPool(numThreads)
+ Translator(size_t numThreads)
+ : m_threadPool(numThreads)
{
// signature and help strings are documentation -- the client
// can query this information with a system.methodSignature and
@@ -17,21 +17,21 @@ namespace MosesServer
this->_signature = "S:S";
this->_help = "Does translation";
}
-
- void
+
+ void
Translator::
execute(xmlrpc_c::paramList const& paramList,
- xmlrpc_c::value * const retvalP)
+ xmlrpc_c::value * const retvalP)
{
boost::condition_variable cond;
boost::mutex mut;
- boost::shared_ptr<TranslationRequest> task
+ boost::shared_ptr<TranslationRequest> task
= TranslationRequest::create(paramList,cond,mut);
m_threadPool.Submit(task);
boost::unique_lock<boost::mutex> lock(mut);
- while (!task->IsDone())
+ while (!task->IsDone())
cond.wait(lock);
*retvalP = xmlrpc_c::value_struct(task->GetRetData());
}
-
+
}
diff --git a/moses/server/Translator.h b/moses/server/Translator.h
index 062080545..e3117c290 100644
--- a/moses/server/Translator.h
+++ b/moses/server/Translator.h
@@ -10,17 +10,17 @@
#endif
namespace MosesServer
{
- class
+ class
// MosesServer::
Translator : public xmlrpc_c::method
{
public:
Translator(size_t numThreads = 10);
-
+
void execute(xmlrpc_c::paramList const& paramList,
xmlrpc_c::value * const retvalP);
private:
Moses::ThreadPool m_threadPool;
};
-
+
}
diff --git a/moses/server/Updater.cpp b/moses/server/Updater.cpp
index 95cafd71a..818f374a5 100644
--- a/moses/server/Updater.cpp
+++ b/moses/server/Updater.cpp
@@ -6,7 +6,7 @@ namespace MosesServer
using namespace std;
Updater::
- Updater()
+ Updater()
{
// signature and help strings are documentation -- the client
// can query this information with a system.methodSignature and
@@ -18,7 +18,7 @@ namespace MosesServer
void
Updater::
execute(xmlrpc_c::paramList const& paramList,
- xmlrpc_c::value * const retvalP)
+ xmlrpc_c::value * const retvalP)
{
#if PT_UG
const params_t params = paramList.getStruct(0);
@@ -29,20 +29,20 @@ namespace MosesServer
*retvalP = xmlrpc_c::value_string("Phrase table updated");
#endif
};
-
- void
+
+ void
Updater::
- breakOutParams(const params_t& params)
+ breakOutParams(const params_t& params)
{
params_t::const_iterator si = params.find("source");
if(si == params.end())
- throw xmlrpc_c::fault("Missing source sentence",
+ throw xmlrpc_c::fault("Missing source sentence",
xmlrpc_c::fault::CODE_PARSE);
m_src = xmlrpc_c::value_string(si->second);
XVERBOSE(1,"source = " << m_src << endl);
si = params.find("target");
if(si == params.end())
- throw xmlrpc_c::fault("Missing target sentence",
+ throw xmlrpc_c::fault("Missing target sentence",
xmlrpc_c::fault::CODE_PARSE);
m_trg = xmlrpc_c::value_string(si->second);
XVERBOSE(1,"target = " << m_trg << endl);
@@ -53,5 +53,5 @@ namespace MosesServer
m_bounded = ((si = params.find("bounded")) != params.end());
m_add2ORLM = ((si = params.find("updateORLM")) != params.end());
};
-
+
}
diff --git a/moses/server/Updater.h b/moses/server/Updater.h
index c3c72da50..9bb20b775 100644
--- a/moses/server/Updater.h
+++ b/moses/server/Updater.h
@@ -19,7 +19,7 @@
namespace MosesServer
{
- class
+ class
Updater: public xmlrpc_c::method
{
@@ -31,14 +31,14 @@ namespace MosesServer
public:
Updater();
-
+
void
execute(xmlrpc_c::paramList const& paramList,
xmlrpc_c::value * const retvalP);
- void
+ void
breakOutParams(const params_t& params);
-
+
};
}
diff --git a/moses/thread_safe_container.h b/moses/thread_safe_container.h
new file mode 100644
index 000000000..1983d7234
--- /dev/null
+++ b/moses/thread_safe_container.h
@@ -0,0 +1,125 @@
+// -*- c++ -*-
+#pragma once
+#include "moses/Util.h"
+#ifdef WITH_THREADS
+
+#include <time.h>
+#include <boost/thread.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/weak_ptr.hpp>
+
+#include "moses/TargetPhrase.h"
+#include <boost/thread/shared_mutex.hpp>
+#include <boost/thread/locks.hpp>
+
+#include <map>
+
+namespace Moses
+{
+
+ // todo: replace this with thread lock-free containers, if a stable library can
+ // be found somewhere
+
+ template<typename KEY, typename VAL, class CONTAINER = std::map<KEY,VAL> >
+ class
+ ThreadSafeContainer
+ {
+ protected:
+ mutable boost::shared_mutex m_lock;
+ CONTAINER m_container;
+ typedef typename CONTAINER::iterator iter_t;
+ typedef typename CONTAINER::const_iterator const_iter_t;
+ typedef typename CONTAINER::value_type entry_t;
+ public:
+
+ class locking_iterator
+ {
+ boost::unique_lock<boost::shared_mutex> m_lock;
+ CONTAINER const* m_container;
+ const_iter_t m_iter;
+
+ locking_iterator(locking_iterator const& other); // no copies!
+ public:
+ locking_iterator() : m_container(NULL) { }
+
+ locking_iterator(boost::shared_mutex& lock,
+ CONTAINER const* container,
+ const_iter_t const& iter)
+ : m_lock(lock), m_container(container), m_iter(iter)
+ { }
+
+ entry_t const& operator->()
+ {
+ UTIL_THROW_IF2(m_container == NULL, "This locking iterator is invalid "
+ << "or has not been assigned.");
+ return m_iter.operator->();
+ }
+
+ // locking operators transfer the lock upon assignment and become invalid
+ locking_iterator const&
+ operator=(locking_iterator& other)
+ {
+ m_lock.swap(other.m_lock);
+ m_iter = other.m_iter;
+ other.m_iter = other.m_container.end();
+ }
+
+ bool
+ operator==(const_iter_t const& other)
+ {
+ return m_iter == other;
+ }
+
+ locking_iterator const&
+ operator++() { ++m_iter; return *this; }
+
+ // DO NOT DEFINE THE POST-INCREMENT OPERATOR!
+ // locking_operators are non-copyable,
+ // so we can't simply make a copy before incrementing and return
+ // the copy after incrementing
+ locking_iterator const&
+ operator++(int);
+ };
+
+ const_iter_t const& end() const
+ { return m_container.end(); }
+
+ locking_iterator begin() const
+ {
+ return locking_iterator(m_lock, this, m_container.begin());
+ }
+
+ VAL const& set(KEY const& key, VAL const& val)
+ {
+ boost::unique_lock< boost::shared_mutex > lock(m_lock);
+ entry_t entry(key,val);
+ iter_t foo = m_container.insert(entry).first;
+ foo->second = val;
+ return foo->second;
+ }
+
+ VAL const* get(KEY const& key, VAL const& default_val)
+ {
+ boost::shared_lock< boost::shared_mutex > lock(m_lock);
+ entry_t entry(key, default_val);
+ iter_t foo = m_container.insert(entry).first;
+ return &(foo->second);
+ }
+
+ VAL const* get(KEY const& key) const
+ {
+ boost::shared_lock< boost::shared_mutex > lock(m_lock);
+ const_iter_t m = m_container.find(key);
+ if (m == m_container.end()) return NULL;
+ return &m->second;
+ }
+
+ size_t erase(KEY const& key)
+ {
+ boost::unique_lock< boost::shared_mutex > lock(m_lock);
+ return m_container.erase(key);
+ }
+ };
+}
+#endif
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index cde712ac6..57821fe44 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -242,7 +242,7 @@ void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, f
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
if (keyValue.size() == 2) {
AddProperty(keyValue[0], keyValue[1], count);
- }
+ }
}
}
diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h
index a8c6888d6..50b1c0acc 100644
--- a/phrase-extract/XmlTree.h
+++ b/phrase-extract/XmlTree.h
@@ -1,43 +1,43 @@
-// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2006 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#pragma once
-#include <string>
-#include <vector>
-#include <set>
-#include <map>
-#include "SyntaxTree.h"
-
-namespace MosesTraining
-{
-
-std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
-std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
-std::string TrimXml(const std::string& str);
-bool isXmlTag(const std::string& tag);
-std::vector<std::string> TokenizeXml(const std::string& str);
-bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
-std::string unescape(const std::string &str);
-
-
-} // namespace
-
+// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+#include "SyntaxTree.h"
+
+namespace MosesTraining
+{
+
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
+std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
+std::string TrimXml(const std::string& str);
+bool isXmlTag(const std::string& tag);
+std::vector<std::string> TokenizeXml(const std::string& str);
+bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
+std::string unescape(const std::string &str);
+
+
+} // namespace
+
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 7f17eb1c8..4ff0b5373 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -214,11 +214,11 @@ void loadCountOfCounts( const std::string& fileNameCountOfCounts )
}
-void processFiles( const std::string& fileNameDirect,
- const std::string& fileNameIndirect,
- const std::string& fileNameConsolidated,
- const std::string& fileNameCountOfCounts,
- const std::string& fileNameSourceLabelSet,
+void processFiles( const std::string& fileNameDirect,
+ const std::string& fileNameIndirect,
+ const std::string& fileNameConsolidated,
+ const std::string& fileNameCountOfCounts,
+ const std::string& fileNameSourceLabelSet,
const std::string& fileNamePartsOfSpeechVocabulary )
{
if (goodTuringFlag || kneserNeyFlag)
@@ -260,9 +260,9 @@ void processFiles( const std::string& fileNameDirect,
// indirect: source target probabilities
// consistency checks
- UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0,
+ UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0,
"target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
- UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0,
+ UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0,
"source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'");
// SCORES ...
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index a9a0bffc2..7e9a3ec0a 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -344,7 +344,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::set<std::string> strippedTargetLabelSet;
std::map<std::string, int> strippedTargetTopLabelSet;
- if (options.stripBitParLabels &&
+ if (options.stripBitParLabels &&
(!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet);
}
diff --git a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
index f03a61840..b1d64fc54 100644
--- a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
@@ -19,10 +19,10 @@ ConsistentPhrases::ConsistentPhrases()
ConsistentPhrases::~ConsistentPhrases()
{
- for (int start = 0; start < m_coll.size(); ++start) {
+ for (size_t start = 0; start < m_coll.size(); ++start) {
std::vector<Coll> &allSourceStart = m_coll[start];
- for (int size = 0; size < allSourceStart.size(); ++size) {
+ for (size_t size = 0; size < allSourceStart.size(); ++size) {
Coll &coll = allSourceStart[size];
Moses::RemoveAllInColl(coll);
}
@@ -48,8 +48,8 @@ void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
targetStart, targetEnd,
params);
- pair<Coll::iterator, bool> inserted = coll.insert(cp);
- assert(inserted.second);
+ assert(coll.find(cp) == coll.end());
+ coll.insert(cp);
}
const ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd) const
@@ -69,10 +69,10 @@ ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceE
std::string ConsistentPhrases::Debug() const
{
std::stringstream out;
- for (int start = 0; start < m_coll.size(); ++start) {
+ for (size_t start = 0; start < m_coll.size(); ++start) {
const std::vector<Coll> &allSourceStart = m_coll[start];
- for (int size = 0; size < allSourceStart.size(); ++size) {
+ for (size_t size = 0; size < allSourceStart.size(); ++size) {
const Coll &coll = allSourceStart[size];
Coll::const_iterator iter;
@@ -89,9 +89,9 @@ std::string ConsistentPhrases::Debug() const
void ConsistentPhrases::AddHieroNonTerms(const Parameter &params)
{
// add [X] labels everywhere
- for (int i = 0; i < m_coll.size(); ++i) {
+ for (size_t i = 0; i < m_coll.size(); ++i) {
vector<Coll> &inner = m_coll[i];
- for (int j = 0; j < inner.size(); ++j) {
+ for (size_t j = 0; j < inner.size(); ++j) {
ConsistentPhrases::Coll &coll = inner[j];
ConsistentPhrases::Coll::iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
diff --git a/phrase-extract/extract-mixed-syntax/pugiconfig.hpp b/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
index c2196715c..5a63fd488 100644
--- a/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
+++ b/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
@@ -57,7 +57,7 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
diff --git a/phrase-extract/extract-mixed-syntax/pugixml.cpp b/phrase-extract/extract-mixed-syntax/pugixml.cpp
index fa62a5e96..5076e3cc0 100644
--- a/phrase-extract/extract-mixed-syntax/pugixml.cpp
+++ b/phrase-extract/extract-mixed-syntax/pugixml.cpp
@@ -50,7 +50,7 @@
#endif
#ifdef __INTEL_COMPILER
-# pragma warning(disable: 177) // function was declared but never referenced
+# pragma warning(disable: 177) // function was declared but never referenced
# pragma warning(disable: 279) // controlling expression is constant
# pragma warning(disable: 1478 1786) // function was declared "deprecated"
# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
diff --git a/phrase-extract/extract-mixed-syntax/pugixml.hpp b/phrase-extract/extract-mixed-syntax/pugixml.hpp
index 82348bd19..a22b59d59 100644
--- a/phrase-extract/extract-mixed-syntax/pugixml.hpp
+++ b/phrase-extract/extract-mixed-syntax/pugixml.hpp
@@ -124,13 +124,13 @@ namespace pugi
// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
const unsigned int parse_eol = 0x0020;
-
+
// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
const unsigned int parse_wconv_attribute = 0x0040;
// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
const unsigned int parse_wnorm_attribute = 0x0080;
-
+
// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
const unsigned int parse_declaration = 0x0100;
@@ -168,16 +168,16 @@ namespace pugi
};
// Formatting flags
-
+
// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
const unsigned int format_indent = 0x01;
-
+
// Write encoding-specific BOM to the output stream. This flag is off by default.
const unsigned int format_write_bom = 0x02;
// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
const unsigned int format_raw = 0x04;
-
+
// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
const unsigned int format_no_declaration = 0x08;
@@ -190,7 +190,7 @@ namespace pugi
// The default set of formatting flags.
// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
const unsigned int format_default = format_indent;
-
+
// Forward declarations
struct xml_attribute_struct;
struct xml_node_struct;
@@ -204,7 +204,7 @@ namespace pugi
class xml_node;
class xml_text;
-
+
#ifndef PUGIXML_NO_XPATH
class xpath_node;
class xpath_node_set;
@@ -277,13 +277,13 @@ namespace pugi
private:
xml_attribute_struct* _attr;
-
+
typedef void (*unspecified_bool_type)(xml_attribute***);
public:
// Default constructor. Constructs an empty attribute.
xml_attribute();
-
+
// Constructs attribute from internal pointer
explicit xml_attribute(xml_attribute_struct* attr);
@@ -378,7 +378,7 @@ namespace pugi
// Borland C++ workaround
bool operator!() const;
-
+
// Comparison operators (compares wrapped node pointers)
bool operator==(const xml_node& r) const;
bool operator!=(const xml_node& r) const;
@@ -396,7 +396,7 @@ namespace pugi
// Get node name/value, or "" if node is empty or it has no name/value
const char_t* name() const;
const char_t* value() const;
-
+
// Get attribute list
xml_attribute first_attribute() const;
xml_attribute last_attribute() const;
@@ -408,7 +408,7 @@ namespace pugi
// Get next/previous sibling in the children list of the parent node
xml_node next_sibling() const;
xml_node previous_sibling() const;
-
+
// Get parent node
xml_node parent() const;
@@ -433,7 +433,7 @@ namespace pugi
// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
bool set_name(const char_t* rhs);
bool set_value(const char_t* rhs);
-
+
// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
xml_attribute append_attribute(const char_t* name);
xml_attribute prepend_attribute(const char_t* name);
@@ -476,11 +476,11 @@ namespace pugi
template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
{
if (!_root) return xml_attribute();
-
+
for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
if (pred(attrib))
return attrib;
-
+
return xml_attribute();
}
@@ -488,11 +488,11 @@ namespace pugi
template <typename Predicate> xml_node find_child(Predicate pred) const
{
if (!_root) return xml_node();
-
+
for (xml_node node = first_child(); node; node = node.next_sibling())
if (pred(node))
return node;
-
+
return xml_node();
}
@@ -502,7 +502,7 @@ namespace pugi
if (!_root) return xml_node();
xml_node cur = first_child();
-
+
while (cur._root && cur._root != _root)
{
if (pred(cur)) return cur;
@@ -534,7 +534,7 @@ namespace pugi
// Recursively traverse subtree with xml_tree_walker
bool traverse(xml_tree_walker& walker);
-
+
#ifndef PUGIXML_NO_XPATH
// Select single node by evaluating XPath query. Returns first node from the resulting node set.
xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
@@ -544,7 +544,7 @@ namespace pugi
xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
xpath_node_set select_nodes(const xpath_query& query) const;
#endif
-
+
// Print subtree using a writer object
void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
@@ -781,11 +781,11 @@ namespace pugi
private:
int _depth;
-
+
protected:
// Get current traversal depth
int depth() const;
-
+
public:
xml_tree_walker();
virtual ~xml_tree_walker();
@@ -852,7 +852,7 @@ namespace pugi
char_t* _buffer;
char _memory[192];
-
+
// Non-copyable semantics
xml_document(const xml_document&);
const xml_document& operator=(const xml_document&);
@@ -960,7 +960,7 @@ namespace pugi
// Non-copyable semantics
xpath_variable(const xpath_variable&);
xpath_variable& operator=(const xpath_variable&);
-
+
public:
// Get variable name
const char_t* name() const;
@@ -1035,21 +1035,21 @@ namespace pugi
// Get query expression return type
xpath_value_type return_type() const;
-
+
// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
bool evaluate_boolean(const xpath_node& n) const;
-
+
// Evaluate expression as double value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
double evaluate_number(const xpath_node& n) const;
-
+
#ifndef PUGIXML_NO_STL
// Evaluate expression as string value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
string_t evaluate_string(const xpath_node& n) const;
#endif
-
+
// Evaluate expression as string value in the specified context; performs type conversion if necessary.
// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
@@ -1070,7 +1070,7 @@ namespace pugi
// Borland C++ workaround
bool operator!() const;
};
-
+
#ifndef PUGIXML_NO_EXCEPTIONS
// XPath exception class
class PUGIXML_CLASS xpath_exception: public std::exception
@@ -1089,20 +1089,20 @@ namespace pugi
const xpath_parse_result& result() const;
};
#endif
-
+
// XPath node class (either xml_node or xml_attribute)
class PUGIXML_CLASS xpath_node
{
private:
xml_node _node;
xml_attribute _attribute;
-
+
typedef void (*unspecified_bool_type)(xpath_node***);
public:
// Default constructor; constructs empty XPath node
xpath_node();
-
+
// Construct XPath node from XML node/attribute
xpath_node(const xml_node& node);
xpath_node(const xml_attribute& attribute, const xml_node& parent);
@@ -1110,13 +1110,13 @@ namespace pugi
// Get node/attribute, if any
xml_node node() const;
xml_attribute attribute() const;
-
+
// Get parent of contained node/attribute
xml_node parent() const;
// Safe bool conversion operator
operator unspecified_bool_type() const;
-
+
// Borland C++ workaround
bool operator!() const;
@@ -1142,10 +1142,10 @@ namespace pugi
type_sorted, // Sorted by document order (ascending)
type_sorted_reverse // Sorted by document order (descending)
};
-
+
// Constant iterator type
typedef const xpath_node* const_iterator;
-
+
// Default constructor. Constructs empty set.
xpath_node_set();
@@ -1154,38 +1154,38 @@ namespace pugi
// Destructor
~xpath_node_set();
-
+
// Copy constructor/assignment operator
xpath_node_set(const xpath_node_set& ns);
xpath_node_set& operator=(const xpath_node_set& ns);
// Get collection type
type_t type() const;
-
+
// Get collection size
size_t size() const;
// Indexing operator
const xpath_node& operator[](size_t index) const;
-
+
// Collection iterators
const_iterator begin() const;
const_iterator end() const;
// Sort the collection in ascending/descending order by document order
void sort(bool reverse = false);
-
+
// Get first node in the collection by document order
xpath_node first() const;
-
+
// Check if collection is empty
bool empty() const;
-
+
private:
type_t _type;
-
+
xpath_node _storage;
-
+
xpath_node* _begin;
xpath_node* _end;
@@ -1197,7 +1197,7 @@ namespace pugi
// Convert wide string to UTF8
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
-
+
// Convert UTF8 to wide string
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
@@ -1205,13 +1205,13 @@ namespace pugi
// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
typedef void* (*allocation_function)(size_t size);
-
+
// Memory deallocation function interface
typedef void (*deallocation_function)(void* ptr);
// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
-
+
// Get current memory management functions
allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
@@ -1253,7 +1253,7 @@ namespace std
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/pcfg-common/pcfg.cc
index cae6d4763..988367c9b 100644
--- a/phrase-extract/pcfg-common/pcfg.cc
+++ b/phrase-extract/pcfg-common/pcfg.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/pcfg-common/tool.cc
index f54e07a12..c41eaf9bd 100644
--- a/phrase-extract/pcfg-common/tool.cc
+++ b/phrase-extract/pcfg-common/tool.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc
index 29e46a9f2..f15a04811 100644
--- a/phrase-extract/pcfg-common/xml_tree_parser.cc
+++ b/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-extract/main.cc b/phrase-extract/pcfg-extract/main.cc
index 84051f2e2..24549990f 100644
--- a/phrase-extract/pcfg-extract/main.cc
+++ b/phrase-extract/pcfg-extract/main.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc
index a5e06aa82..becb8edee 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.cc
+++ b/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc
index 21e84d2fa..488fca6ae 100644
--- a/phrase-extract/pcfg-extract/rule_collection.cc
+++ b/phrase-extract/pcfg-extract/rule_collection.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc
index bb4698fae..6a99f7848 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.cc
+++ b/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-score/main.cc b/phrase-extract/pcfg-score/main.cc
index 5ce19f797..93db0837a 100644
--- a/phrase-extract/pcfg-score/main.cc
+++ b/phrase-extract/pcfg-score/main.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc
index a561c18ed..92f214c8f 100644
--- a/phrase-extract/pcfg-score/pcfg_score.cc
+++ b/phrase-extract/pcfg-score/pcfg_score.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc
index 53b6aaccf..9a40332d2 100644
--- a/phrase-extract/pcfg-score/tree_scorer.cc
+++ b/phrase-extract/pcfg-score/tree_scorer.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index ead9ebe03..57ef4f9d6 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -165,11 +165,18 @@ get-corpus
pass-unless: get-corpus-script
default-name: lm/txt
template: $get-corpus-script > OUT
+use-parallel-corpus
+ in: parallel-corpus-stem
+ out: tokenized-corpus
+ default-name: lm/tok
+ ignore-unless: parallel-corpus-stem
+ template: ln -s IN.$output-extension OUT
tokenize
in: raw-corpus
out: tokenized-corpus
default-name: lm/tok
pass-unless: output-tokenizer
+ ignore-if: parallel-corpus-stem
template: $output-tokenizer < IN > OUT
parallelizable: yes
mock-parse
diff --git a/scripts/other/buckwalter.perl b/scripts/other/buckwalter.perl
new file mode 100755
index 000000000..62544e212
--- /dev/null
+++ b/scripts/other/buckwalter.perl
@@ -0,0 +1,33 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Encode::Arabic::Buckwalter;
+use Getopt::Long "GetOptions";
+
+my $direction;
+GetOptions('direction=i' => \$direction)
+ or exit(1);
+# direction: 1=arabic->bw, 2=bw->arabic
+
+die("ERROR: need to set direction") unless defined($direction);
+
+
+
+while (my $line = <STDIN>) {
+ chomp($line);
+
+ my $lineOut;
+ if ($direction == 1) {
+ $lineOut = encode 'buckwalter', decode 'utf8', $line;
+ }
+ elsif ($direction == 2) {
+ $lineOut = encode 'utf8', decode 'buckwalter', $line;
+ }
+ else {
+ die("Unknown direction: $direction");
+ }
+ print "$lineOut\n";
+
+}
+
diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py
new file mode 100644
index 000000000..76736da5c
--- /dev/null
+++ b/scripts/tokenizer/pre_tokenize_cleaning.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python -*- coding: utf-8 -*-
+
+"""
+The Gacha filter cleans out sentence pairs that have global character mean
+lower than a certain threshold.
+
+Use this cleaner to produce low quantity of high quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
+(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
+
+This is inspired by the global character mean that is used in the Gale-Church
+algorithm (Gale aand Church, 1993), the c variable in:
+
+ delta = (l2-l1*c)/math.sqrt(l1*s2)
+
+where:
+ - l1 = len(source_sentence)
+ - l2 = len(target_sentence)
+ - c = global mean, i.e. #char in source corpus / #char in target corpus
+ - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1)
+
+(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
+"""
+
+import io, subprocess
+
+red = '\033[01;31m'
+native = '\033[m'
+
+def err_msg(txt):
+ return red+txt+native
+
+def num_char(filename):
+ return float(subprocess.Popen(["wc", "-m", filename],
+ stdout=subprocess.PIPE).stdout.read().split()[0])
+
+def gacha_mean(sourcefile, targetfile):
+ """
+ Counts the global character mean between source and target language as
+ in Gale-Church (1993)
+ """
+ sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
+ c = num_char(sourcefile) / num_char(targetfile)
+ sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
+ sys.stderr.write(err_msg('Filtering starts ...\n'))
+ return c
+
+def main(sourcefile, targetfile, threshold=0.2):
+ # Calculates Gacha mean.
+ c = gacha_mean(sourcefile, targetfile)
+ # Calculates lower and upperbound for filtering
+ threshold = float(threshold)
+ lowerbound = (1-threshold) * c
+ upperbound = (1+threshold) * c
+
+ # Start filtering sentences.
+ with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
+ io.open(targetfile, 'r', encoding='utf8') as trgfin:
+ for s, t in zip(srcfin, trgfin):
+ if lowerbound < len(s) / float(len(t)) < upperbound:
+ print(u"{}\t{}\n".format(s.strip(),t.strip()))
+
+if __name__ == '__main__':
+ import sys
+ if len(sys.argv) not in range(3,5):
+ usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
+ % sys.argv[0])
+
+ example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de '
+ '~/Europarl.de-en.en 0.4\n'
+ % sys.argv[0])
+ sys.stderr.write(usage_msg)
+ sys.stderr.write(example_msg)
+ sys.exit(1)
+
+ main(*sys.argv[1:])
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index 7dec0762c..1464fdb73 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -37,6 +37,7 @@ my $ZCAT = "gzip -cd";
# get optional parameters
my $opt_hierarchical = 0;
my $binarizer = undef;
+my $threads = 1; # Default is single-thread, i.e. $threads=1
my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
my $min_score = undef;
my $opt_min_non_initial_rule_count = undef;
@@ -54,6 +55,7 @@ GetOptions(
"SyntaxFilterCmd=s" => \$syntax_filter_cmd,
"tempdir=s" => \$tempdir,
"MinScore=s" => \$min_score,
+ "threads=i" => \$threads,
"MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED
) or exit(1);
@@ -63,7 +65,7 @@ my $config = shift;
my $input = shift;
if (!defined $dir || !defined $config || !defined $input) {
- print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n";
+ print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n";
exit 1;
}
$dir = ensure_full_path($dir);
@@ -405,7 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
# ... phrase translation model
elsif ($binarizer =~ /processPhraseTableMin/) {
#compact phrase table
- my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted";
+ my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
safesystem($cmd) or die "Can't binarize";
} elsif ($binarizer =~ /CreateOnDiskPt/) {
my $cmd = "$binarizer $mid_file $new_file.bin";
@@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
$lexbin =~ s/PhraseTable/LexicalTable/;
my $cmd;
if ($lexbin =~ /processLexicalTableMin/) {
- $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted";
+ $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
} else {
$lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
$cmd = "$lexbin -in $mid_file -out $new_file";
diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
index 209daf1c0..ca2a06015 100644
--- a/scripts/training/rdlm/README
+++ b/scripts/training/rdlm/README
@@ -31,8 +31,8 @@ RDLM is split into two neural network models, which can be trained with
mkdir working_dir_head
mkdir working_dir_label
- ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise-samples 100
- ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50
+ ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise 100
+ ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise 50
for more options, run `train_rdlm.py --help`. Parameters you may want to adjust
include the vocabulary size of the label model (depending on the number of
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index eca1b3a49..f3ce41080 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -113,13 +113,14 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.extend(parent_heads)
int_list.extend(parent_labels)
+ # write root of tree
if options.mode == 'label':
int_list.append(output_vocab.get(label, 0))
- sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+ options.output.write(' '.join(map(str, int_list)) + '\n')
elif options.mode == 'head' and not head == '<dummy_head>':
int_list.append(vocab.get(label, 0))
int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
- sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+ options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(head, 0))
parent_labels.append(vocab.get(label, 0))
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 55ecbe554..6d017602e 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -59,10 +59,6 @@ def get_head(xml, args):
preterminal = child.get('label')
head = escape_text(child.text.strip())
- # hack for split compounds
- elif child[-1].get('label') == 'SEGMENT':
- return escape_text(child[-1].text.strip()), 'SEGMENT'
-
elif args.ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index 1e7ecac52..15e56c430 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -43,7 +43,7 @@ parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar
parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
-parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
+parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
parser.set_defaults(
@@ -95,7 +95,7 @@ def prepare_vocabulary(options):
filtered_vocab = open(orig).readlines()
orig = vocab_prefix + '.nonterminals'
filtered_vocab += open(orig).readlines()
- filtered_vocab = [word for word in filtered_vocab if not word.startswith(prefix) for prefix in blacklist]
+ filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
if options.output_vocab_size:
filtered_vocab = filtered_vocab[:options.output_vocab_size]
else:
@@ -127,12 +127,13 @@ def main(options):
sys.stderr.write('extracting syntactic n-grams\n')
extract_syntactic_ngrams.main(extract_options)
- if validation_corpus:
- extract_options.input = options.validation_corpus
- options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
- extract_options.output = options.validation_file
+ if options.validation_corpus:
+ extract_options.input = open(options.validation_corpus)
+ options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus))
+ extract_options.output = open(options.validation_file + '.numberized', 'w')
sys.stderr.write('extracting syntactic n-grams (validation file)\n')
extract_syntactic_ngrams.main(extract_options)
+ extract_options.output.close()
sys.stderr.write('training neural network\n')
train_nplm.main(options)
@@ -141,8 +142,8 @@ def main(options):
ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
options.nplm_home,
os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
- os.path.join(options.working_dir, options.corpus_stem + '.numberized'),
- os.path.join(options.output_dir, options.output_model + '.model.nplm.')
+ os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+ os.path.join(options.output_dir, options.output_model + '.model.nplm')
])
if ret:
raise Exception("averaging null words failed")
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index fb63d4bbd..4c355479c 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl
use warnings;
use strict;
diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl
index 6e7efe245..5c1d0404f 100755
--- a/scripts/training/wrappers/madamira-wrapper.perl
+++ b/scripts/training/wrappers/madamira-wrapper.perl
@@ -8,21 +8,38 @@ use File::Basename;
use FindBin qw($RealBin);
use Cwd 'abs_path';
+sub GetFactors;
+
+
my $TMPDIR = "tmp";
my $SCHEME = "D2";
my $KEEP_TMP = 0;
my $MADA_DIR;
+my $CONFIG;
+
+my $FACTORS_STR;
+my @FACTORS;
GetOptions(
"scheme=s" => \$SCHEME,
"tmpdir=s" => \$TMPDIR,
"keep-tmp" => \$KEEP_TMP,
- "mada-dir=s" => \$MADA_DIR
+ "mada-dir=s" => \$MADA_DIR,
+ "factors=s" => \$FACTORS_STR,
+ "config=s" => \$CONFIG
) or die("ERROR: unknown options");
+if (!defined($CONFIG)) {
+ $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml";
+}
+
$TMPDIR = abs_path($TMPDIR);
print STDERR "TMPDIR=$TMPDIR \n";
+if (defined($FACTORS_STR)) {
+ @FACTORS = split(",", $FACTORS_STR);
+}
+
#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");
@@ -54,7 +71,7 @@ else {
$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
`$cmd`;
-$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*";
+$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
print STDERR "Executing: $cmd\n";
`$cmd`;
@@ -66,7 +83,7 @@ print STDERR "Executing: $cmd\n";
open(MADA_OUT,"<$infile.mada");
#binmode(MADA_OUT, ":utf8");
while(my $line = <MADA_OUT>) {
- chop($line);
+ chomp($line);
#print STDERR "line=$line \n";
if (index($line, "SENTENCE BREAK") == 0) {
@@ -75,13 +92,21 @@ while(my $line = <MADA_OUT>) {
print "\n";
}
elsif (index($line, ";;WORD") == 0) {
- # word
+ # word
my $word = substr($line, 7, length($line) - 8);
- #print STDERR "FOund $word\n";
+ #print STDERR "FOund $word\n";
+
+ for (my $i = 0; $i < 4; ++$i) {
+ $line = <MADA_OUT>;
+ }
+
+ my $factors = GetFactors($line, \@FACTORS);
+ $word .= $factors;
+
print "$word ";
}
else {
- #print STDERR "NADA\n";
+ #print STDERR "NADA\n";
}
}
close (MADA_OUT);
@@ -91,3 +116,33 @@ if ($KEEP_TMP == 0) {
# `rm -rf $TMPDIR`;
}
+
+###########################
+sub GetFactors
+{
+ my $line = shift;
+ my $factorsRef = shift;
+ my @factors = @{$factorsRef};
+
+ # all factors
+ my %allFactors;
+ my @toks = split(" ", $line);
+ for (my $i = 1; $i < scalar(@toks); ++$i) {
+ #print " tok=" .$toks[$i];
+
+ my ($key, $value) = split(":", $toks[$i]);
+ $allFactors{$key} = $value;
+ }
+
+ my $ret = "";
+ my $factorType;
+ foreach $factorType(@factors) {
+ #print "factorType=$factorType ";
+ my $value = $allFactors{$factorType};
+
+ $ret .= "|$value";
+ }
+
+ return $ret;
+}
+
diff --git a/search/applied.hh b/search/applied.hh
index 88961775f..9464080cc 100644
--- a/search/applied.hh
+++ b/search/applied.hh
@@ -9,12 +9,12 @@
namespace search {
-// A full hypothesis: a score, arity of the rule, a pointer to the decoder's rule (Note), and pointers to non-terminals that were substituted.
+// A full hypothesis: a score, arity of the rule, a pointer to the decoder's rule (Note), and pointers to non-terminals that were substituted.
template <class Below> class GenericApplied : public Header {
public:
GenericApplied() {}
- GenericApplied(void *location, PartialEdge partial)
+ GenericApplied(void *location, PartialEdge partial)
: Header(location) {
memcpy(Base(), partial.Base(), kHeaderSize);
Below *child_out = Children();
@@ -23,7 +23,7 @@ template <class Below> class GenericApplied : public Header {
for (; part != part_end_loop; ++part, ++child_out)
*child_out = Below(part->End());
}
-
+
GenericApplied(void *location, Score score, Arity arity, Note note, Moses::WordsRange range) : Header(location, arity) {
SetScore(score);
SetNote(note);
@@ -46,7 +46,7 @@ template <class Below> class GenericApplied : public Header {
}
};
-// Applied rule that references itself.
+// Applied rule that references itself.
class Applied : public GenericApplied<Applied> {
private:
typedef GenericApplied<Applied> P;
@@ -57,7 +57,7 @@ class Applied : public GenericApplied<Applied> {
Applied(History from) : P(from) {}
};
-// How to build single-best hypotheses.
+// How to build single-best hypotheses.
class SingleBest {
public:
typedef PartialEdge Combine;
@@ -68,7 +68,7 @@ class SingleBest {
}
NBestComplete Complete(PartialEdge partial) {
- if (!partial.Valid())
+ if (!partial.Valid())
return NBestComplete(NULL, lm::ngram::ChartState(), -INFINITY);
void *place_final = pool_.Allocate(Applied::Size(partial.GetArity()));
Applied(place_final, partial);
diff --git a/search/config.hh b/search/config.hh
index ba18c09e9..dd52303cb 100644
--- a/search/config.hh
+++ b/search/config.hh
@@ -10,7 +10,7 @@ struct NBestConfig {
keep = in_size;
size = in_size;
}
-
+
unsigned int keep, size;
};
diff --git a/search/edge.hh b/search/edge.hh
index 187904bf9..cee96b474 100644
--- a/search/edge.hh
+++ b/search/edge.hh
@@ -16,13 +16,13 @@ namespace search {
// Copyable, but the copy will be shallow.
class PartialEdge : public Header {
public:
- // Allow default construction for STL.
+ // Allow default construction for STL.
PartialEdge() {}
- PartialEdge(util::Pool &pool, Arity arity)
+ PartialEdge(util::Pool &pool, Arity arity)
: Header(pool.Allocate(Size(arity, arity + 1)), arity) {}
-
- PartialEdge(util::Pool &pool, Arity arity, Arity chart_states)
+
+ PartialEdge(util::Pool &pool, Arity arity, Arity chart_states)
: Header(pool.Allocate(Size(arity, chart_states)), arity) {}
// Non-terminals
diff --git a/search/edge_generator.cc b/search/edge_generator.cc
index 1f933453d..5a6431807 100644
--- a/search/edge_generator.cc
+++ b/search/edge_generator.cc
@@ -34,7 +34,7 @@ template <class Model> void FastScore(const Context<Model> &context, Arity victi
adjustment += lm::ngram::Subsume(context.LanguageModel(), before->left, before->right, after->left, after->right, update_reveal.left.length);
}
before->right = after->right;
- // Shift the others shifted one down, covering after.
+ // Shift the others shifted one down, covering after.
for (lm::ngram::ChartState *cover = after; cover < between + incomplete; ++cover) {
*cover = *(cover + 1);
}
@@ -55,7 +55,7 @@ template <class Model> PartialEdge EdgeGenerator::Pop(Context<Model> &context) {
Arity victim_completed;
Arity incomplete;
unsigned char lowest_niceness = 255;
- // Select victim or return if complete.
+ // Select victim or return if complete.
{
Arity completed = 0;
for (Arity i = 0; i != arity; ++i) {
@@ -89,20 +89,20 @@ template <class Model> PartialEdge EdgeGenerator::Pop(Context<Model> &context) {
memcpy(alternate.Between(), top.Between(), sizeof(lm::ngram::ChartState) * (incomplete + 1));
- // TODO: dedupe?
+ // TODO: dedupe?
generate_.push(alternate);
}
-#ifndef NDEBUG
+#ifndef NDEBUG
Score before = top.GetScore();
#endif
// top is now the continuation.
FastScore(context, victim, victim - victim_completed, incomplete, old_value, top);
- // TODO: dedupe?
+ // TODO: dedupe?
generate_.push(top);
assert(lowest_niceness != 254 || top.GetScore() == before);
- // Invalid indicates no new hypothesis generated.
+ // Invalid indicates no new hypothesis generated.
return PartialEdge();
}
diff --git a/search/header.hh b/search/header.hh
index d70524097..699d8ef15 100644
--- a/search/header.hh
+++ b/search/header.hh
@@ -10,7 +10,7 @@
namespace search {
-// Copying is shallow.
+// Copying is shallow.
class Header {
public:
bool Valid() const { return base_; }
diff --git a/search/nbest.cc b/search/nbest.cc
index 43ed702cb..7cf84dbb4 100644
--- a/search/nbest.cc
+++ b/search/nbest.cc
@@ -40,7 +40,7 @@ const std::vector<Applied> &NBestList::Extract(util::Pool &pool, std::size_t n)
Score NBestList::Visit(util::Pool &pool, std::size_t index) {
if (index + 1 < revealed_.size())
return revealed_[index + 1].GetScore() - revealed_[index].GetScore();
- if (queue_.empty())
+ if (queue_.empty())
return -INFINITY;
if (index + 1 == revealed_.size())
return queue_.top().GetScore() - revealed_[index].GetScore();
@@ -81,7 +81,7 @@ void NBestList::MoveTop(util::Pool &pool) {
if (child->index_) break;
}
- // Convert QueueEntry to Applied. This leaves some unused memory.
+ // Convert QueueEntry to Applied. This leaves some unused memory.
void *overwrite = entry.Children();
for (unsigned int i = 0; i < entry.GetArity(); ++i) {
RevealedRef from(*(static_cast<const RevealedRef*>(overwrite) + i));
diff --git a/search/nbest.hh b/search/nbest.hh
index 0aa1dfd57..17b08da38 100644
--- a/search/nbest.hh
+++ b/search/nbest.hh
@@ -19,8 +19,8 @@ class NBestList;
class NBestList {
private:
class RevealedRef {
- public:
- explicit RevealedRef(History history)
+ public:
+ explicit RevealedRef(History history)
: in_(static_cast<NBestList*>(history)), index_(0) {}
private:
@@ -29,7 +29,7 @@ class NBestList {
NBestList *in_;
std::size_t index_;
};
-
+
typedef GenericApplied<RevealedRef> QueueEntry;
public:
diff --git a/search/rule.hh b/search/rule.hh
index 43ca61625..dc45f6634 100644
--- a/search/rule.hh
+++ b/search/rule.hh
@@ -16,8 +16,8 @@ struct ScoreRuleRet {
unsigned int oov;
};
-// Pass <s> and </s> normally.
-// Indicate non-terminals with kNonTerminal.
+// Pass <s> and </s> normally.
+// Indicate non-terminals with kNonTerminal.
template <class Model> ScoreRuleRet ScoreRule(const Model &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *state_out);
} // namespace search
diff --git a/search/types.hh b/search/types.hh
index 832ef159f..e1239dfd0 100644
--- a/search/types.hh
+++ b/search/types.hh
@@ -18,7 +18,7 @@ union Note {
typedef void *History;
struct NBestComplete {
- NBestComplete(History in_history, const lm::ngram::ChartState &in_state, Score in_score)
+ NBestComplete(History in_history, const lm::ngram::ChartState &in_state, Score in_score)
: history(in_history), state(&in_state), score(in_score) {}
History history;
diff --git a/search/vertex.cc b/search/vertex.cc
index 305ea0c99..cb1df7e51 100644
--- a/search/vertex.cc
+++ b/search/vertex.cc
@@ -20,7 +20,7 @@ class DivideLeft {
: index_(index) {}
uint64_t operator()(const lm::ngram::ChartState &state) const {
- return (index_ < state.left.length) ?
+ return (index_ < state.left.length) ?
state.left.pointers[index_] :
(kCompleteAdd - state.left.full);
}
@@ -71,7 +71,7 @@ uint64_t Identify(const lm::ngram::Left &left, unsigned char index) {
template <class Side> class DetermineSame {
public:
- DetermineSame(const Side &side, unsigned char guaranteed)
+ DetermineSame(const Side &side, unsigned char guaranteed)
: side_(side), guaranteed_(guaranteed), shared_(side.length), complete_(true) {}
void Consider(const Side &other) {
diff --git a/search/vertex.hh b/search/vertex.hh
index f560dc8c6..e04325dc3 100644
--- a/search/vertex.hh
+++ b/search/vertex.hh
@@ -54,7 +54,7 @@ class VertexNode {
void BuildExtend();
- // Should only happen to a root node when the entire vertex is empty.
+ // Should only happen to a root node when the entire vertex is empty.
bool Empty() const {
return hypos_.empty() && extend_.empty();
}
@@ -74,7 +74,7 @@ class VertexNode {
return bound_;
}
- // Will be invalid unless this is a leaf.
+ // Will be invalid unless this is a leaf.
const History End() const {
assert(hypos_.size() == 1);
return hypos_.front().history;
diff --git a/search/vertex_generator.hh b/search/vertex_generator.hh
index 328da7933..6013cdb7d 100644
--- a/search/vertex_generator.hh
+++ b/search/vertex_generator.hh
@@ -15,7 +15,7 @@ namespace search {
class ContextBase;
-// Output makes the single-best or n-best list.
+// Output makes the single-best or n-best list.
template <class Output> class VertexGenerator {
public:
VertexGenerator(ContextBase &context, Vertex &gen, Output &nbest) : context_(context), gen_(gen), nbest_(nbest) {}
@@ -49,7 +49,7 @@ template <class Output> class VertexGenerator {
// Special case for root vertex: everything should come together into the root
// node. In theory, this should happen naturally due to state collapsing with
// <s> and </s>. If that's the case, VertexGenerator is fine, though it will
-// make one connection.
+// make one connection.
template <class Output> class RootVertexGenerator {
public:
RootVertexGenerator(Vertex &gen, Output &out) : gen_(gen), out_(out) {}
@@ -66,7 +66,7 @@ template <class Output> class RootVertexGenerator {
private:
Vertex &gen_;
-
+
typename Output::Combine combine_;
Output &out_;
};
diff --git a/symal/symal.cpp b/symal/symal.cpp
index 249aa6caa..7f3e22866 100644
--- a/symal/symal.cpp
+++ b/symal/symal.cpp
@@ -15,9 +15,9 @@
using namespace std;
-#define MAX_WORD 10000 // maximum lengthsource/target strings
+#define MAX_WORD 10000 // maximum lengthsource/target strings
#define MAX_M 400 // maximum length of source strings
-#define MAX_N 400 // maximum length of target strings
+#define MAX_N 400 // maximum length of target strings
#define UNION 1
#define INTERSECT 2
@@ -512,6 +512,6 @@ int main(int argc, char** argv)
if (out != &std::cout) {
delete inp;
}
-
+
exit(0);
}
diff --git a/util/Jamfile b/util/Jamfile
index a82a5e23d..2d3cede01 100644
--- a/util/Jamfile
+++ b/util/Jamfile
@@ -21,7 +21,7 @@ obj file_piece_test.o : file_piece_test.cc /top//boost_unit_test_framework : $(c
fakelib parallel_read : parallel_read.cc : <threading>multi:<source>/top//boost_thread <threading>multi:<define>WITH_THREADS : : <include>.. ;
-fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc parallel_read pool.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : <include>.. <os>LINUX,<threading>single:<source>rt : : <include>.. ;
+fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc parallel_read pool.cc random.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : <include>.. <os>LINUX,<threading>single:<source>rt : : <include>.. ;
exe cat_compressed : cat_compressed_main.cc kenutil ;
diff --git a/util/bit_packing.cc b/util/bit_packing.cc
index d3f998d46..cffd9cf62 100644
--- a/util/bit_packing.cc
+++ b/util/bit_packing.cc
@@ -9,7 +9,7 @@ namespace {
template <bool> struct StaticCheck {};
template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };
-// If your float isn't 4 bytes, we're hosed.
+// If your float isn't 4 bytes, we're hosed.
typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;
} // namespace
@@ -34,7 +34,7 @@ void BitPackingSanity() {
if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1))
UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler.");
}
- // TODO: more checks.
+ // TODO: more checks.
}
} // namespace util
diff --git a/util/bit_packing.hh b/util/bit_packing.hh
index 3cf3cd4a1..b24fd9c1f 100644
--- a/util/bit_packing.hh
+++ b/util/bit_packing.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_BIT_PACKING_H
#define UTIL_BIT_PACKING_H
-/* Bit-level packing routines
+/* Bit-level packing routines
*
* WARNING WARNING WARNING:
* The write functions assume that memory is zero initially. This makes them
@@ -9,10 +9,10 @@
* These routines assume that unaligned access to uint64_t is fast. This is
* the case on x86_64. I'm not sure how fast unaligned 64-bit access is on
* x86 but my target audience is large language models for which 64-bit is
- * necessary.
+ * necessary.
*
* Call the BitPackingSanity function to sanity check. Calling once suffices,
- * but it may be called multiple times when that's inconvenient.
+ * but it may be called multiple times when that's inconvenient.
*
* ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
* NICT.
@@ -25,14 +25,14 @@
#include <endian.h>
#elif !defined(_WIN32) && !defined(_WIN64)
#include <arpa/nameser_compat.h>
-#endif
+#endif
#include <stdint.h>
#include <cstring>
namespace util {
-// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
+// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
#if BYTE_ORDER == LITTLE_ENDIAN
inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) {
return bit;
@@ -56,15 +56,15 @@ inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
#endif
}
-/* Pack integers up to 57 bits using their least significant digits.
+/* Pack integers up to 57 bits using their least significant digits.
* The length is specified using mask:
- * Assumes mask == (1 << length) - 1 where length <= 57.
+ * Assumes mask == (1 << length) - 1 where length <= 57.
*/
inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) {
return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask;
}
/* Assumes value < (1 << length) and length <= 57.
- * Assumes the memory is zero initially.
+ * Assumes the memory is zero initially.
*/
inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) {
#if defined(__arm) || defined(__arm__)
@@ -74,7 +74,7 @@ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t va
value64 |= (value << BitPackShift(bit_off & 7, length));
memcpy(base_off, &value64, sizeof(value64));
#else
- *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
+ *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
#endif
}
@@ -99,7 +99,7 @@ inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t va
value32 |= (value << BitPackShift(bit_off & 7, length));
memcpy(base_off, &value32, sizeof(value32));
#else
- *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
+ *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
#endif
}
@@ -136,7 +136,7 @@ inline void UnsetSign(float &to) {
inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) {
FloatEnc encoded;
encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31);
- // Sign bit set means negative.
+ // Sign bit set means negative.
encoded.i |= kSignBit;
return encoded.f;
}
@@ -150,7 +150,7 @@ inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) {
void BitPackingSanity();
// Return bits required to store integers upto max_value. Not the most
-// efficient implementation, but this is only called a few times to size tries.
+// efficient implementation, but this is only called a few times to size tries.
uint8_t RequiredBits(uint64_t max_value);
struct BitsMask {
diff --git a/util/cat_compressed_main.cc b/util/cat_compressed_main.cc
index 9ec8e81f7..0c7cda936 100644
--- a/util/cat_compressed_main.cc
+++ b/util/cat_compressed_main.cc
@@ -21,7 +21,7 @@ int main(int argc, char *argv[]) {
char *arg = argv[i];
if (!strcmp(arg, "--")) break;
if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
- std::cerr <<
+ std::cerr <<
"A cat implementation that interprets compressed files.\n"
"Usage: " << argv[0] << " [file1] [file2] ...\n"
"If no file is provided, then stdin is read.\n";
diff --git a/util/ersatz_progress.cc b/util/ersatz_progress.cc
index 498ab5c58..55c82e7af 100644
--- a/util/ersatz_progress.cc
+++ b/util/ersatz_progress.cc
@@ -17,7 +17,7 @@ ErsatzProgress::~ErsatzProgress() {
if (out_) Finished();
}
-ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
+ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
: current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
if (!out_) {
next_ = std::numeric_limits<uint64_t>::max();
diff --git a/util/ersatz_progress.hh b/util/ersatz_progress.hh
index b3bef7fa6..b47aded7d 100644
--- a/util/ersatz_progress.hh
+++ b/util/ersatz_progress.hh
@@ -6,7 +6,7 @@
#include <stdint.h>
// Ersatz version of boost::progress so core language model doesn't depend on
-// boost. Also adds option to print nothing.
+// boost. Also adds option to print nothing.
namespace util {
@@ -14,10 +14,10 @@ extern const char kProgressBanner[];
class ErsatzProgress {
public:
- // No output.
+ // No output.
ErsatzProgress();
- // Null means no output. The null value is useful for passing along the ostream pointer from another caller.
+ // Null means no output. The null value is useful for passing along the ostream pointer from another caller.
explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
~ErsatzProgress();
diff --git a/util/exception.cc b/util/exception.cc
index 32d48516a..588f5eae5 100644
--- a/util/exception.cc
+++ b/util/exception.cc
@@ -30,7 +30,7 @@ void Exception::SetLocation(const char *file, unsigned int line, const char *fun
/* The child class might have set some text, but we want this to come first.
* Another option would be passing this information to the constructor, but
* then child classes would have to accept constructor arguments and pass
- * them down.
+ * them down.
*/
text_ = stream_.str();
stream_.str("");
diff --git a/util/exception.hh b/util/exception.hh
index 2b503e50e..7a0e7c44a 100644
--- a/util/exception.hh
+++ b/util/exception.hh
@@ -19,10 +19,10 @@ class Exception : public std::exception {
Exception(const Exception &from);
Exception &operator=(const Exception &from);
- // Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe.
+ // Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe.
const char *what() const throw();
- // For use by the UTIL_THROW macros.
+ // For use by the UTIL_THROW macros.
void SetLocation(
const char *file,
unsigned int line,
@@ -33,7 +33,7 @@ class Exception : public std::exception {
private:
template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
- // This helps restrict operator<< defined below.
+ // This helps restrict operator<< defined below.
template <class T> struct ExceptionTag {
typedef T Identity;
};
@@ -42,9 +42,9 @@ class Exception : public std::exception {
mutable std::string text_;
};
-/* This implements the normal operator<< for Exception and all its children.
+/* This implements the normal operator<< for Exception and all its children.
* SFINAE means it only applies to Exception. Think of this as an ersatz
- * boost::enable_if.
+ * boost::enable_if.
*/
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
e.stream_ << data;
@@ -63,10 +63,10 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
/* Create an instance of Exception, add the message Modify, and throw it.
* Modify is appended to the what() message and can contain << for ostream
- * operations.
+ * operations.
*
* do .. while kludge to swallow trailing ; character
- * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
+ * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
* Arg can be a constructor argument to the exception.
*/
#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
@@ -123,7 +123,7 @@ class FileOpenException : public Exception {
~FileOpenException() throw() {}
};
-// Utilities for overflow checking.
+// Utilities for overflow checking.
class OverflowException : public Exception {
public:
OverflowException() throw();
diff --git a/util/file.hh b/util/file.hh
index ca52dbfba..bd5873cbc 100644
--- a/util/file.hh
+++ b/util/file.hh
@@ -73,12 +73,12 @@ class EndOfFileException : public Exception {
~EndOfFileException() throw();
};
-// Open for read only.
+// Open for read only.
int OpenReadOrThrow(const char *name);
-// Create file if it doesn't exist, truncate if it does. Opened for write.
+// Create file if it doesn't exist, truncate if it does. Opened for write.
int CreateOrThrow(const char *name);
-// Return value for SizeFile when it can't size properly.
+// Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1;
uint64_t SizeFile(int fd);
uint64_t SizeOrThrow(int fd);
diff --git a/util/file_piece.cc b/util/file_piece.cc
index 37e492672..c808e7d90 100644
--- a/util/file_piece.cc
+++ b/util/file_piece.cc
@@ -26,10 +26,10 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
-// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
+// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
Initialize(name, show_progress, min_buffer);
@@ -42,7 +42,7 @@ std::string NamePossiblyFind(int fd, const char *name) {
}
} // namespace
-FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) {
Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer);
@@ -56,7 +56,7 @@ FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buf
data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
position_ = data_.begin();
position_end_ = position_;
-
+
fell_back_.Reset(stream);
}
@@ -118,9 +118,9 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
InitializeNoRead(name, min_buffer);
if (total_size_ == kBadSize) {
- // So the assertion passes.
+ // So the assertion passes.
fallback_to_read_ = false;
- if (show_progress)
+ if (show_progress)
*show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
TransitionToRead();
} else {
@@ -214,7 +214,7 @@ void FilePiece::Shift() {
uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
if (!fallback_to_read_) MMapShift(desired_begin);
- // Notice an mmap failure might set the fallback.
+ // Notice an mmap failure might set the fallback.
if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
@@ -223,13 +223,13 @@ void FilePiece::Shift() {
}
void FilePiece::MMapShift(uint64_t desired_begin) {
- // Use mmap.
+ // Use mmap.
uint64_t ignore = desired_begin % page_;
- // Duplicate request for Shift means give more data.
+ // Duplicate request for Shift means give more data.
if (position_ == data_.begin() + ignore && position_) {
default_map_size_ *= 2;
}
- // Local version so that in case of failure it doesn't overwrite the class variable.
+ // Local version so that in case of failure it doesn't overwrite the class variable.
uint64_t mapped_offset = desired_begin - ignore;
uint64_t mapped_size;
@@ -240,7 +240,7 @@ void FilePiece::MMapShift(uint64_t desired_begin) {
mapped_size = default_map_size_;
}
- // Forcibly clear the existing mmap first.
+ // Forcibly clear the existing mmap first.
data_.reset();
try {
MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
@@ -248,7 +248,7 @@ void FilePiece::MMapShift(uint64_t desired_begin) {
if (desired_begin) {
SeekOrThrow(*file_, desired_begin);
}
- // The mmap was scheduled to end the file, but now we're going to read it.
+ // The mmap was scheduled to end the file, but now we're going to read it.
at_end_ = false;
TransitionToRead();
return;
@@ -278,10 +278,10 @@ void FilePiece::TransitionToRead() {
void FilePiece::ReadShift() {
assert(fallback_to_read_);
- // Bytes [data_.begin(), position_) have been consumed.
- // Bytes [position_, position_end_) have been read into the buffer.
+ // Bytes [data_.begin(), position_) have been consumed.
+ // Bytes [position_, position_end_) have been read into the buffer.
- // Start at the beginning of the buffer if there's nothing useful in it.
+ // Start at the beginning of the buffer if there's nothing useful in it.
if (position_ == position_end_) {
mapped_offset_ += (position_end_ - data_.begin());
position_ = data_.begin();
@@ -292,7 +292,7 @@ void FilePiece::ReadShift() {
if (already_read == default_map_size_) {
if (position_ == data_.begin()) {
- // Buffer too small.
+ // Buffer too small.
std::size_t valid_length = position_end_ - position_;
default_map_size_ *= 2;
data_.call_realloc(default_map_size_);
diff --git a/util/fixed_array.hh b/util/fixed_array.hh
index e2aa8ae7d..610cbdf12 100644
--- a/util/fixed_array.hh
+++ b/util/fixed_array.hh
@@ -23,22 +23,22 @@ template <class T> class FixedArray {
Init(limit);
}
- /**
+ /**
* Constructs an instance, but does not initialize it.
*
* Any objects constructed in this manner must be subsequently @ref FixedArray::Init() "initialized" prior to use.
*
* @see FixedArray::Init()
*/
- FixedArray()
- : newed_end_(NULL)
+ FixedArray()
+ : newed_end_(NULL)
#ifndef NDEBUG
- , allocated_end_(NULL)
+ , allocated_end_(NULL)
#endif
{}
- /**
- * Initialize with a given size bound but do not construct the objects.
+ /**
+ * Initialize with a given size bound but do not construct the objects.
*
* This method is responsible for allocating memory.
* Objects stored in this array will be constructed in a location within this allocated memory.
@@ -73,37 +73,37 @@ template <class T> class FixedArray {
/** Gets a pointer to the first object currently stored in this data structure. */
T *begin() { return static_cast<T*>(block_.get()); }
-
+
/** Gets a const pointer to the last object currently stored in this data structure. */
const T *begin() const { return static_cast<const T*>(block_.get()); }
-
+
/** Gets a pointer to the last object currently stored in this data structure. */
T *end() { return newed_end_; }
-
+
/** Gets a const pointer to the last object currently stored in this data structure. */
const T *end() const { return newed_end_; }
/** Gets a reference to the last object currently stored in this data structure. */
T &back() { return *(end() - 1); }
-
+
/** Gets a const reference to the last object currently stored in this data structure. */
const T &back() const { return *(end() - 1); }
/** Gets the number of objects currently stored in this data structure. */
std::size_t size() const { return end() - begin(); }
-
+
/** Returns true if there are no objects currently stored in this data structure. */
bool empty() const { return begin() == end(); }
- /**
- * Gets a reference to the object with index i currently stored in this data structure.
+ /**
+ * Gets a reference to the object with index i currently stored in this data structure.
*
* @param i Index of the object to reference
*/
T &operator[](std::size_t i) { return begin()[i]; }
-
- /**
- * Gets a const reference to the object with index i currently stored in this data structure.
+
+ /**
+ * Gets a const reference to the object with index i currently stored in this data structure.
*
* @param i Index of the object to reference
*/
diff --git a/util/getopt.c b/util/getopt.c
index 992c96b0c..50eef42cc 100644
--- a/util/getopt.c
+++ b/util/getopt.c
@@ -3,7 +3,7 @@ POSIX getopt for Windows
AT&T Public License
-Code given out at the 1985 UNIFORUM conference in Dallas.
+Code given out at the 1985 UNIFORUM conference in Dallas.
*/
#ifndef __GNUC__
diff --git a/util/getopt.hh b/util/getopt.hh
index 50eab56f4..9b0792b04 100644
--- a/util/getopt.hh
+++ b/util/getopt.hh
@@ -3,7 +3,7 @@ POSIX getopt for Windows
AT&T Public License
-Code given out at the 1985 UNIFORUM conference in Dallas.
+Code given out at the 1985 UNIFORUM conference in Dallas.
*/
#ifdef __GNUC__
diff --git a/util/mmap.cc b/util/mmap.cc
index b91f181f4..7dcb57ba3 100644
--- a/util/mmap.cc
+++ b/util/mmap.cc
@@ -56,7 +56,7 @@ void UnmapOrThrow(void *start, size_t length) {
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
try {
- // Thanks Denis Filimonov for pointing out NFS likes msync first.
+ // Thanks Denis Filimonov for pointing out NFS likes msync first.
SyncOrThrow(data_, size_);
UnmapOrThrow(data_, size_);
} catch (const util::ErrnoException &e) {
diff --git a/util/mmap.hh b/util/mmap.hh
index 37feb5bee..9ac604975 100644
--- a/util/mmap.hh
+++ b/util/mmap.hh
@@ -1,6 +1,6 @@
#ifndef UTIL_MMAP_H
#define UTIL_MMAP_H
-// Utilities for mmaped files.
+// Utilities for mmaped files.
#include <cstddef>
#include <limits>
@@ -14,7 +14,7 @@ class scoped_fd;
long SizePage();
-// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
+// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
public:
scoped_mmap() : data_((void*)-1), size_(0) {}
@@ -47,13 +47,13 @@ class scoped_mmap {
/* For when the memory might come from mmap, new char[], or malloc. Uses NULL
* and 0 for blanks even though mmap signals errors with (void*)-1). The reset
- * function checks that blank for mmap.
+ * function checks that blank for mmap.
*/
class scoped_memory {
public:
typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc;
- scoped_memory(void *data, std::size_t size, Alloc source)
+ scoped_memory(void *data, std::size_t size, Alloc source)
: data_(data), size_(size), source_(source) {}
scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}
@@ -90,9 +90,9 @@ typedef enum {
LAZY,
// On linux, pass MAP_POPULATE to mmap.
POPULATE_OR_LAZY,
- // Populate on Linux. malloc and read on non-Linux.
+ // Populate on Linux. malloc and read on non-Linux.
POPULATE_OR_READ,
- // malloc and read.
+ // malloc and read.
READ,
// malloc and read in parallel (recommended for Lustre)
PARALLEL_READ,
@@ -110,18 +110,18 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
void MapAnonymous(std::size_t size, scoped_memory &to);
-// Open file name with mmap of size bytes, all of which are initially zero.
+// Open file name with mmap of size bytes, all of which are initially zero.
void *MapZeroedWrite(int fd, std::size_t size);
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
-// msync wrapper
+// msync wrapper
void SyncOrThrow(void *start, size_t length);
// Forward rolling memory map with no overlap.
class Rolling {
public:
Rolling() {}
-
+
explicit Rolling(void *data) { Init(data); }
Rolling(const Rolling &copy_from, uint64_t increase = 0);
@@ -163,7 +163,7 @@ class Rolling {
}
return ptr_;
}
-
+
// Returns indexed pointer.
void *CheckedIndex(uint64_t index) {
return static_cast<uint8_t*>(CheckedBase(index)) + index;
@@ -178,7 +178,7 @@ class Rolling {
void *ptr_;
uint64_t current_begin_;
uint64_t current_end_;
-
+
scoped_memory mem_;
int fd_;
diff --git a/util/multi_intersection.hh b/util/multi_intersection.hh
index 2955acc72..73954608e 100644
--- a/util/multi_intersection.hh
+++ b/util/multi_intersection.hh
@@ -22,7 +22,7 @@ template <class Range> struct RangeLessBySize : public std::binary_function<cons
* order. sets is changed to truncate the beginning of each sequence to the
* location of the match or an empty set. Precondition: sets is not empty
* since the intersection over null is the universe and this function does not
- * know the universe.
+ * know the universe.
*/
template <class Iterator, class Less> boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersectionSorted(std::vector<boost::iterator_range<Iterator> > &sets, const Less &less = std::less<typename std::iterator_traits<Iterator>::value_type>()) {
typedef std::vector<boost::iterator_range<Iterator> > Sets;
@@ -31,7 +31,7 @@ template <class Iterator, class Less> boost::optional<typename std::iterator_tra
assert(!sets.empty());
if (sets.front().empty()) return boost::optional<Value>();
- // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster.
+ // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster.
Value highest(sets.front().front());
for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) {
i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin());
diff --git a/util/multi_intersection_test.cc b/util/multi_intersection_test.cc
index 970afc171..ee5af7d61 100644
--- a/util/multi_intersection_test.cc
+++ b/util/multi_intersection_test.cc
@@ -8,7 +8,7 @@ namespace {
BOOST_AUTO_TEST_CASE(Empty) {
std::vector<boost::iterator_range<const unsigned int*> > sets;
-
+
sets.push_back(boost::iterator_range<const unsigned int*>(static_cast<const unsigned int*>(NULL), static_cast<const unsigned int*>(NULL)));
BOOST_CHECK(!FirstIntersection(sets));
}
diff --git a/util/murmur_hash.cc b/util/murmur_hash.cc
index d04e6ecd0..bf3249869 100644
--- a/util/murmur_hash.cc
+++ b/util/murmur_hash.cc
@@ -2,8 +2,8 @@
* code is released to the public domain. For business purposes, Murmurhash is
* under the MIT license."
* This is modified from the original:
- * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
- * length changed to unsigned int.
+ * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
+ * length changed to unsigned int.
* placed in namespace util
* add MurmurHashNative
* default option = 0 for seed
@@ -18,7 +18,7 @@ namespace util {
//-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
-// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
// 64-bit hash for 64-bit platforms
@@ -49,12 +49,12 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed )
uint64_t k = *data++;
#endif
- k *= m;
- k ^= k >> r;
- k *= m;
-
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
h ^= k;
- h *= m;
+ h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
@@ -70,13 +70,13 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed )
case 1: h ^= uint64_t(data2[0]);
h *= m;
};
-
+
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
-}
+}
// 64-bit hash for 32-bit platforms
@@ -151,7 +151,7 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed )
return h;
}
-// Trick to test for 64-bit architecture at compile time.
+// Trick to test for 64-bit architecture at compile time.
namespace {
#ifdef __clang__
#pragma clang diagnostic push
diff --git a/util/pcqueue.hh b/util/pcqueue.hh
index f74ef18e8..05c868fba 100644
--- a/util/pcqueue.hh
+++ b/util/pcqueue.hh
@@ -74,10 +74,10 @@ inline void WaitSemaphore (Semaphore &on) {
/**
* Producer consumer queue safe for multiple producers and multiple consumers.
- * T must be default constructable and have operator=.
+ * T must be default constructable and have operator=.
* The value is copied twice for Consume(T &out) or three times for Consume(),
* so larger objects should be passed via pointer.
- * Strong exception guarantee if operator= throws. Undefined if semaphores throw.
+ * Strong exception guarantee if operator= throws. Undefined if semaphores throw.
*/
template <class T> class PCQueue : boost::noncopyable {
public:
@@ -130,7 +130,7 @@ template <class T> class PCQueue : boost::noncopyable {
Consume(ret);
return ret;
}
-
+
private:
// Number of empty spaces in storage_.
Semaphore empty_;
diff --git a/util/pool.hh b/util/pool.hh
index fd55572cd..511b6d995 100644
--- a/util/pool.hh
+++ b/util/pool.hh
@@ -1,5 +1,5 @@
// Very simple pool. It can only allocate memory. And all of the memory it
-// allocates must be freed at the same time.
+// allocates must be freed at the same time.
#ifndef UTIL_POOL_H
#define UTIL_POOL_H
@@ -37,7 +37,7 @@ class Pool {
// no copying
Pool(const Pool &);
Pool &operator=(const Pool &);
-};
+};
} // namespace util
diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh
index 83fd0ec12..245340ddb 100644
--- a/util/probing_hash_table.hh
+++ b/util/probing_hash_table.hh
@@ -34,7 +34,7 @@ template <class EntryT, class HashT, class EqualT> class AutoProbing;
* Memory management and initialization is externalized to make it easier to
* serialize these to disk and load them quickly.
* Uses linear probing to find value.
- * Only insert and lookup operations.
+ * Only insert and lookup operations.
*/
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable {
public:
@@ -50,7 +50,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
return buckets * sizeof(Entry);
}
- // Must be assigned to later.
+ // Must be assigned to later.
ProbingHashTable() : entries_(0)
#ifdef DEBUG
, initialized_(false)
@@ -98,12 +98,12 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
return false;
}
if (++i == end_) i = begin_;
- }
+ }
}
void FinishedInserting() {}
- // Don't change anything related to GetKey,
+ // Don't change anything related to GetKey,
template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
#ifdef DEBUG
assert(initialized_);
@@ -136,7 +136,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
if (++i == end_) i = begin_;
- }
+ }
}
// Like Find but we're sure it must be there.
@@ -253,7 +253,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
#endif
};
-// Resizable linear probing hash table. This owns the memory.
+// Resizable linear probing hash table. This owns the memory.
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing {
private:
typedef ProbingHashTable<EntryT, HashT, EqualT> Backend;
diff --git a/util/random.cc b/util/random.cc
new file mode 100644
index 000000000..b1eea2b50
--- /dev/null
+++ b/util/random.cc
@@ -0,0 +1,43 @@
+#include "util/random.hh"
+
+#include <cstdlib>
+
+#include <boost/thread/locks.hpp>
+// #include <boost/thread/lock_guard.hpp>
+#include <boost/thread/mutex.hpp>
+
+namespace util
+{
+namespace
+{
+/** Lock to protect randomizer.
+ *
+ * This module is implemented in terms of rand()/srand() from <cstdlib>.
+ * These functions are standard C, but they're not thread-safe. Scalability
+ * is not worth much complexity here, so just slap a mutex around it.
+ */
+boost::mutex rand_lock;
+} // namespace
+
+void rand_init(unsigned int seed)
+{
+ boost::lock_guard<boost::mutex> lock(rand_lock);
+ srand(seed);
+}
+
+
+void rand_init()
+{
+ rand_init(time(NULL));
+}
+
+namespace internal
+{
+// This is the one call to the actual randomizer. All else is built on this.
+int rand_int()
+{
+ boost::lock_guard<boost::mutex> lock(rand_lock);
+ return std::rand();
+}
+} // namespace internal
+} // namespace util
diff --git a/util/random.hh b/util/random.hh
new file mode 100644
index 000000000..6c2773520
--- /dev/null
+++ b/util/random.hh
@@ -0,0 +1,229 @@
+#ifndef UTIL_RANDOM_H
+#define UTIL_RANDOM_H
+
+#include <cstdlib>
+#include <limits>
+
+namespace util
+{
+/** Thread-safe, cross-platform random number generator.
+ *
+ * This is not for proper security-grade randomness, but should be "good
+ * enough" for producing arbitrary values of various numeric types.
+ *
+ * Before starting, call rand_init() to seed the randomizer. There is no need
+ * to do this more than once; in fact doing it more often is likely to make the
+ * randomizer less effective. Once that is done, call the rand(), rand_excl(),
+ * and rand_incl() functions as needed to generate pseudo-random numbers.
+ *
+ * Probability distribution is roughly uniform, but for integral types is
+ * skewed slightly towards lower numbers depending on how close "top" comes to
+ * RAND_MAX.
+ *
+ * For floating-point types, resolution is limited; there will actually be
+ * only RAND_MAX different possible values.
+ */
+
+/** Initialize randomizer with a fixed seed.
+ *
+ * After this, unless the randomizer gets seeded again, consecutive calls to
+ * the random functions will return a sequence of pseudo-random numbers
+ * determined by the seed. Every time the randomizer is seeded with this same
+ * seed, it will again start returning the same sequence of numbers.
+ */
+void rand_init(unsigned int);
+
+/** Initialize randomizer based on current time.
+ *
+ * Call this to make the randomizer return hard-to-predict numbers. It won't
+ * produce high-grade randomness, but enough to make the program act
+ * differently on different runs.
+ *
+ * The seed will be based on the current time in seconds. So calling it twice
+ * within the same second will just reset the randomizer to where it was before.
+ * Don't do that.
+ */
+void rand_init();
+
+
+/** Return a pseudorandom number between 0 and RAND_MAX inclusive.
+ *
+ * Initialize (seed) the randomizer before starting to call this.
+ */
+template<typename T> inline T rand();
+
+
+/** Return a pseudorandom number in the half-open interval [bottom, top).
+ *
+ * Generates a value between "bottom" (inclusive) and "top" (exclusive),
+ * assuming that (top - bottom) <= RAND_MAX.
+ */
+template<typename T> inline T rand_excl(T bottom, T top);
+
+
+/** Return a pseudorandom number in the half-open interval [0, top).
+ *
+ * Generates a value between 0 (inclusive) and "top" (exclusive), assuming that
+ * bottom <= RAND_MAX.
+ */
+template<typename T> inline T rand_excl(T top);
+
+
+/** Return a pseudorandom number in the open interval [bottom, top].
+ *
+ * Generates a value between "bottom" and "top" inclusive, assuming that
+ * (top - bottom) < RAND_MAX.
+ */
+template<typename T> inline T rand_incl(T bottom, T top);
+
+
+/** Return a pseudorandom number in the open interval [0, top].
+ *
+ * Generates a value between 0 and "top" inclusive, assuming that
+ * bottom < RAND_MAX.
+ */
+template<typename T> inline T rand_incl(T top);
+
+
+/** Return a pseudorandom number which may be larger than RAND_MAX.
+ *
+ * The requested type must be integral, and its size must be an even multiple
+ * of the size of an int. The return value will combine one or more random
+ * ints into a single value, which could get quite large.
+ *
+ * The result is nonnegative. Because the constituent ints are also
+ * nonnegative, the most significant bit in each of the ints will be zero,
+ * so for a wider type, there will be "gaps" in the range of possible outputs.
+ */
+template<typename T> inline T wide_rand();
+
+/** Return a pseudorandom number in [0, top), not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger top values than an int can represent.
+ */
+template<typename T> inline T wide_rand_excl(T top);
+
+/** Return a pseudorandom number in [bottom, top), not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger value ranges than an int can represent.
+ */
+template<typename T> inline T wide_rand_excl(T bottom, T top);
+
+/** Return a pseudorandom number in [0, top], not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger top values than an int can represent.
+ */
+template<typename T> inline T wide_rand_incl(T top);
+
+/** Return a pseudorandom number in [bottom, top], not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger top values than an int can represent.
+ */
+template<typename T> inline T wide_rand_incl(T bottom, T top);
+
+
+/// Implementation detail. For the random module's internal use only.
+namespace internal
+{
+/// The central call to the randomizer upon which this whole module is built.
+int rand_int();
+
+/// Helper template: customize random values to required ranges.
+template<typename T, bool is_integer_type> struct random_scaler;
+
+/// Specialized random_scaler for integral types.
+template<typename T> struct random_scaler<T, true>
+{
+ static T rnd_excl(T value, T range) { return value % range; }
+ static T rnd_incl(T value, T range) { return value % (range + 1); }
+};
+
+/// Specialized random_scaler for non-integral types.
+template<typename T> struct random_scaler<T, false>
+{
+ static T rnd_excl(T value, T range)
+ {
+ // Promote RAND_MAX to T before adding one to avoid overflow.
+ return range * value / (T(RAND_MAX) + 1);
+ }
+ static T rnd_incl(T value, T range) { return range * value / RAND_MAX; }
+};
+
+/// Helper for filling a wider variable with random ints.
+template<typename T, size_t remaining_ints> struct wide_random_collector
+{
+ static T generate()
+ {
+ T one_int = util::rand<T>() << (8 * sizeof(int));
+ return one_int | wide_random_collector<T, remaining_ints-1>::generate();
+ }
+};
+/// Specialized wide_random_collector for generating just a single int.
+template<typename T> struct wide_random_collector<T, 1>
+{
+ static T generate() { return util::rand<T>(); }
+};
+
+} // namespace internal
+
+
+template<typename T> inline T rand()
+{
+ return T(util::internal::rand_int());
+}
+
+template<typename T> inline T rand_excl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_excl(util::rand<T>(), top);
+}
+
+template<typename T> inline T rand_excl(T bottom, T top)
+{
+ return bottom + rand_excl(top - bottom);
+}
+
+template<typename T> inline T rand_incl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_incl(util::rand<T>(), top);
+}
+
+template<typename T> inline T rand_incl(T bottom, T top)
+{
+ return bottom + rand_incl(top - bottom);
+}
+
+template<typename T> inline T wide_rand()
+{
+ return internal::wide_random_collector<T, sizeof(T)/sizeof(int)>::generate();
+}
+
+template<typename T> inline T wide_rand_excl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_excl(util::wide_rand<T>(), top);
+}
+
+template<typename T> inline T wide_rand_excl(T bottom, T top)
+{
+ return bottom + wide_rand_excl(top - bottom);
+}
+
+template<typename T> inline T wide_rand_incl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_incl(util::wide_rand<T>(), top);
+}
+
+template<typename T> inline T wide_rand_incl(T bottom, T top)
+{
+ return bottom + wide_rand_incl(top - bottom);
+}
+} // namespace util
+
+#endif
diff --git a/util/random_test.cc b/util/random_test.cc
new file mode 100644
index 000000000..6d8981de8
--- /dev/null
+++ b/util/random_test.cc
@@ -0,0 +1,191 @@
+#include <cstdlib>
+
+#include "util/random.hh"
+
+#define BOOST_TEST_MODULE RandomTest
+#include <boost/test/unit_test.hpp>
+
+namespace util
+{
+namespace
+{
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_positive_no_greater_than_RAND_MAX)
+{
+ rand_init();
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand<int>();
+ BOOST_CHECK(random_number >= 0);
+ BOOST_CHECK(random_number <= RAND_MAX);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_different_consecutive_numbers)
+{
+ rand_init(99);
+ const int first = rand<int>(), second = rand<int>(), third = rand<int>();
+ // Sometimes you'll get the same number twice in a row, but generally the
+ // randomizer returns different numbers.
+ BOOST_CHECK(second != first || third != first);
+}
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_different_numbers_for_different_seeds)
+{
+ rand_init(1);
+ const int one1 = rand<int>(), one2 = rand<int>();
+ rand_init(2);
+ const int two1 = rand<int>(), two2 = rand<int>();
+ BOOST_CHECK(two1 != one1 || two2 != one2);
+}
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_same_sequence_for_same_seed)
+{
+ rand_init(1);
+ const int first = rand<int>();
+ rand_init(1);
+ const int second = rand<int>();
+ BOOST_CHECK_EQUAL(first, second);
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_int_returns_number_in_range)
+{
+ const int bottom = 10, top = 50;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_excl(bottom, top);
+ BOOST_CHECK(random_number >= bottom);
+ BOOST_CHECK(random_number < top);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_int_covers_full_range)
+{
+ // The spread of random numbers really goes all the way from 0 (inclusive)
+ // to "top" (exclusive). It's not some smaller subset.
+ // This test will randomly fail sometimes, though very very rarely, when the
+ // random numbers don't actually have enough different values.
+ const int bottom = 1, top = 4;
+ int lowest = 99, highest = -1;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_excl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+
+ BOOST_CHECK_EQUAL(lowest, bottom);
+ BOOST_CHECK_EQUAL(highest, top - 1);
+}
+
+BOOST_AUTO_TEST_CASE(rand_incl_int_returns_number_in_range)
+{
+ const int bottom = 10, top = 50;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_incl(bottom, top);
+ BOOST_CHECK(random_number >= 0);
+ BOOST_CHECK(random_number <= top);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_incl_int_covers_full_range)
+{
+ // The spread of random numbers really goes all the way from 0 to "top"
+ // inclusive. It's not some smaller subset.
+ // This test will randomly fail sometimes, though very very rarely, when the
+ // random numbers don't actually have enough different values.
+ const int bottom = 1, top = 4;
+ int lowest = 99, highest = -1;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_incl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+
+ BOOST_CHECK_EQUAL(lowest, bottom);
+ BOOST_CHECK_EQUAL(highest, top);
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_float_returns_float_in_range)
+{
+ const float bottom = 5, top = 10;
+ for (int i=0; i<100; i++)
+ {
+ const float random_number = rand_excl(bottom, top);
+ BOOST_CHECK(random_number >= bottom);
+ BOOST_CHECK(random_number < top);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_float_returns_different_values)
+{
+ const float bottom = 5, top = 10;
+ float lowest = 99, highest = -1;
+ for (int i=0; i<10; i++)
+ {
+ const float random_number = rand_excl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+ BOOST_CHECK(lowest < highest);
+}
+
+BOOST_AUTO_TEST_CASE(rand_float_incl_returns_float_in_range)
+{
+ const float bottom = 5, top = 10;
+ for (int i=0; i<1000; i++)
+ {
+ const float random_number = rand_excl(bottom, top);
+ BOOST_CHECK(random_number >= bottom);
+ BOOST_CHECK(random_number <= top);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_float_incl_returns_different_values)
+{
+ const float bottom = 0, top = 10;
+ float lowest = 99, highest = -1;
+ for (int i=0; i<10; i++)
+ {
+ const float random_number = rand_excl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+ BOOST_CHECK(lowest < highest);
+}
+
+BOOST_AUTO_TEST_CASE(wide_rand_int_returns_different_numbers_in_range)
+{
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = wide_rand<int>();
+ BOOST_CHECK(random_number >= 0);
+ BOOST_CHECK(random_number <= RAND_MAX);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(wide_rand_long_long_returns_big_numbers)
+{
+ long long one = wide_rand<long long>(), two = wide_rand<long long>();
+ // This test will fail sometimes because of unlucky random numbers, but only
+ // very very rarely.
+ BOOST_CHECK(one > RAND_MAX || two > RAND_MAX);
+}
+
+BOOST_AUTO_TEST_CASE(wide_rand_excl_supports_larger_range)
+{
+ const long long top = 1000 * (long long)RAND_MAX;
+ long long
+ one = wide_rand_excl<long long>(top),
+ two = wide_rand_excl<long long>(top);
+ BOOST_CHECK(one < top);
+ BOOST_CHECK(two < top);
+ // This test will fail sometimes because of unlucky random numbers, but only
+ // very very rarely.
+ BOOST_CHECK(one > RAND_MAX || two > RAND_MAX);
+}
+
+} // namespace
+} // namespace util
diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index 7f240dd7e..504c579c5 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -60,7 +60,7 @@ namespace {
ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed);
-// Completed file that other classes can thunk to.
+// Completed file that other classes can thunk to.
class Complete : public ReadBase {
public:
std::size_t Read(void *, std::size_t, ReadCompressed &) {
@@ -121,7 +121,7 @@ template <class Compression> class StreamCompressed : public ReadBase {
: file_(fd),
in_buffer_(MallocOrThrow(kInputBuffer)),
back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {}
-
+
std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
if (amount == 0) return 0;
back_.SetOutput(to, amount);
@@ -162,8 +162,8 @@ class GZip {
stream_.zfree = Z_NULL;
stream_.opaque = Z_NULL;
stream_.msg = NULL;
- // 32 for zlib and gzip decoding with automatic header detection.
- // 15 for maximum window size.
+ // 32 for zlib and gzip decoding with automatic header detection.
+ // 15 for maximum window size.
UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib.");
}
diff --git a/util/read_compressed.hh b/util/read_compressed.hh
index 9b36f641c..935a49579 100644
--- a/util/read_compressed.hh
+++ b/util/read_compressed.hh
@@ -38,10 +38,10 @@ class ReadBase;
class ReadCompressed {
public:
static const std::size_t kMagicSize = 6;
- // Must have at least kMagicSize bytes.
+ // Must have at least kMagicSize bytes.
static bool DetectCompressedMagic(const void *from);
- // Takes ownership of fd.
+ // Takes ownership of fd.
explicit ReadCompressed(int fd);
// Try to avoid using this. Use the fd instead.
@@ -53,7 +53,7 @@ class ReadCompressed {
~ReadCompressed();
- // Takes ownership of fd.
+ // Takes ownership of fd.
void Reset(int fd);
// Same advice as the constructor.
@@ -74,7 +74,7 @@ class ReadCompressed {
uint64_t raw_amount_;
- // No copying.
+ // No copying.
ReadCompressed(const ReadCompressed &);
void operator=(const ReadCompressed &);
};
diff --git a/util/scoped.hh b/util/scoped.hh
index 60c36c36a..c347a43cc 100644
--- a/util/scoped.hh
+++ b/util/scoped.hh
@@ -86,7 +86,7 @@ class scoped_malloc : public scoped_c<void, std::free> {
struct scoped_delete_array_forward {
template <class T> static void Close(T *p) { delete [] p; }
};
-// Hat tip to boost.
+// Hat tip to boost.
template <class T> class scoped_array : public scoped<T, scoped_delete_array_forward> {
public:
explicit scoped_array(T *p = NULL) : scoped<T, scoped_delete_array_forward>(p) {}
diff --git a/util/sorted_uniform.hh b/util/sorted_uniform.hh
index 3673a8b5d..ddd2b3f2a 100644
--- a/util/sorted_uniform.hh
+++ b/util/sorted_uniform.hh
@@ -22,7 +22,7 @@ struct Pivot64 {
}
};
-// Use when off * width is <2^64. This is guaranteed when each of them is actually a 32-bit value.
+// Use when off * width is <2^64. This is guaranteed when each of them is actually a 32-bit value.
struct Pivot32 {
static inline std::size_t Calc(uint64_t off, uint64_t range, uint64_t width) {
return static_cast<std::size_t>((off * width) / (range + 1));
@@ -56,7 +56,7 @@ template <class Iterator, class Accessor> bool BinaryFind(
return false;
}
-// Search the range [before_it + 1, after_it - 1] for key.
+// Search the range [before_it + 1, after_it - 1] for key.
// Preconditions:
// before_v <= key <= after_v
// before_v <= all values in the range [before_it + 1, after_it - 1] <= after_v
@@ -90,7 +90,7 @@ template <class Iterator, class Accessor, class Pivot> bool SortedUniformFind(co
if (key == below) { out = begin; return true; }
return false;
}
- // Make the range [begin, end].
+ // Make the range [begin, end].
--end;
typename Accessor::Key above(accessor(end));
if (key >= above) {
diff --git a/util/sorted_uniform_test.cc b/util/sorted_uniform_test.cc
index d9f6fad1e..39f05e57e 100644
--- a/util/sorted_uniform_test.cc
+++ b/util/sorted_uniform_test.cc
@@ -87,7 +87,7 @@ template <class Key> void RandomTest(Key upper, size_t entries, size_t queries)
}
std::sort(backing.begin(), backing.end());
- // Random queries.
+ // Random queries.
for (size_t i = 0; i < queries; ++i) {
const Key key = gen_key();
Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, key);
diff --git a/util/stream/block.hh b/util/stream/block.hh
index aa7e28bb1..6a70dba3e 100644
--- a/util/stream/block.hh
+++ b/util/stream/block.hh
@@ -12,13 +12,13 @@ namespace stream {
*/
class Block {
public:
-
- /**
- * Constructs an empty block.
+
+ /**
+ * Constructs an empty block.
*/
Block() : mem_(NULL), valid_size_(0) {}
- /**
+ /**
* Constructs a block that encapsulates a segment of memory.
*
* @param[in] mem The segment of memory to encapsulate
@@ -33,9 +33,9 @@ class Block {
*/
void SetValidSize(std::size_t to) { valid_size_ = to; }
- /**
+ /**
* Gets the number of bytes in this block that should be interpreted as valid.
- * This is important because read might fill in less than Allocated at EOF.
+ * This is important because read might fill in less than Allocated at EOF.
*/
std::size_t ValidSize() const { return valid_size_; }
@@ -45,34 +45,34 @@ class Block {
/** Gets a const void pointer to the memory underlying this block. */
const void *Get() const { return mem_; }
-
+
/**
* Gets a const void pointer to the end of the valid section of memory
* encapsulated by this block.
*/
- const void *ValidEnd() const {
+ const void *ValidEnd() const {
return reinterpret_cast<const uint8_t*>(mem_) + valid_size_;
}
/**
* Returns true if this block encapsulates a valid (non-NULL) block of memory.
- *
+ *
* This method is a user-defined implicit conversion function to boolean;
- * among other things, this method enables bare instances of this class
+ * among other things, this method enables bare instances of this class
* to be used as the condition of an if statement.
*/
operator bool() const { return mem_ != NULL; }
-
+
/**
* Returns true if this block is empty.
- *
+ *
* In other words, if Get()==NULL, this method will return true.
*/
bool operator!() const { return mem_ == NULL; }
-
+
private:
friend class Link;
-
+
/**
* Points this block's memory at NULL.
*
diff --git a/util/stream/chain.cc b/util/stream/chain.cc
index 7b68400a8..39f2f3fbb 100644
--- a/util/stream/chain.cc
+++ b/util/stream/chain.cc
@@ -37,7 +37,7 @@ Chain::Chain(const ChainConfig &config) : config_(config), complete_called_(fals
UTIL_THROW_IF(!config.entry_size, ChainConfigException, "zero-size entries.");
UTIL_THROW_IF(!config.block_count, ChainConfigException, "block count zero");
UTIL_THROW_IF(config.total_memory < config.entry_size * config.block_count, ChainConfigException, config.total_memory << " total memory, too small for " << config.block_count << " blocks of containing entries of size " << config.entry_size);
- // Round down block size to a multiple of entry size.
+ // Round down block size to a multiple of entry size.
block_size_ = config.total_memory / (config.block_count * config.entry_size) * config.entry_size;
}
@@ -65,7 +65,7 @@ Chain &Chain::operator>>(const PWriteAndRecycle &writer) {
void Chain::Wait(bool release_memory) {
if (queues_.empty()) {
assert(threads_.empty());
- return; // Nothing to wait for.
+ return; // Nothing to wait for.
}
if (!complete_called_) CompleteLoop();
threads_.clear();
@@ -84,15 +84,15 @@ void Chain::Wait(bool release_memory) {
void Chain::Start() {
Wait(false);
if (!memory_.get()) {
- // Allocate memory.
+ // Allocate memory.
assert(threads_.empty());
assert(queues_.empty());
std::size_t malloc_size = block_size_ * config_.block_count;
memory_.reset(MallocOrThrow(malloc_size));
}
- // This queue can accomodate all blocks.
+ // This queue can accomodate all blocks.
queues_.push_back(new PCQueue<Block>(config_.block_count));
- // Populate the lead queue with blocks.
+ // Populate the lead queue with blocks.
uint8_t *base = static_cast<uint8_t*>(memory_.get());
for (std::size_t i = 0; i < config_.block_count; ++i) {
queues_.front().Produce(Block(base, block_size_));
@@ -124,7 +124,7 @@ Link::Link(const ChainPosition &position) : in_(NULL) {
Link::~Link() {
if (current_) {
- // Probably an exception unwinding.
+ // Probably an exception unwinding.
std::cerr << "Last input should have been poison." << std::endl;
// abort();
} else {
diff --git a/util/stream/chain.hh b/util/stream/chain.hh
index 28dc8060e..0cd8c2aae 100644
--- a/util/stream/chain.hh
+++ b/util/stream/chain.hh
@@ -23,10 +23,10 @@ class ChainConfigException : public Exception {
};
class Chain;
-
+
/**
* Encapsulates a @ref PCQueue "producer queue" and a @ref PCQueue "consumer queue" within a @ref Chain "chain".
- *
+ *
* Specifies position in chain for Link constructor.
*/
class ChainPosition {
@@ -35,7 +35,7 @@ class ChainPosition {
private:
friend class Chain;
friend class Link;
- ChainPosition(PCQueue<Block> &in, PCQueue<Block> &out, Chain *chain, MultiProgress &progress)
+ ChainPosition(PCQueue<Block> &in, PCQueue<Block> &out, Chain *chain, MultiProgress &progress)
: in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {}
PCQueue<Block> *in_, *out_;
@@ -45,7 +45,7 @@ class ChainPosition {
WorkerProgress progress_;
};
-
+
/**
* Encapsulates a worker thread processing data at a given position in the chain.
*
@@ -53,7 +53,7 @@ class ChainPosition {
*/
class Thread {
public:
-
+
/**
* Constructs a new Thread in which the provided Worker is Run().
*
@@ -102,7 +102,7 @@ class Recycler {
extern const Recycler kRecycle;
class WriteAndRecycle;
class PWriteAndRecycle;
-
+
/**
* Represents a sequence of workers, through which @ref Block "blocks" can pass.
*/
@@ -113,10 +113,10 @@ class Chain {
};
public:
-
- /**
+
+ /**
* Constructs a configured Chain.
- *
+ *
* @param config Specifies how to configure the Chain.
*/
explicit Chain(const ChainConfig &config);
@@ -146,7 +146,7 @@ class Chain {
std::size_t EntrySize() const {
return config_.entry_size;
}
-
+
/**
* Gets the inital @ref Block::ValidSize "valid size" for @ref Block "blocks" in this chain.
*
@@ -159,10 +159,10 @@ class Chain {
/** Two ways to add to the chain: Add() or operator>>. */
ChainPosition Add();
- /**
+ /**
* Adds a new worker to this chain,
* and runs that worker in a new Thread owned by this chain.
- *
+ *
* The worker must have a Run method that accepts a position argument.
*
* @see Thread::operator()()
@@ -173,10 +173,10 @@ class Chain {
return *this;
}
- /**
+ /**
* Adds a new worker to this chain (but avoids copying that worker),
* and runs that worker in a new Thread owned by this chain.
- *
+ *
* The worker must have a Run method that accepts a position argument.
*
* @see Thread::operator()()
@@ -187,14 +187,14 @@ class Chain {
return *this;
}
- // Note that Link and Stream also define operator>> outside this class.
+ // Note that Link and Stream also define operator>> outside this class.
- // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor.
+ // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor.
void CompleteLoop() {
threads_.push_back(new Thread(Complete(), kRecycle));
}
- /**
+ /**
* Adds a Recycler worker to this chain,
* and runs that worker in a new Thread owned by this chain.
*/
@@ -203,17 +203,17 @@ class Chain {
return *this;
}
- /**
+ /**
* Adds a WriteAndRecycle worker to this chain,
* and runs that worker in a new Thread owned by this chain.
*/
Chain &operator>>(const WriteAndRecycle &writer);
Chain &operator>>(const PWriteAndRecycle &writer);
- // Chains are reusable. Call Wait to wait for everything to finish and free memory.
+ // Chains are reusable. Call Wait to wait for everything to finish and free memory.
void Wait(bool release_memory = true);
- // Waits for the current chain to complete (if any) then starts again.
+ // Waits for the current chain to complete (if any) then starts again.
void Start();
bool Running() const { return !queues_.empty(); }
@@ -237,29 +237,29 @@ class Chain {
};
// Create the link in the worker thread using the position token.
-/**
+/**
* Represents a C++ style iterator over @ref Block "blocks".
*/
class Link {
public:
-
+
// Either default construct and Init or just construct all at once.
-
+
/**
* Constructs an @ref Init "initialized" link.
*
* @see Init
*/
explicit Link(const ChainPosition &position);
-
- /**
- * Constructs a link that must subsequently be @ref Init "initialized".
+
+ /**
+ * Constructs a link that must subsequently be @ref Init "initialized".
*
* @see Init
*/
Link();
-
- /**
+
+ /**
* Initializes the link with the input @ref PCQueue "consumer queue" and output @ref PCQueue "producer queue" at a given @ref ChainPosition "position" in the @ref Chain "chain".
*
* @see Link()
@@ -269,7 +269,7 @@ class Link {
/**
* Destructs the link object.
*
- * If necessary, this method will pass a poison block
+ * If necessary, this method will pass a poison block
* to this link's output @ref PCQueue "producer queue".
*
* @see Block::SetToPoison()
@@ -290,7 +290,7 @@ class Link {
* Gets a pointer to the @ref Block "block" at this link.
*/
Block *operator->() { return &current_; }
-
+
/**
* Gets a const pointer to the @ref Block "block" at this link.
*/
@@ -303,25 +303,25 @@ class Link {
/**
* Returns true if the @ref Block "block" at this link encapsulates a valid (non-NULL) block of memory.
- *
+ *
* This method is a user-defined implicit conversion function to boolean;
- * among other things, this method enables bare instances of this class
+ * among other things, this method enables bare instances of this class
* to be used as the condition of an if statement.
*/
operator bool() const { return current_; }
- /**
+ /**
* @ref Block::SetToPoison() "Poisons" the @ref Block "block" at this link,
* and passes this now-poisoned block to this link's output @ref PCQueue "producer queue".
*
* @see Block::SetToPoison()
*/
void Poison();
-
+
private:
Block current_;
PCQueue<Block> *in_, *out_;
-
+
bool poisoned_;
WorkerProgress progress_;
diff --git a/util/stream/config.hh b/util/stream/config.hh
index 6bad36bc5..e94cf3487 100644
--- a/util/stream/config.hh
+++ b/util/stream/config.hh
@@ -10,11 +10,11 @@ namespace util { namespace stream {
* Represents how a chain should be configured.
*/
struct ChainConfig {
-
+
/** Constructs an configuration with underspecified (or default) parameters. */
ChainConfig() {}
- /**
+ /**
* Constructs a chain configuration object.
*
* @param [in] in_entry_size Number of bytes in each record.
@@ -29,26 +29,26 @@ struct ChainConfig {
* Number of bytes in each record.
*/
std::size_t entry_size;
-
+
/**
* Number of blocks in the chain.
*/
std::size_t block_count;
-
- /**
+
+ /**
* Total number of bytes available to the chain.
* This value will be divided amongst the blocks in the chain.
- * Chain's constructor will make this a multiple of entry_size.
+ * Chain's constructor will make this a multiple of entry_size.
*/
std::size_t total_memory;
};
-
+
/**
* Represents how a sorter should be configured.
*/
struct SortConfig {
-
+
/** Filename prefix where temporary files should be placed. */
std::string temp_prefix;
diff --git a/util/stream/io.cc b/util/stream/io.cc
index fa8467abd..c272d779c 100644
--- a/util/stream/io.cc
+++ b/util/stream/io.cc
@@ -16,7 +16,7 @@ void Read::Run(const ChainPosition &position) {
const std::size_t entry_size = position.GetChain().EntrySize();
for (Link link(position); link; ++link) {
std::size_t got = util::ReadOrEOF(file_, link->Get(), block_size);
- UTIL_THROW_IF(got % entry_size, ReadSizeException, "File ended with " << got << " bytes, not a multiple of " << entry_size << ".");
+ UTIL_THROW_IF(got % entry_size, ReadSizeException, "File ended with " << got << " bytes, not a multiple of " << entry_size << ".");
if (got == 0) {
link.Poison();
return;
diff --git a/util/stream/io.hh b/util/stream/io.hh
index 8dae2cbff..c3b53bbfe 100644
--- a/util/stream/io.hh
+++ b/util/stream/io.hh
@@ -18,12 +18,12 @@ class ReadSizeException : public util::Exception {
class Read {
public:
explicit Read(int fd) : file_(fd) {}
- void Run(const ChainPosition &position);
+ void Run(const ChainPosition &position);
private:
int file_;
};
-// Like read but uses pread so that the file can be accessed from multiple threads.
+// Like read but uses pread so that the file can be accessed from multiple threads.
class PRead {
public:
explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {}
@@ -60,7 +60,7 @@ class PWriteAndRecycle {
};
-// Reuse the same file over and over again to buffer output.
+// Reuse the same file over and over again to buffer output.
class FileBuffer {
public:
explicit FileBuffer(int fd) : file_(fd) {}
diff --git a/util/stream/line_input.cc b/util/stream/line_input.cc
index dafa50207..0ad8800f6 100644
--- a/util/stream/line_input.cc
+++ b/util/stream/line_input.cc
@@ -14,7 +14,7 @@ void LineInput::Run(const ChainPosition &position) {
ReadCompressed reader(fd_);
// Holding area for beginning of line to be placed in next block.
std::vector<char> carry;
-
+
for (Link block(position); ; ++block) {
char *to = static_cast<char*>(block->Get());
char *begin = to;
@@ -39,7 +39,7 @@ void LineInput::Run(const ChainPosition &position) {
UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ". Is this a text file?");
if (*newline == '\n') break;
}
-
+
// Copy everything after the last newline to the carry.
carry.clear();
carry.resize(to - (newline + 1));
diff --git a/util/stream/multi_progress.cc b/util/stream/multi_progress.cc
index 7d6a6a73a..59750f516 100644
--- a/util/stream/multi_progress.cc
+++ b/util/stream/multi_progress.cc
@@ -31,9 +31,9 @@ MultiProgress::~MultiProgress() {
}
void MultiProgress::Activate() {
- active_ =
+ active_ =
#if !defined(_WIN32) && !defined(_WIN64)
- // Is stderr a terminal?
+ // Is stderr a terminal?
(isatty(2) == 1)
#else
true
diff --git a/util/stream/multi_progress.hh b/util/stream/multi_progress.hh
index 41d40075a..f9e6423e3 100644
--- a/util/stream/multi_progress.hh
+++ b/util/stream/multi_progress.hh
@@ -38,7 +38,7 @@ class MultiProgress {
boost::mutex mutex_;
- // \0 at the end.
+ // \0 at the end.
char display_[kWidth + 1];
std::size_t character_handout_;
@@ -49,10 +49,10 @@ class MultiProgress {
class WorkerProgress {
public:
- // Default contrutor must be initialized with operator= later.
+ // Default contrutor must be initialized with operator= later.
WorkerProgress() : parent_(NULL) {}
- // Not threadsafe for the same worker by default.
+ // Not threadsafe for the same worker by default.
WorkerProgress &operator++() {
if (++current_ >= next_) {
parent_->Milestone(*this);
@@ -70,17 +70,17 @@ class WorkerProgress {
private:
friend class MultiProgress;
- WorkerProgress(uint64_t next, MultiProgress &parent, char character)
+ WorkerProgress(uint64_t next, MultiProgress &parent, char character)
: current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {}
uint64_t current_, next_;
MultiProgress *parent_;
- // Previous milestone reached.
+ // Previous milestone reached.
unsigned char stone_;
- // Character to display in bar.
+ // Character to display in bar.
char character_;
};
diff --git a/util/stream/multi_stream.hh b/util/stream/multi_stream.hh
index 2772a7081..b1461f964 100644
--- a/util/stream/multi_stream.hh
+++ b/util/stream/multi_stream.hh
@@ -50,7 +50,7 @@ class Chains : public util::FixedArray<util::stream::Chain> {
}
Chains &operator>>(const util::stream::Recycler &recycler) {
- for (util::stream::Chain *i = begin(); i != end(); ++i)
+ for (util::stream::Chain *i = begin(); i != end(); ++i)
*i >> recycler;
return *this;
}
diff --git a/util/stream/sort.hh b/util/stream/sort.hh
index 9082cfdde..a1e0a8539 100644
--- a/util/stream/sort.hh
+++ b/util/stream/sort.hh
@@ -3,16 +3,16 @@
* Chain(config) >> Read(file) >> sorter.Unsorted();
* Stream stream;
* Chain chain(config) >> sorter.Sorted(internal_config, lazy_config) >> stream;
- *
- * Note that sorter must outlive any threads that use Unsorted or Sorted.
+ *
+ * Note that sorter must outlive any threads that use Unsorted or Sorted.
*
* Combiners take the form:
* bool operator()(void *into, const void *option, const Compare &compare) const
* which returns true iff a combination happened. The sorting algorithm
- * guarantees compare(into, option). But it does not guarantee
- * compare(option, into).
+ * guarantees compare(into, option). But it does not guarantee
+ * compare(option, into).
* Currently, combining is only done in merge steps, not during on-the-fly
- * sort. Use a hash table for that.
+ * sort. Use a hash table for that.
*/
#ifndef UTIL_STREAM_SORT_H
@@ -37,12 +37,12 @@ namespace util {
namespace stream {
struct NeverCombine {
- template <class Compare> bool operator()(const void *, const void *, const Compare &) const {
+ template <class Compare> bool operator()(const void *, const void *, const Compare &) const {
return false;
}
};
-// Manage the offsets of sorted blocks in a file.
+// Manage the offsets of sorted blocks in a file.
class Offsets {
public:
explicit Offsets(int fd) : log_(fd) {
@@ -150,7 +150,7 @@ template <class Compare> class MergeQueue {
}
private:
- // Priority queue contains these entries.
+ // Priority queue contains these entries.
class Entry {
public:
Entry() {}
@@ -195,7 +195,7 @@ template <class Compare> class MergeQueue {
uint64_t remaining_, offset_;
};
- // Wrapper comparison function for queue entries.
+ // Wrapper comparison function for queue entries.
class Greater : public std::binary_function<const Entry &, const Entry &, bool> {
public:
explicit Greater(const Compare &compare) : compare_(compare) {}
@@ -217,10 +217,10 @@ template <class Compare> class MergeQueue {
};
/* A worker object that merges. If the number of pieces to merge exceeds the
- * arity, it outputs multiple sorted blocks, recording to out_offsets.
+ * arity, it outputs multiple sorted blocks, recording to out_offsets.
* However, users will only every see a single sorted block out output because
* Sort::Sorted insures the arity is higher than the number of pieces before
- * returning this.
+ * returning this.
*/
template <class Compare, class Combine> class MergingReader {
public:
@@ -235,7 +235,7 @@ template <class Compare, class Combine> class MergingReader {
}
void Run(const ChainPosition &position, bool assert_one) {
- // Special case: nothing to read.
+ // Special case: nothing to read.
if (!in_offsets_->RemainingBlocks()) {
Link l(position);
l.Poison();
@@ -267,7 +267,7 @@ template <class Compare, class Combine> class MergingReader {
// Populate queue.
MergeQueue<Compare> queue(in_, per_buffer, entry_size, compare_);
- for (uint8_t *buf = static_cast<uint8_t*>(buffer.get());
+ for (uint8_t *buf = static_cast<uint8_t*>(buffer.get());
in_offsets_->RemainingBlocks() && (buf + std::min(per_buffer, in_offsets_->PeekSize()) <= buffer_end);) {
uint64_t offset = in_offsets_->TotalOffset();
uint64_t size = in_offsets_->NextSize();
@@ -285,7 +285,7 @@ template <class Compare, class Combine> class MergingReader {
}
uint64_t written = 0;
- // Merge including combiner support.
+ // Merge including combiner support.
memcpy(str.Get(), queue.Top(), entry_size);
for (queue.Pop(); !queue.Empty(); queue.Pop()) {
if (!combine_(str.Get(), queue.Top(), compare_)) {
@@ -300,9 +300,9 @@ template <class Compare, class Combine> class MergingReader {
str.Poison();
}
- private:
+ private:
void ReadSingle(uint64_t offset, const uint64_t size, const ChainPosition &position) {
- // Special case: only one to read.
+ // Special case: only one to read.
const uint64_t end = offset + size;
const uint64_t block_size = position.GetChain().BlockSize();
Link l(position);
@@ -315,7 +315,7 @@ template <class Compare, class Combine> class MergingReader {
(++l).Poison();
return;
}
-
+
Compare compare_;
Combine combine_;
@@ -326,17 +326,17 @@ template <class Compare, class Combine> class MergingReader {
private:
Offsets *out_offsets_;
-
+
std::size_t buffer_size_;
std::size_t total_memory_;
};
-// The lazy step owns the remaining files. This keeps track of them.
+// The lazy step owns the remaining files. This keeps track of them.
template <class Compare, class Combine> class OwningMergingReader : public MergingReader<Compare, Combine> {
private:
typedef MergingReader<Compare, Combine> P;
public:
- OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine)
+ OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine)
: P(data, NULL, NULL, buffer, lazy, compare, combine),
data_(data),
offsets_(offsets) {}
@@ -353,7 +353,7 @@ template <class Compare, class Combine> class OwningMergingReader : public Mergi
Offsets offsets_;
};
-// Don't use this directly. Worker that sorts blocks.
+// Don't use this directly. Worker that sorts blocks.
template <class Compare> class BlockSorter {
public:
BlockSorter(Offsets &offsets, const Compare &compare) :
@@ -362,7 +362,7 @@ template <class Compare> class BlockSorter {
void Run(const ChainPosition &position) {
const std::size_t entry_size = position.GetChain().EntrySize();
for (Link link(position); link; ++link) {
- // Record the size of each block in a separate file.
+ // Record the size of each block in a separate file.
offsets_->Append(link->ValidSize());
void *end = static_cast<uint8_t*>(link->Get()) + link->ValidSize();
#if defined(_WIN32) || defined(_WIN64)
@@ -399,7 +399,7 @@ template <class Compare, class Combine = NeverCombine> class Sort {
compare_(compare), combine_(combine),
entry_size_(in.EntrySize()) {
UTIL_THROW_IF(!entry_size_, BadSortConfig, "Sorting entries of size 0");
- // Make buffer_size a multiple of the entry_size.
+ // Make buffer_size a multiple of the entry_size.
config_.buffer_size -= config_.buffer_size % entry_size_;
UTIL_THROW_IF(!config_.buffer_size, BadSortConfig, "Sort buffer too small");
UTIL_THROW_IF(config_.total_memory < config_.buffer_size * 4, BadSortConfig, "Sorting memory " << config_.total_memory << " is too small for four buffers (two read and two write).");
@@ -429,7 +429,7 @@ template <class Compare, class Combine = NeverCombine> class Sort {
Offsets offsets2(offsets2_file.get());
Offsets *offsets_in = &offsets_, *offsets_out = &offsets2;
- // Double buffered writing.
+ // Double buffered writing.
ChainConfig chain_config;
chain_config.entry_size = entry_size_;
chain_config.block_count = 2;
@@ -472,7 +472,7 @@ template <class Compare, class Combine = NeverCombine> class Sort {
}
// Output to chain, using this amount of memory, maximum, for lazy merge
- // sort.
+ // sort.
void Output(Chain &out, std::size_t lazy_memory) {
Merge(lazy_memory);
out.SetProgressTarget(Size());
@@ -483,15 +483,15 @@ template <class Compare, class Combine = NeverCombine> class Sort {
/* If a pipeline step is reading sorted input and writing to a different
* sort order, then there's a trade-off between using RAM to read lazily
- * (avoiding copying the file) and using RAM to increase block size and,
+ * (avoiding copying the file) and using RAM to increase block size and,
* therefore, decrease the number of merge sort passes in the next
- * iteration.
- *
+ * iteration.
+ *
* Merge sort takes log_{arity}(pieces) passes. Thus, each time the chain
* block size is multiplied by arity, the number of output passes decreases
* by one. Up to a constant, then, log_{arity}(chain) is the number of
* passes saved. Chain simply divides the memory evenly over all blocks.
- *
+ *
* Lazy sort saves this many passes (up to a constant)
* log_{arity}((memory-lazy)/block_count) + 1
* Non-lazy sort saves this many passes (up to the same constant):
@@ -535,7 +535,7 @@ template <class Compare, class Combine = NeverCombine> class Sort {
const std::size_t entry_size_;
};
-// returns bytes to be read on demand.
+// returns bytes to be read on demand.
template <class Compare, class Combine> uint64_t BlockingSort(Chain &chain, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = NeverCombine()) {
Sort<Compare, Combine> sorter(chain, config, compare, combine);
chain.Wait(true);
diff --git a/util/stream/sort_test.cc b/util/stream/sort_test.cc
index fd7705cd9..fc97ffdbf 100644
--- a/util/stream/sort_test.cc
+++ b/util/stream/sort_test.cc
@@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE(FromShuffled) {
shuffled.push_back(i);
}
std::random_shuffle(shuffled.begin(), shuffled.end());
-
+
ChainConfig config;
config.entry_size = 8;
config.total_memory = 800;
diff --git a/util/stream/stream.hh b/util/stream/stream.hh
index 0e37f5d51..ee1e9fa83 100644
--- a/util/stream/stream.hh
+++ b/util/stream/stream.hh
@@ -37,7 +37,7 @@ class Stream : boost::noncopyable {
++block_it_;
block_it_.Poison();
}
-
+
Stream &operator++() {
assert(*this);
assert(current_ < end_);
diff --git a/util/stream/timer.hh b/util/stream/timer.hh
index 06488a17e..9e9573d15 100644
--- a/util/stream/timer.hh
+++ b/util/stream/timer.hh
@@ -10,7 +10,7 @@
#define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str))
#else
//#warning Using Boost older than 1.48. Timing information will not be available.*/
-#define UTIL_TIMER(str)
+#define UTIL_TIMER(str)
//#endif
#endif // UTIL_STREAM_TIMER_H
diff --git a/util/thread_pool.hh b/util/thread_pool.hh
index d385b1a74..dce987c40 100644
--- a/util/thread_pool.hh
+++ b/util/thread_pool.hh
@@ -48,7 +48,7 @@ template <class HandlerT> class Worker : boost::noncopyable {
PCQueue<Request> &in_;
boost::optional<Handler> handler_;
-
+
const Request poison_;
boost::thread thread_;
diff --git a/util/tokenize_piece.hh b/util/tokenize_piece.hh
index 8621705e8..9da5fa3c8 100644
--- a/util/tokenize_piece.hh
+++ b/util/tokenize_piece.hh
@@ -127,7 +127,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
} else {
after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size());
}
- } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
+ } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
}
bool equal(const TokenIter<Find, SkipEmpty> &other) const {
diff --git a/util/usage.cc b/util/usage.cc
index bbb209306..f2b661014 100644
--- a/util/usage.cc
+++ b/util/usage.cc
@@ -12,7 +12,7 @@
#include <ctime>
#if defined(_WIN32) || defined(_WIN64)
// This code lifted from physmem.c in gnulib. See the copyright statement
-// below.
+// below.
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
/* MEMORYSTATUSEX is missing from older windows headers, so define
@@ -256,14 +256,14 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {
std::string throwaway;
UTIL_THROW_IF_ARG(stream >> throwaway, SizeParseError, (arg), "because there was more cruft " << throwaway << " after the number.");
- // Silly sort, using kilobytes as your default unit.
+ // Silly sort, using kilobytes as your default unit.
if (after.empty()) after = "K";
if (after == "%") {
uint64_t mem = GuessPhysicalMemory();
UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined.");
return static_cast<uint64_t>(static_cast<double>(value) * static_cast<double>(mem) / 100.0);
}
-
+
std::string units("bKMGTPEZY");
std::string::size_type index = units.find(after[0]);
UTIL_THROW_IF_ARG(index == std::string::npos, SizeParseError, (arg), "the allowed suffixes are " << units << "%.");