#include <vector>

#include <boost/functional/hash.hpp>

#include "BilingualLM.h"
#include "moses/ScoreComponentCollection.h"

using namespace std;

namespace Moses
{

////////////////////////////////////////////////////////////////
BilingualLM::BilingualLM(const std::string &line)
  : StatefulFeatureFunction(1, line),
    word_factortype(0)
{
  // Factor factory to use for BOS_ and EOS_.
  FactorCollection& factorFactory = FactorCollection::Instance();
  BOS_factor = factorFactory.AddFactor(BOS_);
  BOS_word.SetFactor(0, BOS_factor);
  EOS_factor = factorFactory.AddFactor(EOS_);
  EOS_word.SetFactor(0, EOS_factor);
}

void BilingualLM::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  ReadParameters();
  loadModel();
}

// Populates words with up to amount word IDs from the target phrases of previous
// hypotheses, where words[0] is the last word of the previous hypothesis,
// words[1] the second-to-last, etc. If fewer words are available, the remainder
// is padded with BOS.
void BilingualLM::requestPrevTargetNgrams(
  const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const
{
  const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
  int found = 0;

  while (prev_hyp && found != amount) {
    const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
    for (int i = currTargetPhrase.GetSize() - 1; i > -1; i--) {
      if (found != amount) {
        const Word& word = currTargetPhrase.GetWord(i);
        words[found] = getNeuralLMId(word, false);
        found++;
      } else {
        return; // We have everything we need.
      }
    }
    prev_hyp = prev_hyp->GetPrevHypo();
  }

  int neuralLM_wordID = getNeuralLMId(BOS_word, false);
  for (int i = found; i < amount; i++) {
    words[i] = neuralLM_wordID;
  }
}

// Populates the words vector with the target_ngrams-word history plus the
// current word we are looking at (in effect target_ngrams + 1 entries).
void BilingualLM::getTargetWords(
  const Hypothesis &cur_hypo,
  const TargetPhrase &targetPhrase,
  int current_word_index,
  std::vector<int> &words) const
{
  // Check whether we need to look at previous target phrases.
  int additional_needed = current_word_index - target_ngrams;
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
    for (int i = additional_needed - 1; i >= 0; i--) {
      words.push_back(prev_words[i]);
    }
  }

  if (words.size() > 0) {
    // We have added some words from previous phrases;
    // just add words until we reach current_word_index.
    for (int i = 0; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word, false));
    }
  } else {
    // We haven't added any words; the whole history fits in this phrase.
    for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word, false));
    }
  }
}

// Selects the middle alignment link (rounding down) from a set of source positions.
size_t BilingualLM::selectMiddleAlignment(
  const std::set<size_t>& alignment_links) const
{
  std::set<size_t>::const_iterator it = alignment_links.begin();
  for (size_t i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
    ++it;
  }
  return *it;
}

// Returns source words in the way NeuralLM expects them.
void BilingualLM::getSourceWords(
  const TargetPhrase &targetPhrase,
  int targetWordIdx,
  const Sentence &source_sent,
  const Range &sourceWordRange,
  std::vector<int> &words) const
{
  // Get the source context: first get the alignment for the word we require.
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();

  // We are getting the word alignment for targetPhrase.GetWord(i + target_ngrams - 1)
  // according to the paper. Find the closest target word with alignment links.
  std::set<size_t> last_word_al;
  for (int j = 0; j < targetPhrase.GetSize(); j++) {
    // Find the nearest aligned word, with preference for the right.
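    // Probe order illustration (added note): for targetWordIdx = 2 in a 5-word
    // phrase the positions tried are 2, 3, 1, 4, 0, i.e. the word itself first
    // (j = 0 checks it twice), then at each distance the right neighbour
    // before the left one.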
    if ((targetWordIdx + j) < targetPhrase.GetSize()) {
      last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx + j);
      if (!last_word_al.empty()) {
        break;
      }
    }
    // We couldn't find an aligned word on the right; try to the left.
    if ((targetWordIdx - j) >= 0) {
      last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx - j);
      if (!last_word_al.empty()) {
        break;
      }
    }
  }

  // Assume we have found some alignment here. If we couldn't get one from the
  // routine above, it means that none of the words in the target phrase is
  // aligned to any word in the source phrase.

  // Now we get the source words. First, select the middle alignment.
  // It should never be the case that the alignment set is empty, but this has
  // happened several times because of a corrupt phrase table. It is best to
  // have this check here, as it makes debugging the problem a lot easier.
  UTIL_THROW_IF2(last_word_al.size() == 0,
                 "A target phrase with no alignments detected! " << targetPhrase
                 << " Check if there is something wrong with your phrase table.");
  size_t source_center_index = selectMiddleAlignment(last_word_al);

  // We have found the alignment. Now determine how much to shift by to get
  // the actual source word index.
  size_t phrase_start_pos = sourceWordRange.GetStartPos();
  // Account for how far the current word is from the start of the phrase.
  size_t source_word_mid_idx = phrase_start_pos + source_center_index;

  appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
}

size_t BilingualLM::getState(const Hypothesis& cur_hypo) const
{
  const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
  size_t hashCode = 0;

  // Check whether we need to look at previous target phrases.
  int additional_needed = targetPhrase.GetSize() - target_ngrams;
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
    for (int i = additional_needed - 1; i >= 0; i--) {
      boost::hash_combine(hashCode, prev_words[i]);
    }
    // Hash the rest of the words from the current target phrase.
    for (int i = 0; i < targetPhrase.GetSize(); i++) {
      const Word& word = targetPhrase.GetWord(i);
      int neuralLM_wordID = getNeuralLMId(word, false);
      boost::hash_combine(hashCode, neuralLM_wordID);
    }
  } else {
    // We just need the last target_ngrams words from the current target phrase.
    for (int i = targetPhrase.GetSize() - target_ngrams; i < targetPhrase.GetSize(); i++) {
      const Word& word = targetPhrase.GetWord(i);
      int neuralLM_wordID = getNeuralLMId(word, false);
      boost::hash_combine(hashCode, neuralLM_wordID);
    }
  }

  return hashCode;
}

FFState* BilingualLM::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  Manager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  // Init vectors.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams);

  float value = 0;
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
  // Source word range, used to calculate the offsets.
  const Range& sourceWordRange = cur_hypo.GetCurrSourceWordsRange();

  // For each word in the current target phrase get its LM score.
  for (int i = 0; i < currTargetPhrase.GetSize(); i++) {
    getSourceWords(currTargetPhrase, i, source_sent, sourceWordRange, source_words);
    getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
    value += Score(source_words, target_words);

    // Clear the vectors; clear() keeps the capacity reserved above, so no
    // per-word reallocation occurs.
    source_words.clear();
    target_words.clear();
  }
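  // Note: the returned state hash covers only the last target_ngrams word IDs
  // (see getState above), so the decoder can recombine hypotheses whose
  // BilingualLM n-gram context is identical.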
  size_t new_state = getState(cur_hypo);
  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state);
}

void BilingualLM::getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const
{
  const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();

  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      // Nonterminal: get the word IDs from the state of the previous hypothesis.
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      const BilingualLMState * prev_state =
        static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
      const std::vector<int>& prevWordIDs = prev_state->GetWordIdsVector();
      for (std::vector<int>::const_iterator it = prevWordIDs.begin(); it != prevWordIDs.end(); it++) {
        wordIds.push_back(*it);
      }
    } else {
      wordIds.push_back(getNeuralLMId(targetPhrase.GetWord(i), false));
    }
  }
}

void BilingualLM::getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& word_alignments) const
{
  const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
  int source_word_mid_idx; // The resolved word alignment.

  // Get the alignments for the terminals in this rule.
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();

  // Get the absolute position in the source sentence for each source word in
  // the rule. We actually only need the number of source symbols in the rule;
  // can we get this number cheaply?
  std::vector<size_t> absolute_source_position(cur_hypo.GetCurrSourceRange().GetNumWordsCovered(), 0);
  absolute_source_position[0] = cur_hypo.GetCurrSourceRange().GetStartPos();

  // Get the last absolute position of each source nonterminal symbol.
  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      absolute_source_position[targetPhrase.GetAlignNonTerm().GetNonTermIndexMap2()[i]] =
        prev_hypo->GetCurrSourceRange().GetEndPos();
    }
  }

  // Set the absolute position of every source terminal symbol based on the
  // absolute position of the symbol before it.
  for (size_t i = 0; i != absolute_source_position.size(); i++) {
    if (i && absolute_source_position[i] == 0) {
      absolute_source_position[i] = absolute_source_position[i-1] + 1;
    }
  }

  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    // Sometimes we have to traverse more than one target word because of
    // unaligned words. This is O(n^2) in the worst case, but usually closer to O(n).
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      // For a nonterminal we can get the alignments from the previous state.
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      const BilingualLMState * prev_state =
        static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
      const std::vector<int>& prevWordAls = prev_state->GetWordAlignmentVector();
      for (std::vector<int>::const_iterator it = prevWordAls.begin(); it != prevWordAls.end(); it++) {
        word_alignments.push_back(*it);
      }
    } else {
      // True once we have taken an index from an existing nonterminal, in
      // which case we don't need to calculate offsets.
      bool resolvedIndex = false;
      std::set<size_t> word_al = alignments.GetAlignmentsForTarget(i);
      if (word_al.empty()) {
        // The current word is unaligned: try the first aligned word to the
        // right, then the first to the left, at increasing distances.
        for (int j = 1; j < targetPhrase.GetSize(); j++) {
          if ((i + j) < targetPhrase.GetSize()) {
            // TODO: this will always succeed, even if the first word in the
            // previous hypo is unaligned. Should it?
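            // Fallback note: when the rightward probe hits a nonterminal, the
            // first source index stored in that subderivation's state is
            // reused as a stand-in alignment; the leftward probe below
            // symmetrically reuses the last stored index.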
            if (targetPhrase.GetWord(i + j).IsNonTerminal()) {
              const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i+j]);
              const BilingualLMState * prev_state =
                static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
              // The first word on the right of our word.
              source_word_mid_idx = prev_state->GetWordAlignmentVector().front();
              resolvedIndex = true;
              break;
            }
            word_al = alignments.GetAlignmentsForTarget(i + j);
            if (!word_al.empty()) {
              break;
            }
          }
          if ((i - j) >= 0) {
            // TODO: this will always succeed, even if the last word in the
            // previous hypo is unaligned. Should it?
            if (targetPhrase.GetWord(i - j).IsNonTerminal()) {
              const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i-j]);
              const BilingualLMState * prev_state =
                static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
              // The first word on the left of our word.
              source_word_mid_idx = prev_state->GetWordAlignmentVector().back();
              resolvedIndex = true;
              break;
            }
            word_al = alignments.GetAlignmentsForTarget(i - j);
            if (!word_al.empty()) {
              break;
            }
          }
        }
      }

      if (!resolvedIndex) {
        // It should never be the case that the alignment set is empty, but
        // this has happened several times because of a corrupt phrase table.
        // It is best to have this check here, as it makes debugging the
        // problem a lot easier.
        UTIL_THROW_IF2(word_al.size() == 0,
                       "A target phrase with no alignments detected! " << targetPhrase
                       << " Check if there is something wrong with your phrase table.");
        size_t source_center_index = selectMiddleAlignment(word_al);
        // We have found the alignment. Now determine how much to shift by to
        // get the actual source word index.
        source_word_mid_idx = absolute_source_position[source_center_index];
      }
      word_alignments.push_back(source_word_mid_idx);
    }
  }
}

size_t BilingualLM::getStateChart(std::vector<int>& neuralLMids) const
{
  size_t hashCode = 0;
  // Cast the size to int before subtracting: if the hypothesis contains fewer
  // than target_ngrams words, an unsigned subtraction would underflow.
  for (int i = (int)neuralLMids.size() - target_ngrams; i < (int)neuralLMids.size(); i++) {
    int neuralLM_wordID;
    if (i < 0) {
      neuralLM_wordID = getNeuralLMId(BOS_word, false);
    } else {
      neuralLM_wordID = neuralLMids[i];
    }
    boost::hash_combine(hashCode, neuralLM_wordID);
  }
  return hashCode;
}

void BilingualLM::getTargetWordsChart(
  std::vector<int>& neuralLMids,
  int current_word_index,
  std::vector<int>& words,
  bool sentence_begin) const
{
  for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
    if (i < 0) {
      if (sentence_begin) {
        words.push_back(getNeuralLMId(BOS_word, false));
      } else {
        words.push_back(getNeuralLMId(getNullWord(), false));
      }
    } else {
      words.push_back(neuralLMids[i]);
    }
  }
}

void BilingualLM::appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const
{
  // Define the begin and end indexes of the lookup; the even and odd
  // source_ngrams cases are handled separately. The indexes can reach beyond
  // the bounds of the source sentence; in that case we simply pad with BOS on
  // the left and EOS on the right, as below.
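  // Worked example (added for illustration): with source_ngrams = 5 and
  // source_word_mid_idx = 7 the window is [5, 9] (centered); with
  // source_ngrams = 4 it is [6, 9], i.e. an even window takes one extra word
  // on the right of the aligned word.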
  int begin_idx;
  int end_idx;
  if (source_ngrams % 2 == 0) {
    begin_idx = source_word_mid_idx - source_ngrams / 2 + 1;
    end_idx = source_word_mid_idx + source_ngrams / 2;
  } else {
    begin_idx = source_word_mid_idx - (source_ngrams - 1) / 2;
    end_idx = source_word_mid_idx + (source_ngrams - 1) / 2;
  }

  // Add the words to the vector.
  for (int j = begin_idx; j <= end_idx; j++) {
    int neuralLM_wordID;
    if (j < 0) {
      neuralLM_wordID = getNeuralLMId(BOS_word, true);
    } else if (j >= (int)source_sent.GetSize()) {
      neuralLM_wordID = getNeuralLMId(EOS_word, true);
    } else {
      const Word& word = source_sent.GetWord(j);
      neuralLM_wordID = getNeuralLMId(word, true);
    }
    words.push_back(neuralLM_wordID);
  }
}

FFState* BilingualLM::EvaluateWhenApplied(
  const ChartHypothesis& cur_hypo,
  int featureID, // Used to index the state in the previous hypotheses.
  ScoreComponentCollection* accumulator) const
{
  // Init vectors.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams + 1);

  float value = 0; // NeuralLM score.
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();

  // More or less equivalent to whole_phrase: contains all word IDs, but is
  // cheaper to build.
  std::vector<int> neuralLMids;
  std::vector<int> alignments;

  // Estimate the size and reserve the vectors to avoid reallocation.
  int future_size = currTargetPhrase.GetNumTerminals();
  for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); i++) {
    future_size += cur_hypo.GetPrevHypo(i)->GetCurrTargetPhrase().GetSize();
  }
  neuralLMids.reserve(future_size);
  alignments.reserve(future_size);

  getAllTargetIdsChart(cur_hypo, featureID, neuralLMids);
  getAllAlignments(cur_hypo, featureID, alignments);

  // Check whether this hypothesis' target words start at the beginning of the sentence.
  bool sentence_begin = false;
  if (neuralLMids[0] == getNeuralLMId(BOS_word, false)) {
    sentence_begin = true;
  }

  // Get the source sentence.
  const ChartManager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  // The nonterminals have already been resolved, so this is a simple loop over
  // all word IDs (it grows as nonterminals expand).
  for (int i = 0; i < neuralLMids.size(); i++) {
    appendSourceWordsToVector(source_sent, source_words, alignments[i]);
    getTargetWordsChart(neuralLMids, i, target_words, sentence_begin);
    value += Score(source_words, target_words); // Get the score.

    // Clear the vectors before the next iteration.
    source_words.clear();
    target_words.clear();
  }

  size_t new_state = getStateChart(neuralLMids);

  // We are rescoring the full hypothesis, so we need to subtract the scores
  // of the previous hypotheses.
  for (std::vector<const ChartHypothesis*>::const_iterator iter = cur_hypo.GetPrevHypos().begin();
       iter != cur_hypo.GetPrevHypos().end(); ++iter) {
    const ChartHypothesis &prevHypo = **iter;
    value -= prevHypo.GetScoreBreakdown().GetScoreForProducer(this);
  }

  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state, alignments, neuralLMids);
}

void BilingualLM::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_filePath = value;
  } else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}

} // namespace Moses
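// Usage sketch (illustrative, not part of the original file; parameter names
// are an assumption based on the Moses BilingualNPLM documentation, and only
// "path" is handled in this base class). A concrete subclass of BilingualLM
// is enabled from moses.ini roughly as:
//
//   [feature]
//   BilingualNPLM order=5 source_window=4 path=/path/to/model \
//       source_vocab=/path/to/vocab.source target_vocab=/path/to/vocab.target
//
// Unrecognised keys fall through to StatefulFeatureFunction::SetParameter above.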