#include "HypPackEnumerator.h"

// NOTE(review): the three system/third-party include targets were lost in the
// damaged text. <cassert> and <boost/unordered_set.hpp> are required by
// assert() and boost::unordered_set below; <algorithm> is needed by the
// reconstructed reset(). The remaining headers are included for names this
// file uses directly (cerr/endl, exit, string, pair, vector).
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <boost/unordered_set.hpp>

using namespace std;

namespace MosesTuning
{

/**
 * Build a streaming enumerator over parallel feature and score files.
 *
 * @param featureFiles  one feature file name per n-best list
 * @param scoreFiles    one score file name per n-best list; must have the
 *                      same number of entries as featureFiles
 *
 * Terminates the process if either list is empty (exit status 0) or if the
 * two lists differ in length (exit status 1). No file is opened here; the
 * iterators are created by reset().
 */
StreamingHypPackEnumerator::StreamingHypPackEnumerator
(
  vector<string> const& featureFiles,
  vector<string> const& scoreFiles
)
  : m_featureFiles(featureFiles),
    m_scoreFiles(scoreFiles)
{
  if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
    cerr << "No data to process" << endl;
    exit(0);
  }

  if (featureFiles.size() != scoreFiles.size()) {
    cerr << "Error: Number of feature files (" << featureFiles.size()
         << ") does not match number of score files (" << scoreFiles.size()
         << ")" << endl;
    exit(1);
  }

  m_num_lists = scoreFiles.size();
  m_primed = false;
  m_iNumDense = -1; // -1 means "not yet observed"; set by prime()
  // Give cur_id() a defined value even if it is called before reset().
  m_sentenceId = 0;
}

/**
 * Number of dense features per hypothesis, as observed by prime().
 * Terminates the process (exit status 1) if no hypothesis has been seen yet.
 */
size_t StreamingHypPackEnumerator::num_dense() const
{
  if(m_iNumDense<0) {
    cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
    exit(1);
  }
  return (size_t) m_iNumDense;
}

/**
 * Load the hypotheses of the current sentence from every list.
 *
 * Rebuilds m_current_indexes and m_current_featureVectors, keeping only the
 * first occurrence of each distinct FeatureDataItem across all lists.
 * Terminates the process (exit status 1) if any feature or score iterator is
 * already at its end, if a list's feature and score entries differ in size,
 * or if the dense feature count differs from the first one observed.
 */
void StreamingHypPackEnumerator::prime()
{
  m_current_indexes.clear();
  m_current_featureVectors.clear();
  boost::unordered_set<FeatureDataItem> seen;
  m_primed = true;

  for (size_t i = 0; i < m_num_lists; ++i) {
    if (m_featureDataIters[i] == FeatureDataIterator::end()) {
      cerr << "Error: Feature file " << i << " ended prematurely" << endl;
      exit(1);
    }
    if (m_scoreDataIters[i] == ScoreDataIterator::end()) {
      cerr << "Error: Score file " << i << " ended prematurely" << endl;
      exit(1);
    }
    if (m_featureDataIters[i]->size() != m_scoreDataIters[i]->size()) {
      cerr << "Error: For sentence " << m_sentenceId << " features and scores have different size" << endl;
      exit(1);
    }
    for (size_t j = 0; j < m_featureDataIters[i]->size(); ++j) {
      const FeatureDataItem& item = m_featureDataIters[i]->operator[](j);
      // Dedup
      if(seen.find(item)==seen.end()) {
        seen.insert(item);
        // Confirm dense features are always the same
        int iDense = item.dense.size();
        if(m_iNumDense != iDense) {
          if(m_iNumDense==-1) m_iNumDense = iDense;
          else {
            cerr << "Error: expecting constant number of dense features: "
                 << m_iNumDense << " != " << iDense << endl;
            exit(1);
          }
        }
        // Store item for retrieval
        m_current_indexes.push_back(pair<size_t,size_t>(i,j));
        m_current_featureVectors.push_back(MiraFeatureVector(item));
      }
    }
  }
}

/**
 * Recreate one feature iterator and one score iterator per list from the
 * stored file names, set the sentence id back to 0 and prime the first
 * sentence.
 */
void StreamingHypPackEnumerator::reset()
{
  m_featureDataIters.clear();
  m_scoreDataIters.clear();
  for (size_t i = 0; i < m_num_lists; ++i) {
    m_featureDataIters.push_back(FeatureDataIterator(m_featureFiles[i]));
    m_scoreDataIters.push_back(ScoreDataIterator(m_scoreFiles[i]));
  }
  m_sentenceId=0;
  prime();
}

/**
 * True once the feature iterator of list 0 has reached its end. Only list 0
 * is consulted; prime() reports the other lists ending early.
 */
bool StreamingHypPackEnumerator::finished()
{
  return m_featureDataIters[0]==FeatureDataIterator::end();
}

/**
 * Advance every list to its next sentence and prime it unless finished().
 * Writes a "." to stderr whenever the new sentence id is a multiple of 100.
 * Terminates the process (exit status 1) if the enumerator was never primed.
 */
void StreamingHypPackEnumerator::next()
{
  if(!m_primed) {
    cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
    exit(1);
  }
  for (size_t i = 0; i < m_num_lists; ++i) {
    ++m_featureDataIters[i];
    ++m_scoreDataIters[i];
  }
  m_sentenceId++;
  if(m_sentenceId % 100 == 0) cerr << ".";
  if(!finished()) prime();
}

/**
 * Number of de-duplicated hypotheses stored for the current sentence.
 * Terminates the process (exit status 1) if the enumerator was never primed.
 */
size_t StreamingHypPackEnumerator::cur_size()
{
  if(!m_primed) {
    cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
    exit(1);
  }
  return m_current_indexes.size();
}

/**
 * Feature vector of the index-th stored hypothesis of the current sentence.
 * The index is not range-checked. Terminates the process (exit status 1) if
 * the enumerator was never primed.
 */
const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
{
  if(!m_primed) {
    cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
    exit(1);
  }
  return m_current_featureVectors[index];
}

/**
 * Score entry of the index-th stored hypothesis of the current sentence,
 * looked up through the (list, position) pair recorded by prime().
 * The index is not range-checked. Terminates the process (exit status 1) if
 * the enumerator was never primed.
 */
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
{
  if(!m_primed) {
    cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
    exit(1);
  }
  const pair<size_t,size_t>& pij = m_current_indexes[index];
  return m_scoreDataIters[pij.first]->operator[](pij.second);
}

/** Zero-based id of the current sentence. */
size_t StreamingHypPackEnumerator::cur_id()
{
  return m_sentenceId;
}

/* --------- RandomAccessHypPackEnumerator ------------- */

/**
 * Read every sentence through a StreamingHypPackEnumerator and keep the
 * feature vectors and score entries in memory.
 *
 * @param featureFiles  one feature file name per n-best list
 * @param scoreFiles    one score file name per n-best list
 * @param no_shuffle    NOTE(review): presumably disables shuffling of the
 *                      sentence order in reset() -- confirm against the header
 */
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
    vector<string> const& scoreFiles,
    bool no_shuffle)
{
  StreamingHypPackEnumerator train(featureFiles,scoreFiles);
  size_t index=0;
  for(train.reset(); !train.finished(); train.next()) {
    m_features.push_back(vector<MiraFeatureVector>());
    m_scores.push_back(vector<ScoreDataItem>());
    // NOTE(review): everything from this loop condition down to the start of
    // the comparison in finished() was lost in the damaged text and has been
    // reconstructed. In particular the members m_no_shuffle and m_num_dense
    // and the use of random_shuffle are assumptions; verify them against
    // HypPackEnumerator.h and version control before merging.
    for(size_t j=0; j<train.cur_size(); j++) {
      m_features.back().push_back(train.featuresAt(j));
      m_scores.back().push_back(train.scoresAt(j));
    }
    m_indexes.push_back(index++);
  }

  m_cur_index = 0;
  m_no_shuffle = no_shuffle;
  m_num_dense = train.num_dense();
}

/** Number of dense features, as captured from the streaming pass. */
size_t RandomAccessHypPackEnumerator::num_dense() const
{
  return m_num_dense;
}

/**
 * Return to the first sentence, shuffling the visiting order first unless
 * shuffling was disabled at construction.
 */
void RandomAccessHypPackEnumerator::reset()
{
  m_cur_index = 0;
  // NOTE(review): std::random_shuffle was removed in C++17. Switching to
  // std::shuffle would change which random source drives the order, so that
  // is left as a separate decision.
  if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
}

/** True once every stored sentence has been visited. */
bool RandomAccessHypPackEnumerator::finished()
{
  return m_cur_index >= m_indexes.size();
}

/** Move on to the next sentence in the current visiting order. */
void RandomAccessHypPackEnumerator::next()
{
  m_cur_index++;
}

/**
 * Number of hypotheses stored for the current sentence. Asserts that the
 * feature and score stores agree on that number.
 */
size_t RandomAccessHypPackEnumerator::cur_size()
{
  assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
  return m_features[m_indexes[m_cur_index]].size();
}

/** Feature vector of the i-th hypothesis of the current sentence (unchecked). */
const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
{
  return m_features[m_indexes[m_cur_index]][i];
}

/** Score entry of the i-th hypothesis of the current sentence (unchecked). */
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
{
  return m_scores[m_indexes[m_cur_index]][i];
}

/** Id of the current sentence, i.e. its position in the original read order. */
size_t RandomAccessHypPackEnumerator::cur_id()
{
  return m_indexes[m_cur_index];
}

// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:

}