diff options
author | Ulrich Germann <ugermann@inf.ed.ac.uk> | 2015-02-06 04:30:00 +0300 |
---|---|---|
committer | Ulrich Germann <ugermann@inf.ed.ac.uk> | 2015-02-06 04:30:00 +0300 |
commit | be5799dca34027849fc40a38a63459e164f27add (patch) | |
tree | 140e865a962c546c12e2322ab76a56699e7338c6 /mert | |
parent | 80a9f84422f3b7ce3ddf0bcfcbe2e8d06bba9e98 (diff) | |
parent | 8b61f396a7558bf628c2e94a9583023b9ae34a8c (diff) |
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
Conflicts:
moses/TranslationOptionCollection.cpp
moses/TranslationOptionCollectionLattice.cpp
moses/TranslationOptionCollectionLattice.h
moses/TranslationOptionList.h
Diffstat (limited to 'mert')
46 files changed, 2642 insertions, 2881 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp index 08e80409f..49c1239e5 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -263,13 +263,13 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor { UTIL_THROW_IF(shard_count == 0, util::Exception, "Must have at least 1 shard"); UTIL_THROW_IF(shard_size < 0 || shard_size > 1, - util::Exception, - "Shard size must be between 0 and 1, inclusive. Currently " << shard_size); + util::Exception, + "Shard size must be between 0 and 1, inclusive. Currently " << shard_size); size_t data_size = m_score_data->size(); UTIL_THROW_IF(data_size != m_feature_data->size(), - util::Exception, - "Error"); + util::Exception, + "Error"); shard_size *= data_size; const float coeff = static_cast<float>(data_size) / shard_count; diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index a0c6a6ebc..a3ed2cc9b 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -61,7 +61,8 @@ void SparseVector::set(const string& name, FeatureStatsType value) m_fvector[id] = value; } -void SparseVector::set(size_t id, FeatureStatsType value) { +void SparseVector::set(size_t id, FeatureStatsType value) +{ assert(m_id_to_name.size() > id); m_fvector[id] = value; } @@ -204,7 +205,7 @@ FeatureStats::FeatureStats(const size_t size) FeatureStats::~FeatureStats() { - delete [] m_array; + delete [] m_array; } void FeatureStats::Copy(const FeatureStats &stats) diff --git a/mert/ForestRescore.cpp b/mert/ForestRescore.cpp index 0172c6d92..009152e35 100644 --- a/mert/ForestRescore.cpp +++ b/mert/ForestRescore.cpp @@ -31,9 +31,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace std; -namespace MosesTuning { +namespace MosesTuning +{ -std::ostream& operator<<(std::ostream& out, const WordVec& wordVec) { +std::ostream& operator<<(std::ostream& out, const WordVec& wordVec) +{ out << "["; for (size_t i = 0; i < wordVec.size(); ++i) { out << wordVec[i]->first; @@ -44,7 +46,8 @@ std::ostream& operator<<(std::ostream& out, const WordVec& wordVec) { } -void ReferenceSet::Load(const vector<string>& files, Vocab& vocab) { +void ReferenceSet::Load(const vector<string>& files, Vocab& vocab) +{ for (size_t i = 0; i < files.size(); ++i) { util::FilePiece fh(files[i].c_str()); size_t sentenceId = 0; @@ -55,14 +58,15 @@ void ReferenceSet::Load(const vector<string>& files, Vocab& vocab) { } catch (util::EndOfFileException &e) { break; } - AddLine(sentenceId, line, vocab); - ++sentenceId; + AddLine(sentenceId, line, vocab); + ++sentenceId; } } } -void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab) { +void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab) +{ //cerr << line << endl; NgramCounter ngramCounts; list<WordVec> openNgrams; @@ -74,14 +78,14 @@ void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vo openNgrams.push_front(WordVec()); for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) { k->push_back(nextTok); - ++ngramCounts[*k]; + ++ngramCounts[*k]; } if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back(); } //merge into overall ngram map for (NgramCounter::const_iterator ni = ngramCounts.begin(); - ni != ngramCounts.end(); ++ni) { + ni != ngramCounts.end(); ++ni) { size_t count = ni->second; //cerr << *ni << " " << count << endl; if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1); @@ -104,8 +108,9 @@ void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vo //cerr << endl; } - -size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const { + +size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const +{ const NgramMap& ngramCounts = ngramCounts_.at(sentenceId); NgramMap::const_iterator ngi = ngramCounts.find(ngram); if (ngi == ngramCounts.end()) return 0; @@ -114,7 +119,8 @@ size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool VertexState::VertexState(): bleuStats(kBleuNgramOrder), targetLength(0) {} -void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats ) const { +void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats ) const +{ for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) { //cerr << "Checking: " << *ngi << " matches " << references_.NgramMatches(sentenceId_,*ngi,false) << endl; size_t order = ngi->first.size(); @@ -124,7 +130,8 @@ void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStats } } -size_t HgBleuScorer::GetTargetLength(const Edge& edge) const { +size_t HgBleuScorer::GetTargetLength(const Edge& edge) const +{ size_t targetLength = 0; for (size_t i = 0; i < edge.Words().size(); ++i) { const Vocab::Entry* word = edge.Words()[i]; @@ -137,7 +144,8 @@ size_t HgBleuScorer::GetTargetLength(const Edge& edge) const { return targetLength; } -FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats) { +FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats) +{ NgramCounter ngramCounts; size_t childId = 0; size_t wordId = 0; @@ -147,7 +155,7 @@ FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vecto bool inRightContext = false; list<WordVec> openNgrams; const Vocab::Entry* currentWord = NULL; - while (wordId < edge.Words().size()) { + while (wordId < edge.Words().size()) { currentWord = edge.Words()[wordId]; if (currentWord != NULL) { ++wordId; @@ -214,7 +222,7 @@ FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vecto } if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back(); } - + //Collect matches //This edge //cerr << "edge ngrams" << endl; @@ -227,26 +235,27 @@ FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vecto bleuStats[j] += vertexStates_[edge.Children()[i]].bleuStats[j]; } } - + FeatureStatsType sourceLength = head.SourceCovered(); size_t referenceLength = references_.Length(sentenceId_); - FeatureStatsType effectiveReferenceLength = + FeatureStatsType effectiveReferenceLength = sourceLength / totalSourceLength_ * referenceLength; bleuStats[bleuStats.size()-1] = effectiveReferenceLength; - //backgroundBleu_[backgroundBleu_.size()-1] = + //backgroundBleu_[backgroundBleu_.size()-1] = // backgroundRefLength_ * sourceLength / totalSourceLength_; FeatureStatsType bleu = sentenceLevelBackgroundBleu(bleuStats, backgroundBleu_); return bleu; } -void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats) { +void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats) +{ //TODO: Maybe more efficient to absorb into the Score() method VertexState& vertexState = vertexStates_[vertexId]; //cerr << "Updating state for " << vertexId << endl; - + //leftContext int wi = 0; const VertexState* childState = NULL; @@ -263,9 +272,9 @@ void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const ve //start of child state childState = &(vertexStates_[winnerEdge.Children()[childi++]]); contexti = 0; - } + } if ((size_t)contexti < childState->leftContext.size()) { - vertexState.leftContext.push_back(childState->leftContext[contexti++]); + vertexState.leftContext.push_back(childState->leftContext[contexti++]); } else { //end of child context childState = NULL; @@ -314,7 +323,8 @@ typedef pair<const Edge*,FeatureStatsType> BackPointer; * Recurse through back pointers **/ static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<BackPointer>& bps, - HgHypothesis* bestHypo) { + HgHypothesis* bestHypo) +{ //cerr << "Expanding " << vertexId << " Score: " << bps[vertexId].second << endl; //UTIL_THROW_IF(bps[vertexId].second == kMinScore+1, HypergraphException, "Landed at vertex " << vertexId << " which is a dead end"); if (!bps[vertexId].first) return; @@ -334,14 +344,14 @@ static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector< } } -void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references , size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo) +void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references , size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo) { BackPointer init(NULL,kMinScore); vector<BackPointer> backPointers(graph.VertexSize(),init); HgBleuScorer bleuScorer(references, graph, sentenceId, backgroundBleu); vector<FeatureStatsType> winnerStats(kBleuNgramOrder*2+1); for (size_t vi = 0; vi < graph.VertexSize(); ++vi) { - //cerr << "vertex id " << vi << endl; +// cerr << "vertex id " << vi << endl; FeatureStatsType winnerScore = kMinScore; const Vertex& vertex = graph.GetVertex(vi); const vector<const Edge*>& incoming = vertex.GetIncoming(); @@ -349,7 +359,7 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, //UTIL_THROW(HypergraphException, "Vertex " << vi << " has no incoming edges"); //If no incoming edges, vertex is a dead end backPointers[vi].first = NULL; - backPointers[vi].second = kMinScore/2; + backPointers[vi].second = kMinScore; } else { //cerr << "\nVertex: " << vi << endl; for (size_t ei = 0; ei < incoming.size(); ++ei) { @@ -357,15 +367,15 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, FeatureStatsType incomingScore = incoming[ei]->GetScore(weights); for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) { size_t childId = incoming[ei]->Children()[i]; - UTIL_THROW_IF(backPointers[childId].second == kMinScore, - HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); - incomingScore += backPointers[childId].second; + //UTIL_THROW_IF(backPointers[childId].second == kMinScore, + // HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); + incomingScore = max(incomingScore + backPointers[childId].second, kMinScore); } vector<FeatureStatsType> bleuStats(kBleuNgramOrder*2+1); - // cerr << "Score: " << incomingScore << " Bleu: "; - // if (incomingScore > nonbleuscore) {nonbleuscore = incomingScore; nonbleuid = ei;} + // cerr << "Score: " << incomingScore << " Bleu: "; + // if (incomingScore > nonbleuscore) {nonbleuscore = incomingScore; nonbleuid = ei;} FeatureStatsType totalScore = incomingScore; - if (bleuWeight) { + if (bleuWeight) { FeatureStatsType bleuScore = bleuScorer.Score(*(incoming[ei]), vertex, bleuStats); if (isnan(bleuScore)) { cerr << "WARN: bleu score undefined" << endl; @@ -379,7 +389,7 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, } //UTIL_THROW_IF(isnan(bleuScore), util::Exception, "Bleu score undefined, smoothing problem?"); totalScore += bleuWeight * bleuScore; - // cerr << bleuScore << " Total: " << incomingScore << endl << endl; + // cerr << bleuScore << " Total: " << incomingScore << endl << endl; //cerr << "is " << incomingScore << " bs " << bleuScore << endl; } if (totalScore >= winnerScore) { @@ -394,9 +404,12 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, //update with winner //if (bleuWeight) { //TODO: Not sure if we need this when computing max-model solution - bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats); + if (backPointers[vi].first) { + bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats); + } } +// cerr << "backpointer[" << vi << "] = (" << backPointers[vi].first << "," << backPointers[vi].second << ")" << endl; } //expand back pointers diff --git a/mert/ForestRescore.h b/mert/ForestRescore.h index 900275b74..2101a9248 100644 --- a/mert/ForestRescore.h +++ b/mert/ForestRescore.h @@ -27,7 +27,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "BleuScorer.h" #include "Hypergraph.h" -namespace MosesTuning { +namespace MosesTuning +{ std::ostream& operator<<(std::ostream& out, const WordVec& wordVec); @@ -47,18 +48,21 @@ struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter; -class ReferenceSet { +class ReferenceSet +{ public: - + void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab); void Load(const std::vector<std::string>& files, Vocab& vocab); size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const; - size_t Length(size_t sentenceId) const {return lengths_[sentenceId];} + size_t Length(size_t sentenceId) const { + return lengths_[sentenceId]; + } private: //ngrams to (clipped,unclipped) counts @@ -80,31 +84,32 @@ struct VertexState { /** * Used to score an rule (ie edge) when we are applying it. **/ -class HgBleuScorer { - public: - HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu): +class HgBleuScorer +{ +public: + HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu): references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu), - backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) { - vertexStates_.resize(graph.VertexSize()); - totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered(); - } - - FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ; + backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) { + vertexStates_.resize(graph.VertexSize()); + totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered(); + } - void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats); + FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ; + void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats); - private: - const ReferenceSet& references_; - std::vector<VertexState> vertexStates_; - size_t sentenceId_; - size_t totalSourceLength_; - const Graph& graph_; - std::vector<FeatureStatsType> backgroundBleu_; - FeatureStatsType backgroundRefLength_; - void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const; - size_t GetTargetLength(const Edge& edge) const; +private: + const ReferenceSet& references_; + std::vector<VertexState> vertexStates_; + size_t sentenceId_; + size_t totalSourceLength_; + const Graph& graph_; + std::vector<FeatureStatsType> backgroundBleu_; + FeatureStatsType backgroundRefLength_; + + void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const; + size_t GetTargetLength(const Edge& edge) const; }; struct HgHypothesis { diff --git a/mert/ForestRescoreTest.cpp b/mert/ForestRescoreTest.cpp index 86975d3a5..4b62e8317 100644 --- a/mert/ForestRescoreTest.cpp +++ b/mert/ForestRescoreTest.cpp @@ -15,7 +15,7 @@ BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) Vocab vocab; WordVec words; string wordStrings[] = - {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g"}; + {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g"}; for (size_t i = 0; i < 9; ++i) { words.push_back(&(vocab.FindOrAdd((wordStrings[i])))); } @@ -102,7 +102,7 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice) Vocab vocab; WordVec words; string wordStrings[] = - {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; + {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; for (size_t i = 0; i < 13; ++i) { words.push_back(&(vocab.FindOrAdd((wordStrings[i])))); } diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp index 993cef1d8..3e62d8171 100644 --- a/mert/HopeFearDecoder.cpp +++ b/mert/HopeFearDecoder.cpp @@ -34,11 +34,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace std; namespace fs = boost::filesystem; -namespace MosesTuning { +namespace MosesTuning +{ static const ValType BLEU_RATIO = 5; -ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv) { +ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv) +{ vector<ValType> stats(scorer_->NumberOfScores(),0); for(reset(); !finished(); next()) { vector<ValType> sent; @@ -51,13 +53,14 @@ ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv) { } NbestHopeFearDecoder::NbestHopeFearDecoder( - const vector<string>& featureFiles, - const vector<string>& scoreFiles, - bool streaming, - bool no_shuffle, - bool safe_hope, - Scorer* scorer - ) : safe_hope_(safe_hope) { + const vector<string>& featureFiles, + const vector<string>& scoreFiles, + bool streaming, + bool no_shuffle, + bool safe_hope, + Scorer* scorer +) : safe_hope_(safe_hope) +{ scorer_ = scorer; if (streaming) { train_.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles)); @@ -67,25 +70,29 @@ NbestHopeFearDecoder::NbestHopeFearDecoder( } -void NbestHopeFearDecoder::next() { +void NbestHopeFearDecoder::next() +{ train_->next(); } -bool NbestHopeFearDecoder::finished() { +bool NbestHopeFearDecoder::finished() +{ return train_->finished(); } -void NbestHopeFearDecoder::reset() { +void NbestHopeFearDecoder::reset() +{ train_->reset(); } void NbestHopeFearDecoder::HopeFear( - const std::vector<ValType>& backgroundBleu, - const MiraWeightVector& wv, - HopeFearData* hopeFear - ) { + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear +) +{ + - // Hope / fear decode ValType hope_scale = 1.0; size_t hope_index=0, fear_index=0, model_index=0; @@ -134,7 +141,8 @@ void NbestHopeFearDecoder::HopeFear( hopeFear->hopeFearEqual = (hope_index == fear_index); } -void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) { +void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) +{ // Find max model size_t max_index=0; ValType max_score=0; @@ -152,18 +160,19 @@ void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValTy HypergraphHopeFearDecoder::HypergraphHopeFearDecoder - ( - const string& hypergraphDir, - const vector<string>& referenceFiles, - size_t num_dense, - bool streaming, - bool no_shuffle, - bool safe_hope, - size_t hg_pruning, - const MiraWeightVector& wv, - Scorer* scorer - ) : - num_dense_(num_dense) { +( + const string& hypergraphDir, + const vector<string>& referenceFiles, + size_t num_dense, + bool streaming, + bool no_shuffle, + bool safe_hope, + size_t hg_pruning, + const MiraWeightVector& wv, + Scorer* scorer +) : + num_dense_(num_dense) +{ UTIL_THROW_IF(streaming, util::Exception, "Streaming not currently supported for hypergraphs"); UTIL_THROW_IF(!fs::exists(hypergraphDir), HypergraphException, "Directory '" << hypergraphDir << "' does not exist"); @@ -177,16 +186,17 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder static const string kWeights = "weights"; fs::directory_iterator dend; size_t fileCount = 0; - + cerr << "Reading hypergraphs" << endl; for (fs::directory_iterator di(hypergraphDir); di != dend; ++di) { const fs::path& hgpath = di->path(); if (hgpath.filename() == kWeights) continue; + // cerr << "Reading " << hgpath.filename() << endl; Graph graph(vocab_); size_t id = boost::lexical_cast<size_t>(hgpath.stem().string()); util::scoped_fd fd(util::OpenReadOrThrow(hgpath.string().c_str())); //util::FilePiece file(di->path().string().c_str()); - util::FilePiece file(fd.release()); + util::FilePiece file(fd.release()); ReadGraph(file,graph); //cerr << "ref length " << references_.Length(id) << endl; @@ -195,7 +205,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder prunedGraph.reset(new Graph(vocab_)); graph.Prune(prunedGraph.get(), weights, edgeCount); graphs_[id] = prunedGraph; - //cerr << "Pruning to v=" << graphs_[id]->VertexSize() << " e=" << graphs_[id]->EdgeSize() << endl; + // cerr << "Pruning to v=" << graphs_[id]->VertexSize() << " e=" << graphs_[id]->EdgeSize() << endl; ++fileCount; if (fileCount % 10 == 0) cerr << "."; if (fileCount % 400 == 0) cerr << " [count=" << fileCount << "]\n"; @@ -210,23 +220,27 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder } -void HypergraphHopeFearDecoder::reset() { +void HypergraphHopeFearDecoder::reset() +{ sentenceIdIter_ = sentenceIds_.begin(); } -void HypergraphHopeFearDecoder::next() { +void HypergraphHopeFearDecoder::next() +{ sentenceIdIter_++; } -bool HypergraphHopeFearDecoder::finished() { +bool HypergraphHopeFearDecoder::finished() +{ return sentenceIdIter_ == sentenceIds_.end(); } void HypergraphHopeFearDecoder::HopeFear( - const vector<ValType>& backgroundBleu, - const MiraWeightVector& wv, - HopeFearData* hopeFear - ) { + const vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear +) +{ size_t sentenceId = *sentenceIdIter_; SparseVector weights; wv.ToSparse(&weights); @@ -246,12 +260,12 @@ void HypergraphHopeFearDecoder::HopeFear( Viterbi(graph, weights, 0, references_, sentenceId, backgroundBleu, &modelHypo); - // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases + // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases // where model score is having far more influence than BLEU - // hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU - // if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale) - // hope_scale = abs(hope_bleu) / abs(hope_model); - // else break; + // hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU + // if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale) + // hope_scale = abs(hope_bleu) / abs(hope_model); + // else break; //TODO: Don't currently get model and bleu so commented this out for now. break; } @@ -310,21 +324,23 @@ void HypergraphHopeFearDecoder::HopeFear( if (hopeFear->hopeFearEqual) { for (size_t i = 0; i < fearStats.size(); ++i) { if (fearStats[i] != hopeFear->hopeStats[i]) { - hopeFear->hopeFearEqual = false; - break; + hopeFear->hopeFearEqual = false; + break; } } } hopeFear->hopeFearEqual = hopeFear->hopeFearEqual && (hopeFear->fearFeatures == hopeFear->hopeFeatures); } -void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats) { +void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats) +{ assert(!finished()); HgHypothesis bestHypo; size_t sentenceId = *sentenceIdIter_; SparseVector weights; wv.ToSparse(&weights); vector<ValType> bg(scorer_->NumberOfScores()); + //cerr << "Calculating bleu on " << sentenceId << endl; Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo); stats->resize(bestHypo.bleuStats.size()); /* diff --git a/mert/HopeFearDecoder.h b/mert/HopeFearDecoder.h index d1881eeb2..53c0e935d 100644 --- a/mert/HopeFearDecoder.h +++ b/mert/HopeFearDecoder.h @@ -35,7 +35,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // the n-best list and lattice/hypergraph implementations // -namespace MosesTuning { +namespace MosesTuning +{ class Scorer; @@ -44,7 +45,7 @@ struct HopeFearData { MiraFeatureVector modelFeatures; MiraFeatureVector hopeFeatures; MiraFeatureVector fearFeatures; - + std::vector<float> modelStats; std::vector<float> hopeStats; @@ -55,7 +56,8 @@ struct HopeFearData { }; //Abstract base class -class HopeFearDecoder { +class HopeFearDecoder +{ public: //iterator methods virtual void reset() = 0; @@ -68,14 +70,14 @@ public: * Calculate hope, fear and model hypotheses **/ virtual void HopeFear( - const std::vector<ValType>& backgroundBleu, - const MiraWeightVector& wv, - HopeFearData* hopeFear - ) = 0; + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ) = 0; /** Max score decoding */ virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) - = 0; + = 0; /** Calculate bleu on training set */ ValType Evaluate(const AvgWeightVector& wv); @@ -86,25 +88,26 @@ protected: /** Gets hope-fear from nbest lists */ -class NbestHopeFearDecoder : public virtual HopeFearDecoder { +class NbestHopeFearDecoder : public virtual HopeFearDecoder +{ public: NbestHopeFearDecoder(const std::vector<std::string>& featureFiles, - const std::vector<std::string>& scoreFiles, - bool streaming, - bool no_shuffle, - bool safe_hope, - Scorer* scorer - ); + const std::vector<std::string>& scoreFiles, + bool streaming, + bool no_shuffle, + bool safe_hope, + Scorer* scorer + ); virtual void reset(); virtual void next(); virtual bool finished(); virtual void HopeFear( - const std::vector<ValType>& backgroundBleu, - const MiraWeightVector& wv, - HopeFearData* hopeFear - ); + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ); virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats); @@ -117,29 +120,30 @@ private: /** Gets hope-fear from hypergraphs */ -class HypergraphHopeFearDecoder : public virtual HopeFearDecoder { +class HypergraphHopeFearDecoder : public virtual HopeFearDecoder +{ public: HypergraphHopeFearDecoder( - const std::string& hypergraphDir, - const std::vector<std::string>& referenceFiles, - size_t num_dense, - bool streaming, - bool no_shuffle, - bool safe_hope, - size_t hg_pruning, - const MiraWeightVector& wv, - Scorer* scorer_ - ); + const std::string& hypergraphDir, + const std::vector<std::string>& referenceFiles, + size_t num_dense, + bool streaming, + bool no_shuffle, + bool safe_hope, + size_t hg_pruning, + const MiraWeightVector& wv, + Scorer* scorer_ + ); virtual void reset(); virtual void next(); virtual bool finished(); virtual void HopeFear( - const std::vector<ValType>& backgroundBleu, - const MiraWeightVector& wv, - HopeFearData* hopeFear - ); + const std::vector<ValType>& backgroundBleu, + const MiraWeightVector& wv, + HopeFearData* hopeFear + ); virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats); diff --git a/mert/HwcmScorer.cpp b/mert/HwcmScorer.cpp index 6aff77def..bb3cd4382 100644 --- a/mert/HwcmScorer.cpp +++ b/mert/HwcmScorer.cpp @@ -55,7 +55,8 @@ void HwcmScorer::setReferenceFiles(const vector<string>& referenceFiles) } -void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc) { +void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc) +{ if (tree->GetLength() > 0) { string head = getHead(tree); @@ -64,8 +65,7 @@ void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) { extractHeadWordChain(*it, history, hwc); } - } - else { + } else { vector<string> new_history(kHwcmOrder); new_history[0] = head; hwc[0][head]++; @@ -85,11 +85,11 @@ void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history } } -string HwcmScorer::getHead(TreePointer tree) { +string HwcmScorer::getHead(TreePointer tree) +{ // assumption (only true for dependency parse: each constituent has a preterminal label, and corresponding terminal is head) // if constituent has multiple preterminals, first one is picked; if it has no preterminals, empty string is returned - for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) - { + for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) { TreePointer child = *it; if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) { diff --git a/mert/Hypergraph.cpp b/mert/Hypergraph.cpp index b7725ead0..a8087acb5 100644 --- a/mert/Hypergraph.cpp +++ b/mert/Hypergraph.cpp @@ -31,18 +31,22 @@ using namespace std; static const string kBOS = "<s>"; static const string kEOS = "</s>"; -namespace MosesTuning { +namespace MosesTuning +{ -StringPiece NextLine(util::FilePiece& from) { +StringPiece NextLine(util::FilePiece& from) +{ StringPiece line; while ((line = from.ReadLine()).starts_with("#")); return line; } -Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){ +Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)) +{ } -const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) { +const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) +{ #if BOOST_VERSION >= 104200 Map::const_iterator i= map_.find(str, Hash(), Equals()); #else @@ -62,7 +66,8 @@ double_conversion::StringToDoubleConverter converter(double_conversion::StringTo /** * Reads an incoming edge. Returns edge and source words covered. **/ -static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) { +static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) +{ Edge* edge = graph.NewEdge(); StringPiece line = from.ReadLine(); //Don't allow comments within edge lists util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| ")); @@ -82,7 +87,7 @@ static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) { edge->AddWord(&found); } } - + //Features ++pipes; for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { @@ -100,17 +105,18 @@ static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) { //Covered words ++pipes; size_t sourceCovered = boost::lexical_cast<size_t>(*pipes); - return pair<Edge*,size_t>(edge,sourceCovered); + return pair<Edge*,size_t>(edge,sourceCovered); } -void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const { +void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const +{ Graph& newGraph = *pNewGraph; //TODO: Optimise case where no pruning required //For debug - - + + /* map<const Edge*, string> edgeIds; for (size_t i = 0; i < edges_.Size(); ++i) { @@ -136,7 +142,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC //Compute backward scores for (size_t vi = 0; vi < vertices_.Size(); ++vi) { - // cerr << "Vertex " << vi << endl; + // cerr << "Vertex " << vi << endl; const Vertex& vertex = vertices_[vi]; const vector<const Edge*>& incoming = vertex.GetIncoming(); if (!incoming.size()) { @@ -150,7 +156,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC //cerr << "\tChild " << incoming[ei]->Children()[i] << endl; size_t childId = incoming[ei]->Children()[i]; UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore, - HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); + HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); outgoing[childId].push_back(incoming[ei]); incomingScore += vertexBackwardScores[childId]; } @@ -172,7 +178,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC } else { for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) { //cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl; - FeatureStatsType outgoingScore = 0; + FeatureStatsType outgoingScore = 0; //add score of head outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]]; //cerr << "Forward score " << outgoingScore << endl; @@ -204,11 +210,11 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC } FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge]; edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge)); - // cerr << edgeIds[edge] << " " << score << endl; + // cerr << edgeIds[edge] << " " << score << endl; } - + multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin(); size_t edgeCount = 1; while(edgeCount < minEdgeCount && ei != edgeScores.rend()) { @@ -235,10 +241,10 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC map<size_t,size_t> oldIdToNew; size_t vi = 0; for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) { - //cerr << *i << " New: " << vi << endl; +// cerr << *i << " New: " << vi << endl; oldIdToNew[*i] = vi; Vertex* vertex = newGraph.NewVertex(); - vertex->SetSourceCovered(vertices_[*i].SourceCovered()); + vertex->SetSourceCovered(vertices_[*i].SourceCovered()); } for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) { @@ -255,6 +261,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC newHead.AddEdge(newEdge); } + /* cerr << "New graph" << endl; for (size_t vi = 0; vi < newGraph.VertexSize(); ++vi) { @@ -274,21 +281,22 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC } cerr << endl; } - */ + */ } /** * Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments) **/ -void ReadGraph(util::FilePiece &from, Graph &graph) { +void ReadGraph(util::FilePiece &from, Graph &graph) +{ //First line should contain field names StringPiece line = from.ReadLine(); UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'"); line = NextLine(from); - + //Then expect numbers of vertices util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' ')); unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i); @@ -303,9 +311,11 @@ void ReadGraph(util::FilePiece &from, Graph &graph) { for (unsigned long int e = 0; e < edge_count; ++e) { pair<Edge*,size_t> edge = ReadEdge(from, graph); vertex->AddEdge(edge.first); - //Note: the file format attaches this to the edge, but it's really a property + //Note: the file format attaches this to the edge, but it's really a property //of the vertex. - if (!e) {vertex->SetSourceCovered(edge.second);} + if (!e) { + vertex->SetSourceCovered(edge.second); + } } } } diff --git a/mert/Hypergraph.h b/mert/Hypergraph.h index b6ee6c3f8..14226fb48 100644 --- a/mert/Hypergraph.h +++ b/mert/Hypergraph.h @@ -37,81 +37,88 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "FeatureStats.h" -namespace MosesTuning { +namespace MosesTuning +{ typedef unsigned int WordIndex; const WordIndex kMaxWordIndex = UINT_MAX; -const FeatureStatsType kMinScore = -std::numeric_limits<FeatureStatsType>::max(); - -template <class T> class FixedAllocator : boost::noncopyable { - public: - FixedAllocator() : current_(NULL), end_(NULL) {} - - void Init(std::size_t count) { - assert(!current_); - array_.reset(new T[count]); - current_ = array_.get(); - end_ = current_ + count; - } - - T &operator[](std::size_t idx) { - return array_.get()[idx]; - } - const T &operator[](std::size_t idx) const { - return array_.get()[idx]; - } - - T *New() { - T *ret = current_++; - UTIL_THROW_IF(ret >= end_, util::Exception, "Allocating past end"); - return ret; - } - - std::size_t Capacity() const { - return end_ - array_.get(); - } - - std::size_t Size() const { - return current_ - array_.get(); - } - - private: - boost::scoped_array<T> array_; - T *current_, *end_; +const FeatureStatsType kMinScore = -1e10; + +template <class T> class FixedAllocator : boost::noncopyable +{ +public: + FixedAllocator() : current_(NULL), end_(NULL) {} + + void Init(std::size_t count) { + assert(!current_); + array_.reset(new T[count]); + current_ = array_.get(); + end_ = current_ + count; + } + + T &operator[](std::size_t idx) { + return array_.get()[idx]; + } + const T &operator[](std::size_t idx) const { + return array_.get()[idx]; + } + + T *New() { + T *ret = current_++; + UTIL_THROW_IF(ret >= end_, util::Exception, "Allocating past end"); + return ret; + } + + std::size_t Capacity() const { + return end_ - array_.get(); + } + + std::size_t Size() const { + return current_ - array_.get(); + } + +private: + boost::scoped_array<T> array_; + T *current_, *end_; }; -class Vocab { - public: - Vocab(); +class Vocab +{ +public: + Vocab(); - typedef std::pair<const char *const, WordIndex> Entry; + typedef std::pair<const char *const, WordIndex> Entry; - const Entry &FindOrAdd(const StringPiece &str); + const Entry &FindOrAdd(const StringPiece &str); - const Entry& Bos() const {return bos_;} + const Entry& Bos() const { + return bos_; + } - const Entry& Eos() const {return eos_;} + const Entry& Eos() const { + return eos_; + } - private: - util::Pool piece_backing_; +private: + util::Pool piece_backing_; - struct Hash : public std::unary_function<const char *, std::size_t> { - std::size_t operator()(StringPiece str) const { - return util::MurmurHashNative(str.data(), str.size()); - } - }; + struct Hash : public std::unary_function<const char *, std::size_t> { + std::size_t operator()(StringPiece str) const { + return util::MurmurHashNative(str.data(), str.size()); + } + }; - struct Equals : public std::binary_function<const char *, const char *, bool> { - bool operator()(StringPiece first, StringPiece second) const { - return first == second; - } - }; + struct Equals : public std::binary_function<const char *, const char *, bool> { + bool operator()(StringPiece first, StringPiece second) const { + return first == second; + } + }; - typedef boost::unordered_map<const char *, WordIndex, Hash, Equals> Map; - Map map_; - Entry eos_; - Entry bos_; + typedef boost::unordered_map<const char *, WordIndex, Hash, Equals> Map; + Map map_; + Entry eos_; + Entry bos_; }; @@ -125,121 +132,141 @@ typedef boost::shared_ptr<SparseVector> FeaturePtr; /** * An edge has 1 head vertex, 0..n child (tail) vertices, a list of words and a feature vector. **/ -class Edge { - public: - Edge() {features_.reset(new SparseVector());} - - void AddWord(const Vocab::Entry *word) { - words_.push_back(word); - } - - void AddChild(size_t child) { - children_.push_back(child); - } - - void AddFeature(const StringPiece& name, FeatureStatsType value) { - //TODO StringPiece interface - features_->set(name.as_string(),value); - } - - - const WordVec &Words() const { - return words_; - } - - const FeaturePtr& Features() const { - return features_; - } - - void SetFeatures(const FeaturePtr& features) { - features_ = features; - } - - const std::vector<size_t>& Children() const { - return children_; - } - - FeatureStatsType GetScore(const SparseVector& weights) const { - return inner_product(*(features_.get()), weights); - } - - private: - // NULL for non-terminals. - std::vector<const Vocab::Entry*> words_; - std::vector<size_t> children_; - boost::shared_ptr<SparseVector> features_; +class Edge +{ +public: + Edge() { + features_.reset(new SparseVector()); + } + + void AddWord(const Vocab::Entry *word) { + words_.push_back(word); + } + + void AddChild(size_t child) { + children_.push_back(child); + } + + void AddFeature(const StringPiece& name, FeatureStatsType value) { + //TODO StringPiece interface + features_->set(name.as_string(),value); + } + + + const WordVec &Words() const { + return words_; + } + + const FeaturePtr& Features() const { + return features_; + } + + void SetFeatures(const FeaturePtr& features) { + features_ = features; + } + + const std::vector<size_t>& Children() const { + return children_; + } + + FeatureStatsType GetScore(const SparseVector& weights) const { + return inner_product(*(features_.get()), weights); + } + +private: + // NULL for non-terminals. + std::vector<const Vocab::Entry*> words_; + std::vector<size_t> children_; + boost::shared_ptr<SparseVector> features_; }; /* * A vertex has 0..n incoming edges **/ -class Vertex { - public: - Vertex() : sourceCovered_(0) {} - - void AddEdge(const Edge* edge) {incoming_.push_back(edge);} - - void SetSourceCovered(size_t sourceCovered) {sourceCovered_ = sourceCovered;} - - const std::vector<const Edge*>& GetIncoming() const {return incoming_;} - - size_t SourceCovered() const {return sourceCovered_;} - - private: - std::vector<const Edge*> incoming_; - size_t sourceCovered_; +class Vertex +{ +public: + Vertex() : sourceCovered_(0) {} + + void AddEdge(const Edge* edge) { + incoming_.push_back(edge); + } + + void SetSourceCovered(size_t sourceCovered) { + sourceCovered_ = sourceCovered; + } + + const std::vector<const Edge*>& GetIncoming() const { + return incoming_; + } + + size_t SourceCovered() const { + return sourceCovered_; + } + +private: + std::vector<const Edge*> incoming_; + size_t sourceCovered_; }; -class Graph : boost::noncopyable { - public: - Graph(Vocab& vocab) : vocab_(vocab) {} - - void SetCounts(std::size_t vertices, std::size_t edges) { - vertices_.Init(vertices); - edges_.Init(edges); - } - - Vocab &MutableVocab() { return vocab_; } - - Edge *NewEdge() { - return edges_.New(); - } - - Vertex *NewVertex() { - return vertices_.New(); - } - - const Vertex &GetVertex(std::size_t index) const { - return vertices_[index]; - } - - Edge &GetEdge(std::size_t index) { - return edges_[index]; - } - - /* Created a pruned copy of this graph with minEdgeCount edges. Uses - the scores in the max-product semiring to rank edges, as suggested by - Colin Cherry */ - void Prune(Graph* newGraph, const SparseVector& weights, size_t minEdgeCount) const; - - std::size_t VertexSize() const { return vertices_.Size(); } - std::size_t EdgeSize() const { return edges_.Size(); } - - bool IsBoundary(const Vocab::Entry* word) const { - return word->second == vocab_.Bos().second || word->second == vocab_.Eos().second; - } - - private: - FixedAllocator<Edge> edges_; - FixedAllocator<Vertex> vertices_; - Vocab& vocab_; +class Graph : boost::noncopyable +{ +public: + Graph(Vocab& vocab) : vocab_(vocab) {} + + void SetCounts(std::size_t vertices, std::size_t edges) { + vertices_.Init(vertices); + edges_.Init(edges); + } + + Vocab &MutableVocab() { + return vocab_; + } + + Edge *NewEdge() { + return edges_.New(); + } + + Vertex *NewVertex() { + return vertices_.New(); + } + + const Vertex &GetVertex(std::size_t index) const { + return vertices_[index]; + } + + Edge &GetEdge(std::size_t index) { + return edges_[index]; + } + + /* Created a pruned copy of this graph with minEdgeCount edges. Uses + the scores in the max-product semiring to rank edges, as suggested by + Colin Cherry */ + void Prune(Graph* newGraph, const SparseVector& weights, size_t minEdgeCount) const; + + std::size_t VertexSize() const { + return vertices_.Size(); + } + std::size_t EdgeSize() const { + return edges_.Size(); + } + + bool IsBoundary(const Vocab::Entry* word) const { + return word->second == vocab_.Bos().second || word->second == vocab_.Eos().second; + } + +private: + FixedAllocator<Edge> edges_; + FixedAllocator<Vertex> vertices_; + Vocab& vocab_; }; -class HypergraphException : public util::Exception { - public: - HypergraphException() {} - ~HypergraphException() throw() {} +class HypergraphException : public util::Exception +{ +public: + HypergraphException() {} + ~HypergraphException() throw() {} }; diff --git a/mert/HypergraphTest.cpp b/mert/HypergraphTest.cpp index 345a445f0..0dc1c04c6 100644 --- a/mert/HypergraphTest.cpp +++ b/mert/HypergraphTest.cpp @@ -8,12 +8,12 @@ using namespace std; using namespace MosesTuning; -BOOST_AUTO_TEST_CASE(prune) +BOOST_AUTO_TEST_CASE(prune) { Vocab vocab; WordVec words; string wordStrings[] = - {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; + {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; for (size_t i = 0; i < 13; ++i) { words.push_back(&(vocab.FindOrAdd((wordStrings[i])))); } @@ -105,7 +105,7 @@ BOOST_AUTO_TEST_CASE(prune) BOOST_CHECK_EQUAL(5, pruned.EdgeSize()); BOOST_CHECK_EQUAL(4, pruned.VertexSize()); - + //edges retained should be best path (<s> ab jk </s>) and hi BOOST_CHECK_EQUAL(1, pruned.GetVertex(0).GetIncoming().size()); BOOST_CHECK_EQUAL(2, pruned.GetVertex(1).GetIncoming().size()); @@ -115,37 +115,37 @@ BOOST_AUTO_TEST_CASE(prune) const Edge* edge; edge = pruned.GetVertex(0).GetIncoming()[0]; - BOOST_CHECK_EQUAL(1, edge->Words().size()); - BOOST_CHECK_EQUAL(words[0], edge->Words()[0]); + BOOST_CHECK_EQUAL(1, edge->Words().size()); + BOOST_CHECK_EQUAL(words[0], edge->Words()[0]); edge = pruned.GetVertex(1).GetIncoming()[0]; - BOOST_CHECK_EQUAL(3, edge->Words().size()); - BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); - BOOST_CHECK_EQUAL(words[2]->first, edge->Words()[1]->first); - BOOST_CHECK_EQUAL(words[3]->first, edge->Words()[2]->first); + BOOST_CHECK_EQUAL(3, edge->Words().size()); + BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); + BOOST_CHECK_EQUAL(words[2]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[3]->first, edge->Words()[2]->first); edge = pruned.GetVertex(1).GetIncoming()[1]; BOOST_CHECK_EQUAL(3, edge->Words().size()); BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); - BOOST_CHECK_EQUAL(words[9]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[9]->first, edge->Words()[1]->first); BOOST_CHECK_EQUAL(words[10]->first, edge->Words()[2]->first); edge = pruned.GetVertex(2).GetIncoming()[0]; BOOST_CHECK_EQUAL(3, edge->Words().size()); BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); - BOOST_CHECK_EQUAL(words[11]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[11]->first, edge->Words()[1]->first); BOOST_CHECK_EQUAL(words[12]->first, edge->Words()[2]->first); edge = pruned.GetVertex(3).GetIncoming()[0]; BOOST_CHECK_EQUAL(2, edge->Words().size()); BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]); - BOOST_CHECK_EQUAL(words[1]->first, edge->Words()[1]->first); + BOOST_CHECK_EQUAL(words[1]->first, edge->Words()[1]->first); + + - +// BOOST_CHECK_EQUAL(words[0], pruned.GetVertex(0).GetIncoming()[0].Words()[0]); -// BOOST_CHECK_EQUAL(words[0], pruned.GetVertex(0).GetIncoming()[0].Words()[0]); - } diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index ea4240472..b8ec3a855 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -174,19 +174,19 @@ float InterpolatedScorer::calculateScore(const std::vector<ScoreStatsType>& tota float InterpolatedScorer::getReferenceLength(const std::vector<ScoreStatsType>& totals) const { - size_t scorerNum = 0; - size_t last = 0; - float refLen = 0; - for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin(); - itsc != m_scorers.end(); ++itsc) { - int numScoresScorer = (*itsc)->NumberOfScores(); - std::vector<ScoreStatsType> totals_scorer(totals.begin()+last, totals.begin()+last+numScoresScorer); - refLen += (*itsc)->getReferenceLength(totals_scorer) * m_scorer_weights[scorerNum]; - last += numScoresScorer; - scorerNum++; - } - return refLen; + size_t scorerNum = 0; + size_t last = 0; + float refLen = 0; + for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin(); + itsc != m_scorers.end(); ++itsc) { + int numScoresScorer = (*itsc)->NumberOfScores(); + std::vector<ScoreStatsType> totals_scorer(totals.begin()+last, totals.begin()+last+numScoresScorer); + refLen += (*itsc)->getReferenceLength(totals_scorer) * m_scorer_weights[scorerNum]; + last += numScoresScorer; + scorerNum++; } + return refLen; +} void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles) { diff --git a/mert/MeteorScorer.cpp b/mert/MeteorScorer.cpp index 914fd02d4..3a7eb6ab7 100644 --- a/mert/MeteorScorer.cpp +++ b/mert/MeteorScorer.cpp @@ -34,7 +34,8 @@ namespace MosesTuning #define CHILD_STDOUT_WRITE pipefds_output[1] MeteorScorer::MeteorScorer(const string& config) - : StatisticsBasedScorer("METEOR",config) { + : StatisticsBasedScorer("METEOR",config) +{ meteor_jar = getConfig("jar", ""); meteor_lang = getConfig("lang", "en"); meteor_task = getConfig("task", "tune"); @@ -88,7 +89,8 @@ MeteorScorer::MeteorScorer(const string& config) m_from_meteor = new ifdstream(CHILD_STDOUT_READ); } -MeteorScorer::~MeteorScorer() { +MeteorScorer::~MeteorScorer() +{ // Cleanup IO delete m_to_meteor; delete m_from_meteor; @@ -171,7 +173,8 @@ float MeteorScorer::calculateScore(const vector<ScoreStatsType>& comps) const // Meteor unsupported, throw error if used MeteorScorer::MeteorScorer(const string& config) - : StatisticsBasedScorer("METEOR",config) { + : StatisticsBasedScorer("METEOR",config) +{ throw runtime_error("Meteor unsupported, requires GLIBCXX"); } diff --git a/mert/MeteorScorer.h b/mert/MeteorScorer.h index 9c3657018..31b05ec72 100644 --- a/mert/MeteorScorer.h +++ b/mert/MeteorScorer.h @@ -20,7 +20,7 @@ class ifdstream; class ScoreStats; /** - * Meteor scoring + * Meteor scoring * * https://github.com/mjdenkowski/meteor * http://statmt.org/wmt11/pdf/WMT07.pdf diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp index 347ad488e..ad3588339 100644 --- a/mert/MiraFeatureVector.cpp +++ b/mert/MiraFeatureVector.cpp @@ -9,7 +9,8 @@ namespace MosesTuning { -void MiraFeatureVector::InitSparse(const SparseVector& sparse, size_t ignoreLimit) { +void MiraFeatureVector::InitSparse(const SparseVector& sparse, size_t ignoreLimit) +{ vector<size_t> sparseFeats = sparse.feats(); bool bFirst = true; size_t lastFeat = 0; @@ -40,7 +41,8 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) InitSparse(vec.sparse); } -MiraFeatureVector::MiraFeatureVector(const SparseVector& sparse, size_t num_dense) { +MiraFeatureVector::MiraFeatureVector(const SparseVector& sparse, size_t num_dense) +{ m_dense.resize(num_dense); //Assume that features with id [0,num_dense) are the dense features for (size_t id = 0; id < num_dense; ++id) { @@ -162,7 +164,8 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& return MiraFeatureVector(dense,sparseFeats,sparseVals); } -bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b) { +bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b) +{ ValType eps = 1e-8; //dense features if (a.m_dense.size() != b.m_dense.size()) return false; diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp index c8a1ca774..eba9617c8 100644 --- a/mert/MiraWeightVector.cpp +++ b/mert/MiraWeightVector.cpp @@ -93,7 +93,8 @@ void MiraWeightVector::update(size_t index, ValType delta) m_lastUpdated[index] = m_numUpdates; } -void MiraWeightVector::ToSparse(SparseVector* sparse) const { +void MiraWeightVector::ToSparse(SparseVector* sparse) const +{ for (size_t i = 0; i < m_weights.size(); ++i) { if(abs(m_weights[i])>1e-8) { sparse->set(i,m_weights[i]); @@ -171,7 +172,8 @@ size_t AvgWeightVector::size() const return m_wv.m_weights.size(); } -void AvgWeightVector::ToSparse(SparseVector* sparse) const { +void AvgWeightVector::ToSparse(SparseVector* sparse) const +{ for (size_t i = 0; i < size(); ++i) { ValType w = weight(i); if(abs(w)>1e-8) { diff --git a/mert/Optimizer.cpp b/mert/Optimizer.cpp index 3f5aa48a6..5da32363f 100644 --- a/mert/Optimizer.cpp +++ b/mert/Optimizer.cpp @@ -168,8 +168,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, // The rightmost bestindex is the one with the highest slope. // They should be equal but there might be. - UTIL_THROW_IF(abs(leftmost->first-gradient.rbegin()->first) >= 0.0001, - util::Exception, "Error"); + UTIL_THROW_IF(abs(leftmost->first-gradient.rbegin()->first) >= 0.0001, + util::Exception, "Error"); // A small difference due to rounding error break; } @@ -191,8 +191,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, if (tit == previnserted) { // The threshold is the same as before can happen if 2 candidates are the same for example. UTIL_THROW_IF(previnserted->second.back().first != newd.first, - util::Exception, - "Error"); + util::Exception, + "Error"); previnserted->second.back()=newd; // just replace the 1 best for sentence S // previnsert doesn't change } else { @@ -207,8 +207,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, // We append the diffs in previnsert to tit before destroying previnsert. tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end()); UTIL_THROW_IF(tit->second.back().first != newd.first, - util::Exception, - "Error"); + util::Exception, + "Error"); tit->second.back()=newd; // change diff for sentence S thresholdmap.erase(previnserted); // erase old previnsert previnserted = tit; // point previnsert to the new threshold @@ -216,8 +216,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, } UTIL_THROW_IF(previnserted == thresholdmap.end(), - util::Exception, - "Error"); + util::Exception, + "Error"); } else { //normal insertion process previnserted = AddThreshold(thresholdmap, leftmostx, newd); } @@ -254,8 +254,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, // We skipped the first el of thresholdlist but GetIncStatScore return 1 more for first1best. UTIL_THROW_IF(scores.size() != thresholdmap.size(), - util::Exception, - "Error"); + util::Exception, + "Error"); for (unsigned int sc = 0; sc != scores.size(); sc++) { //cerr << "x=" << thrit->first << " => " << scores[sc] << endl; diff --git a/mert/Point.cpp b/mert/Point.cpp index 1db59ce66..55dc6a6b2 100644 --- a/mert/Point.cpp +++ b/mert/Point.cpp @@ -40,8 +40,8 @@ Point::Point(const vector<parameter_t>& init, m_max[i] = max[i]; } } else { - UTIL_THROW_IF(init.size() != m_pdim, util::Exception, "Error"); - UTIL_THROW_IF(m_opt_indices.size() != Point::m_dim, util::Exception, "Error"); + UTIL_THROW_IF(init.size() != m_pdim, util::Exception, "Error"); + UTIL_THROW_IF(m_opt_indices.size() != Point::m_dim, util::Exception, "Error"); for (unsigned int i = 0; i < Point::m_dim; i++) { operator[](i) = init[m_opt_indices[i]]; m_min[i] = min[m_opt_indices[i]]; diff --git a/mert/PreProcessFilter.cpp b/mert/PreProcessFilter.cpp index a36ed6155..7a3add789 100644 --- a/mert/PreProcessFilter.cpp +++ b/mert/PreProcessFilter.cpp @@ -35,7 +35,7 @@ PreProcessFilter::PreProcessFilter(const string& filterCommand) m_fromFilter(NULL) { #if defined __MINGW32__ - //TODO(jie): replace this function with boost implementation + //TODO(jie): replace this function with boost implementation #else // Child error signal install // sigaction is the replacement for the traditional signal() method diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp index ed3ff2458..ffaf03be4 100644 --- a/mert/Scorer.cpp +++ b/mert/Scorer.cpp @@ -25,9 +25,9 @@ const int kUnknownToken = -1; Scorer::Scorer(const string& name, const string& config) : m_name(name), m_vocab(mert::VocabularyFactory::GetVocabulary()), - #if defined(__GLIBCXX__) || defined(__GLIBCPP__) +#if defined(__GLIBCXX__) || defined(__GLIBCPP__) m_filter(NULL), - #endif +#endif m_score_data(NULL), m_enable_preserve_case(true) { diff --git a/mert/StatisticsBasedScorer.h b/mert/StatisticsBasedScorer.h index f1c77e0ba..ba45634cc 100644 --- a/mert/StatisticsBasedScorer.h +++ b/mert/StatisticsBasedScorer.h @@ -23,7 +23,7 @@ namespace MosesTuning */ class StatisticsBasedScorer : public Scorer { -friend class HopeFearDecoder; + friend class HopeFearDecoder; public: StatisticsBasedScorer(const std::string& name, const std::string& config); diff --git a/mert/TER/alignmentStruct.cpp b/mert/TER/alignmentStruct.cpp index 544ee61ac..e42ec4a14 100644 --- a/mert/TER/alignmentStruct.cpp +++ b/mert/TER/alignmentStruct.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -23,15 +23,15 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA using namespace std; namespace TERCpp { - string alignmentStruct::toString() - { - stringstream s; +string alignmentStruct::toString() +{ + stringstream s; // s << "nword : " << vectorToString(nwords)<<endl; // s << "alignment" << vectorToString(alignment)<<endl; // s << "afterShift" << vectorToString(alignment)<<endl; - s << "Nothing to be printed" <<endl; - return s.str(); - } + s << "Nothing to be printed" <<endl; + return s.str(); +} // alignmentStruct::alignmentStruct() // { @@ -99,7 +99,7 @@ namespace TERCpp // return s.str(); // } - /* The distance of the shift. */ +/* The distance of the shift. */ // int alignmentStruct::distance() // { // if (moveto < start) diff --git a/mert/TER/alignmentStruct.h b/mert/TER/alignmentStruct.h index adda2c345..c1459960b 100644 --- a/mert/TER/alignmentStruct.h +++ b/mert/TER/alignmentStruct.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -34,10 +34,10 @@ using namespace Tools; namespace TERCpp { - class alignmentStruct - { - private: - public: +class alignmentStruct +{ +private: +public: // alignmentStruct(); // alignmentStruct (int _start, int _end, int _moveto, int _newloc); @@ -53,14 +53,14 @@ namespace TERCpp // int end; // int moveto; // int newloc; - vector<string> nwords; // The words we shifted - vector<char> alignment ; // for pra_more output - vector<vecInt> aftershift; // for pra_more output - // This is used to store the cost of a shift, so we don't have to - // calculate it multiple times. - double cost; - string toString(); - }; + vector<string> nwords; // The words we shifted + vector<char> alignment ; // for pra_more output + vector<vecInt> aftershift; // for pra_more output + // This is used to store the cost of a shift, so we don't have to + // calculate it multiple times. + double cost; + string toString(); +}; } #endif
\ No newline at end of file diff --git a/mert/TER/bestShiftStruct.h b/mert/TER/bestShiftStruct.h index 9457fd1d8..d68f2319f 100644 --- a/mert/TER/bestShiftStruct.h +++ b/mert/TER/bestShiftStruct.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -36,10 +36,10 @@ using namespace Tools; namespace TERCpp { - class bestShiftStruct - { - private: - public: +class bestShiftStruct +{ +private: +public: // alignmentStruct(); // alignmentStruct (int _start, int _end, int _moveto, int _newloc); @@ -55,16 +55,16 @@ namespace TERCpp // int end; // int moveto; // int newloc; - terShift m_best_shift; - terAlignment m_best_align; - bool m_empty; + terShift m_best_shift; + terAlignment m_best_align; + bool m_empty; // vector<string> nwords; // The words we shifted // char* alignment ; // for pra_more output // vector<vecInt> aftershift; // for pra_more output - // This is used to store the cost of a shift, so we don't have to - // calculate it multiple times. + // This is used to store the cost of a shift, so we don't have to + // calculate it multiple times. // double cost; - }; +}; } #endif
\ No newline at end of file diff --git a/mert/TER/hashMap.cpp b/mert/TER/hashMap.cpp index de84ff796..253fda715 100644 --- a/mert/TER/hashMap.cpp +++ b/mert/TER/hashMap.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -28,156 +28,142 @@ using namespace std; namespace HashMapSpace { // hashMap::hashMap(); - /* hashMap::~hashMap() - { - // vector<stringHasher>::const_iterator del = m_hasher.begin(); - for ( vector<stringHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ ) - { - delete(*del); - } - }*/ - /** - * int hashMap::trouve ( long searchKey ) - * @param searchKey - * @return - */ - int hashMap::trouve ( long searchKey ) +/* hashMap::~hashMap() { - long foundKey; +// vector<stringHasher>::const_iterator del = m_hasher.begin(); + for ( vector<stringHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ ) + { + delete(*del); + } + }*/ +/** + * int hashMap::trouve ( long searchKey ) + * @param searchKey + * @return + */ +int hashMap::trouve ( long searchKey ) +{ + long foundKey; // vector<stringHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return 1; - } - } - return 0; + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return 1; } - int hashMap::trouve ( string key ) - { - long searchKey=hashValue ( key ); - long foundKey;; + } + return 0; +} +int hashMap::trouve ( string key ) +{ + long searchKey=hashValue ( key ); + long foundKey;; // vector<stringHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return 1; - } - } - return 0; + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return 1; } - /** - * long hashMap::hashValue ( string key ) - * @param key - * @return - */ - long hashMap::hashValue ( string key ) - { - locale loc; // the "C" locale - const collate<char>& coll = use_facet<collate<char> >(loc); - return coll.hash(key.data(),key.data()+key.length()); + } + return 0; +} +/** + * long hashMap::hashValue ( string key ) + * @param key + * @return + */ +long hashMap::hashValue ( string key ) +{ + locale loc; // the "C" locale + const collate<char>& coll = use_facet<collate<char> >(loc); + return coll.hash(key.data(),key.data()+key.length()); // boost::hash<string> hasher; // return hasher ( key ); - } - /** - * void hashMap::addHasher ( string key, string value ) - * @param key - * @param value - */ - void hashMap::addHasher ( string key, string value ) - { - if ( trouve ( hashValue ( key ) ) ==0 ) - { +} +/** + * void hashMap::addHasher ( string key, string value ) + * @param key + * @param value + */ +void hashMap::addHasher ( string key, string value ) +{ + if ( trouve ( hashValue ( key ) ) ==0 ) { // cerr << "ICI1" <<endl; - stringHasher H ( hashValue ( key ),key,value ); + stringHasher H ( hashValue ( key ),key,value ); // cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl; // cerr << "ICI2" <<endl; - m_hasher.push_back ( H ); - } - } - stringHasher hashMap::getHasher ( string key ) - { - long searchKey=hashValue ( key ); - long foundKey; - stringHasher defaut(0,"",""); + m_hasher.push_back ( H ); + } +} +stringHasher hashMap::getHasher ( string key ) +{ + long searchKey=hashValue ( key ); + long foundKey; + stringHasher defaut(0,"",""); // vector<stringHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return ( *l_hasher ); - } - } - return defaut; + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return ( *l_hasher ); } - string hashMap::getValue ( string key ) - { - long searchKey=hashValue ( key ); - long foundKey; + } + return defaut; +} +string hashMap::getValue ( string key ) +{ + long searchKey=hashValue ( key ); + long foundKey; // vector<stringHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { // cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl; - return ( *l_hasher ).getValue(); - } - } - return ""; + return ( *l_hasher ).getValue(); } - string hashMap::searchValue ( string value ) - { + } + return ""; +} +string hashMap::searchValue ( string value ) +{ // long searchKey=hashValue ( key ); // long foundKey; - string foundValue; + string foundValue; // vector<stringHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundValue= ( *l_hasher ).getValue(); - if ( foundValue.compare ( value ) == 0 ) - { - return ( *l_hasher ).getKey(); - } - } - return ""; + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundValue= ( *l_hasher ).getValue(); + if ( foundValue.compare ( value ) == 0 ) { + return ( *l_hasher ).getKey(); } + } + return ""; +} - void hashMap::setValue ( string key , string value ) - { - long searchKey=hashValue ( key ); - long foundKey; +void hashMap::setValue ( string key , string value ) +{ + long searchKey=hashValue ( key ); + long foundKey; // vector<stringHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - ( *l_hasher ).setValue ( value ); + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + ( *l_hasher ).setValue ( value ); // return ( *l_hasher ).getValue(); - } - } } + } +} - /** - * - */ - void hashMap::printHash() - { - for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; - } - } +/** + * + */ +void hashMap::printHash() +{ + for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; + } +} diff --git a/mert/TER/hashMap.h b/mert/TER/hashMap.h index 6cb721573..c2708b360 100644 --- a/mert/TER/hashMap.h +++ b/mert/TER/hashMap.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -35,27 +35,27 @@ using namespace std; namespace HashMapSpace { - class hashMap - { - private: - vector<stringHasher> m_hasher; +class hashMap +{ +private: + vector<stringHasher> m_hasher; - public: +public: // ~hashMap(); - long hashValue ( string key ); - int trouve ( long searchKey ); - int trouve ( string key ); - void addHasher ( string key, string value ); - stringHasher getHasher ( string key ); - string getValue ( string key ); - string searchValue ( string key ); - void setValue ( string key , string value ); - void printHash(); - vector<stringHasher> getHashMap(); - string printStringHash(); - string printStringHash2(); - string printStringHashForLexicon(); - }; + long hashValue ( string key ); + int trouve ( long searchKey ); + int trouve ( string key ); + void addHasher ( string key, string value ); + stringHasher getHasher ( string key ); + string getValue ( string key ); + string searchValue ( string key ); + void setValue ( string key , string value ); + void printHash(); + vector<stringHasher> getHashMap(); + string printStringHash(); + string printStringHash2(); + string printStringHashForLexicon(); +}; } diff --git a/mert/TER/hashMapInfos.cpp b/mert/TER/hashMapInfos.cpp index 23f57d808..0ab6d21b2 100644 --- a/mert/TER/hashMapInfos.cpp +++ b/mert/TER/hashMapInfos.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -28,117 +28,108 @@ using namespace std; namespace HashMapSpace { // hashMapInfos::hashMap(); - /* hashMapInfos::~hashMap() - { - // vector<infosHasher>::const_iterator del = m_hasher.begin(); - for ( vector<infosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ ) - { - delete(*del); - } - }*/ - /** - * int hashMapInfos::trouve ( long searchKey ) - * @param searchKey - * @return - */ - int hashMapInfos::trouve ( long searchKey ) +/* hashMapInfos::~hashMap() { - long foundKey; +// vector<infosHasher>::const_iterator del = m_hasher.begin(); + for ( vector<infosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ ) + { + delete(*del); + } + }*/ +/** + * int hashMapInfos::trouve ( long searchKey ) + * @param searchKey + * @return + */ +int hashMapInfos::trouve ( long searchKey ) +{ + long foundKey; // vector<infosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return 1; - } - } - return 0; + for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return 1; } - int hashMapInfos::trouve ( string key ) - { - long searchKey=hashValue ( key ); - long foundKey;; + } + return 0; +} +int hashMapInfos::trouve ( string key ) +{ + long searchKey=hashValue ( key ); + long foundKey;; // vector<infosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return 1; - } - } - return 0; + for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return 1; } + } + return 0; +} - /** - * long hashMapInfos::hashValue ( string key ) - * @param key - * @return - */ - long hashMapInfos::hashValue ( string key ) - { - locale loc; // the "C" locale - const collate<char>& coll = use_facet<collate<char> >(loc); - return coll.hash(key.data(),key.data()+key.length()); +/** + * long hashMapInfos::hashValue ( string key ) + * @param key + * @return + */ +long hashMapInfos::hashValue ( string key ) +{ + locale loc; // the "C" locale + const collate<char>& coll = use_facet<collate<char> >(loc); + return coll.hash(key.data(),key.data()+key.length()); // boost::hash<string> hasher; // return hasher ( key ); - } - /** - * void hashMapInfos::addHasher ( string key, string value ) - * @param key - * @param value - */ - void hashMapInfos::addHasher ( string key, vector<int> value ) - { - if ( trouve ( hashValue ( key ) ) ==0 ) - { +} +/** + * void hashMapInfos::addHasher ( string key, string value ) + * @param key + * @param value + */ +void hashMapInfos::addHasher ( string key, vector<int> value ) +{ + if ( trouve ( hashValue ( key ) ) ==0 ) { // cerr << "ICI1" <<endl; - infosHasher H ( hashValue ( key ),key,value ); + infosHasher H ( hashValue ( key ),key,value ); // cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl; // cerr << "ICI2" <<endl; - m_hasher.push_back ( H ); - } - } - void hashMapInfos::addValue ( string key, vector<int> value ) - { - addHasher ( key, value ); - } - infosHasher hashMapInfos::getHasher ( string key ) - { - long searchKey=hashValue ( key ); - long foundKey; + m_hasher.push_back ( H ); + } +} +void hashMapInfos::addValue ( string key, vector<int> value ) +{ + addHasher ( key, value ); +} +infosHasher hashMapInfos::getHasher ( string key ) +{ + long searchKey=hashValue ( key ); + long foundKey; // vector<infosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return ( *l_hasher ); - } - } - vector<int> temp; - infosHasher defaut(0,"",temp); - return defaut; + for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return ( *l_hasher ); } - vector<int> hashMapInfos::getValue ( string key ) - { - long searchKey=hashValue ( key ); - long foundKey; - vector<int> retour; + } + vector<int> temp; + infosHasher defaut(0,"",temp); + return defaut; +} +vector<int> hashMapInfos::getValue ( string key ) +{ + long searchKey=hashValue ( key ); + long foundKey; + vector<int> retour; // vector<infosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { + for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { // cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl; - return ( *l_hasher ).getValue(); - } - } - return retour; + return ( *l_hasher ).getValue(); } + } + return retour; +} // string hashMapInfos::searchValue ( string value ) // { // // long searchKey=hashValue ( key ); @@ -158,42 +149,38 @@ namespace HashMapSpace // } // - void hashMapInfos::setValue ( string key , vector<int> value ) - { - long searchKey=hashValue ( key ); - long foundKey; +void hashMapInfos::setValue ( string key , vector<int> value ) +{ + long searchKey=hashValue ( key ); + long foundKey; // vector<infosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { - foundKey= ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - ( *l_hasher ).setValue ( value ); + for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { + foundKey= ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + ( *l_hasher ).setValue ( value ); // return ( *l_hasher ).getValue(); - } - } - } - string hashMapInfos::toString () - { - stringstream to_return; - for ( vector<infosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - to_return << (*l_hasher).toString(); - // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; - } - return to_return.str(); } + } +} +string hashMapInfos::toString () +{ + stringstream to_return; + for ( vector<infosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + to_return << (*l_hasher).toString(); + // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; + } + return to_return.str(); +} - /** - * - */ - void hashMapInfos::printHash() - { - for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - { +/** + * + */ +void hashMapInfos::printHash() +{ + for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) { // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; - } - } + } +} diff --git a/mert/TER/hashMapInfos.h b/mert/TER/hashMapInfos.h index 5e7dbb6e7..e975aa738 100644 --- a/mert/TER/hashMapInfos.h +++ b/mert/TER/hashMapInfos.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -34,29 +34,29 @@ using namespace std; namespace HashMapSpace { - class hashMapInfos - { - private: - vector<infosHasher> m_hasher; +class hashMapInfos +{ +private: + vector<infosHasher> m_hasher; - public: +public: // ~hashMap(); - long hashValue ( string key ); - int trouve ( long searchKey ); - int trouve ( string key ); - void addHasher ( string key, vector<int> value ); - void addValue ( string key, vector<int> value ); - infosHasher getHasher ( string key ); - vector<int> getValue ( string key ); + long hashValue ( string key ); + int trouve ( long searchKey ); + int trouve ( string key ); + void addHasher ( string key, vector<int> value ); + void addValue ( string key, vector<int> value ); + infosHasher getHasher ( string key ); + vector<int> getValue ( string key ); // string searchValue ( string key ); - void setValue ( string key , vector<int> value ); - void printHash(); - string toString(); - vector<infosHasher> getHashMap(); - string printStringHash(); - string printStringHash2(); - string printStringHashForLexicon(); - }; + void setValue ( string key , vector<int> value ); + void printHash(); + string toString(); + vector<infosHasher> getHashMap(); + string printStringHash(); + string printStringHash2(); + string printStringHashForLexicon(); +}; } diff --git a/mert/TER/hashMapStringInfos.cpp b/mert/TER/hashMapStringInfos.cpp index 773c148d4..d984bdadc 100644 --- a/mert/TER/hashMapStringInfos.cpp +++ b/mert/TER/hashMapStringInfos.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -27,179 +27,166 @@ using namespace std; namespace HashMapSpace { - // hashMapStringInfos::hashMap(); - /* hashMapStringInfos::~hashMap() - { - // vector<stringInfosHasher>::const_iterator del = m_hasher.begin(); - for ( vector<stringInfosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ ) - { - delete(*del); - } - }*/ - /** - * int hashMapStringInfos::trouve ( long searchKey ) - * @param searchKey - * @return - */ - int hashMapStringInfos::trouve ( long searchKey ) - { - long foundKey; - // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - foundKey = ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return 1; - } - } - return 0; +// hashMapStringInfos::hashMap(); +/* hashMapStringInfos::~hashMap() +{ +// vector<stringInfosHasher>::const_iterator del = m_hasher.begin(); + for ( vector<stringInfosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ ) + { + delete(*del); + } +}*/ +/** +* int hashMapStringInfos::trouve ( long searchKey ) +* @param searchKey +* @return +*/ +int hashMapStringInfos::trouve ( long searchKey ) +{ + long foundKey; + // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + foundKey = ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return 1; } + } + return 0; +} - int hashMapStringInfos::trouve ( string key ) - { - long searchKey = hashValue ( key ); - long foundKey;; - // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - foundKey = ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return 1; - } - } - return 0; +int hashMapStringInfos::trouve ( string key ) +{ + long searchKey = hashValue ( key ); + long foundKey;; + // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + foundKey = ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return 1; } + } + return 0; +} - /** - * long hashMapStringInfos::hashValue ( string key ) - * @param key - * @return - */ - long hashMapStringInfos::hashValue ( string key ) - { - locale loc; // the "C" locale - const collate<char>& coll = use_facet<collate<char> > ( loc ); - return coll.hash ( key.data(), key.data() + key.length() ); +/** +* long hashMapStringInfos::hashValue ( string key ) +* @param key +* @return +*/ +long hashMapStringInfos::hashValue ( string key ) +{ + locale loc; // the "C" locale + const collate<char>& coll = use_facet<collate<char> > ( loc ); + return coll.hash ( key.data(), key.data() + key.length() ); // boost::hash<string> hasher; // return hasher ( key ); +} +/** +* void hashMapStringInfos::addHasher ( string key, string value ) +* @param key +* @param value +*/ +void hashMapStringInfos::addHasher ( string key, vector<string> value ) +{ + if ( trouve ( hashValue ( key ) ) == 0 ) { + // cerr << "ICI1" <<endl; + stringInfosHasher H ( hashValue ( key ), key, value ); + // cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl; + // cerr << "ICI2" <<endl; + + m_hasher.push_back ( H ); + } +} +void hashMapStringInfos::addValue ( string key, vector<string> value ) +{ + addHasher ( key, value ); +} +stringInfosHasher hashMapStringInfos::getHasher ( string key ) +{ + long searchKey = hashValue ( key ); + long foundKey; + // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + foundKey = ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + return ( *l_hasher ); } - /** - * void hashMapStringInfos::addHasher ( string key, string value ) - * @param key - * @param value - */ - void hashMapStringInfos::addHasher ( string key, vector<string> value ) - { - if ( trouve ( hashValue ( key ) ) == 0 ) - { - // cerr << "ICI1" <<endl; - stringInfosHasher H ( hashValue ( key ), key, value ); - // cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl; - // cerr << "ICI2" <<endl; - - m_hasher.push_back ( H ); - } - } - void hashMapStringInfos::addValue ( string key, vector<string> value ) - { - addHasher ( key, value ); - } - stringInfosHasher hashMapStringInfos::getHasher ( string key ) - { - long searchKey = hashValue ( key ); - long foundKey; - // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - foundKey = ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - return ( *l_hasher ); - } - } - vector<string> tmp; - stringInfosHasher defaut ( 0, "", tmp ); - return defaut; - } - vector<string> hashMapStringInfos::getValue ( string key ) - { - long searchKey = hashValue ( key ); - long foundKey; - vector<string> retour; - // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - foundKey = ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - // cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl; - return ( *l_hasher ).getValue(); - } - } - return retour; + } + vector<string> tmp; + stringInfosHasher defaut ( 0, "", tmp ); + return defaut; +} +vector<string> hashMapStringInfos::getValue ( string key ) +{ + long searchKey = hashValue ( key ); + long foundKey; + vector<string> retour; + // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + foundKey = ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + // cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl; + return ( *l_hasher ).getValue(); } - // string hashMapStringInfos::searchValue ( string value ) - // { - // // long searchKey=hashValue ( key ); - // // long foundKey; - // vector<int> foundValue; - // - // // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); - // for ( vector<stringInfosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) - // { - // foundValue= ( *l_hasher ).getValue(); - // /* if ( foundValue.compare ( value ) == 0 ) - // { - // return ( *l_hasher ).getKey(); - // }*/ - // } - // return ""; - // } - // - - void hashMapStringInfos::setValue ( string key , vector<string> value ) - { - long searchKey = hashValue ( key ); - long foundKey; - // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - foundKey = ( *l_hasher ).getHashKey(); - if ( searchKey == foundKey ) - { - ( *l_hasher ).setValue ( value ); - // return ( *l_hasher ).getValue(); - } - } + } + return retour; +} +// string hashMapStringInfos::searchValue ( string value ) +// { +// // long searchKey=hashValue ( key ); +// // long foundKey; +// vector<int> foundValue; +// +// // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); +// for ( vector<stringInfosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) +// { +// foundValue= ( *l_hasher ).getValue(); +// /* if ( foundValue.compare ( value ) == 0 ) +// { +// return ( *l_hasher ).getKey(); +// }*/ +// } +// return ""; +// } +// + +void hashMapStringInfos::setValue ( string key , vector<string> value ) +{ + long searchKey = hashValue ( key ); + long foundKey; + // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin(); + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + foundKey = ( *l_hasher ).getHashKey(); + if ( searchKey == foundKey ) { + ( *l_hasher ).setValue ( value ); + // return ( *l_hasher ).getValue(); } + } +} - string hashMapStringInfos::toString () - { - stringstream to_return; - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - to_return << (*l_hasher).toString(); - // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; - } - return to_return.str(); - } +string hashMapStringInfos::toString () +{ + stringstream to_return; + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + to_return << (*l_hasher).toString(); + // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; + } + return to_return.str(); +} - /** - * - */ - void hashMapStringInfos::printHash() - { - for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) - { - // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; - } - } - vector< stringInfosHasher > hashMapStringInfos::getHashMap() - { - return m_hasher; - } +/** +* +*/ +void hashMapStringInfos::printHash() +{ + for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) { + // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl; + } +} +vector< stringInfosHasher > hashMapStringInfos::getHashMap() +{ + return m_hasher; +} diff --git a/mert/TER/hashMapStringInfos.h b/mert/TER/hashMapStringInfos.h index 5337d50f2..a0eae951d 100644 --- a/mert/TER/hashMapStringInfos.h +++ b/mert/TER/hashMapStringInfos.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -34,29 +34,29 @@ using namespace std; namespace HashMapSpace { - class hashMapStringInfos - { - private: - vector<stringInfosHasher> m_hasher; +class hashMapStringInfos +{ +private: + vector<stringInfosHasher> m_hasher; - public: +public: // ~hashMap(); - long hashValue ( string key ); - int trouve ( long searchKey ); - int trouve ( string key ); - void addHasher ( string key, vector<string> value ); - void addValue ( string key, vector<string> value ); - stringInfosHasher getHasher ( string key ); - vector<string> getValue ( string key ); + long hashValue ( string key ); + int trouve ( long searchKey ); + int trouve ( string key ); + void addHasher ( string key, vector<string> value ); + void addValue ( string key, vector<string> value ); + stringInfosHasher getHasher ( string key ); + vector<string> getValue ( string key ); // string searchValue ( string key ); - void setValue ( string key , vector<string> value ); - void printHash(); - string toString(); - vector<stringInfosHasher> getHashMap(); - string printStringHash(); - string printStringHash2(); - string printStringHashForLexicon(); - }; + void setValue ( string key , vector<string> value ); + void printHash(); + string toString(); + vector<stringInfosHasher> getHashMap(); + string printStringHash(); + string printStringHash2(); + string printStringHashForLexicon(); +}; } diff --git a/mert/TER/infosHasher.cpp b/mert/TER/infosHasher.cpp index 8ce23ae44..450b70d94 100644 --- a/mert/TER/infosHasher.cpp +++ b/mert/TER/infosHasher.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -27,35 +27,35 @@ using namespace Tools; namespace HashMapSpace { - infosHasher::infosHasher (long cle,string cleTxt, vector<int> valueVecInt ) - { - m_hashKey=cle; - m_key=cleTxt; - m_value=valueVecInt; - } +infosHasher::infosHasher (long cle,string cleTxt, vector<int> valueVecInt ) +{ + m_hashKey=cle; + m_key=cleTxt; + m_value=valueVecInt; +} // infosHasher::~infosHasher(){};*/ - long infosHasher::getHashKey() - { - return m_hashKey; - } - string infosHasher::getKey() - { - return m_key; - } - vector<int> infosHasher::getValue() - { - return m_value; - } - void infosHasher::setValue ( vector<int> value ) - { - m_value=value; - } - string infosHasher::toString() - { - stringstream to_return; - to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl; - return to_return.str(); - } +long infosHasher::getHashKey() +{ + return m_hashKey; +} +string infosHasher::getKey() +{ + return m_key; +} +vector<int> infosHasher::getValue() +{ + return m_value; +} +void infosHasher::setValue ( vector<int> value ) +{ + m_value=value; +} +string infosHasher::toString() +{ + stringstream to_return; + to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl; + return to_return.str(); +} // typedef stdext::hash_map<std::string,string, stringhasher> HASH_S_S; diff --git a/mert/TER/infosHasher.h b/mert/TER/infosHasher.h index d3d56317a..ab9c7b5ed 100644 --- a/mert/TER/infosHasher.h +++ b/mert/TER/infosHasher.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -31,23 +31,23 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA using namespace std; namespace HashMapSpace { - class infosHasher - { - private: - long m_hashKey; - string m_key; - vector<int> m_value; - - public: - infosHasher ( long cle, string cleTxt, vector<int> valueVecInt ); - long getHashKey(); - string getKey(); - vector<int> getValue(); - void setValue ( vector<int> value ); - string toString(); - - - }; +class infosHasher +{ +private: + long m_hashKey; + string m_key; + vector<int> m_value; + +public: + infosHasher ( long cle, string cleTxt, vector<int> valueVecInt ); + long getHashKey(); + string getKey(); + vector<int> getValue(); + void setValue ( vector<int> value ); + string toString(); + + +}; } diff --git a/mert/TER/stringHasher.cpp b/mert/TER/stringHasher.cpp index f4d1526e8..729310352 100644 --- a/mert/TER/stringHasher.cpp +++ b/mert/TER/stringHasher.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -26,29 +26,29 @@ using namespace std; namespace HashMapSpace { - stringHasher::stringHasher ( long cle, string cleTxt, string valueTxt ) - { - m_hashKey=cle; - m_key=cleTxt; - m_value=valueTxt; - } +stringHasher::stringHasher ( long cle, string cleTxt, string valueTxt ) +{ + m_hashKey=cle; + m_key=cleTxt; + m_value=valueTxt; +} // stringHasher::~stringHasher(){};*/ - long stringHasher::getHashKey() - { - return m_hashKey; - } - string stringHasher::getKey() - { - return m_key; - } - string stringHasher::getValue() - { - return m_value; - } - void stringHasher::setValue ( string value ) - { - m_value=value; - } +long stringHasher::getHashKey() +{ + return m_hashKey; +} +string stringHasher::getKey() +{ + return m_key; +} +string stringHasher::getValue() +{ + return m_value; +} +void stringHasher::setValue ( string value ) +{ + m_value=value; +} // typedef stdext::hash_map<string, string, stringhasher> HASH_S_S; diff --git a/mert/TER/stringHasher.h b/mert/TER/stringHasher.h index d831f642c..5b0ccfc94 100644 --- a/mert/TER/stringHasher.h +++ b/mert/TER/stringHasher.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -28,22 +28,22 @@ using namespace std; namespace HashMapSpace { - class stringHasher - { - private: - long m_hashKey; - string m_key; - string m_value; +class stringHasher +{ +private: + long m_hashKey; + string m_key; + string m_value; - public: - stringHasher ( long cle, string cleTxt, string valueTxt ); - long getHashKey(); - string getKey(); - string getValue(); - void setValue ( string value ); +public: + stringHasher ( long cle, string cleTxt, string valueTxt ); + long getHashKey(); + string getKey(); + string getValue(); + void setValue ( string value ); - }; +}; } diff --git a/mert/TER/stringInfosHasher.cpp b/mert/TER/stringInfosHasher.cpp index 007fd720f..ecbc10fa5 100644 --- a/mert/TER/stringInfosHasher.cpp +++ b/mert/TER/stringInfosHasher.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -27,35 +27,35 @@ using namespace Tools; namespace HashMapSpace { - stringInfosHasher::stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt ) - { - m_hashKey=cle; - m_key=cleTxt; - m_value=valueVecInt; - } +stringInfosHasher::stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt ) +{ + m_hashKey=cle; + m_key=cleTxt; + m_value=valueVecInt; +} // stringInfosHasher::~stringInfosHasher(){};*/ - long stringInfosHasher::getHashKey() - { - return m_hashKey; - } - string stringInfosHasher::getKey() - { - return m_key; - } - vector<string> stringInfosHasher::getValue() - { - return m_value; - } - void stringInfosHasher::setValue ( vector<string> value ) - { - m_value=value; - } - string stringInfosHasher::toString() - { - stringstream to_return; - to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl; - return to_return.str(); - } +long stringInfosHasher::getHashKey() +{ + return m_hashKey; +} +string stringInfosHasher::getKey() +{ + return m_key; +} +vector<string> stringInfosHasher::getValue() +{ + return m_value; +} +void stringInfosHasher::setValue ( vector<string> value ) +{ + m_value=value; +} +string stringInfosHasher::toString() +{ + stringstream to_return; + to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl; + return to_return.str(); +} // typedef stdext::hash_map<string, string, stringhasher> HASH_S_S; diff --git a/mert/TER/stringInfosHasher.h b/mert/TER/stringInfosHasher.h index 307b48da7..e4369f27a 100644 --- a/mert/TER/stringInfosHasher.h +++ b/mert/TER/stringInfosHasher.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -29,23 +29,23 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA using namespace std; namespace HashMapSpace { - class stringInfosHasher - { - private: - long m_hashKey; - string m_key; - vector<string> m_value; - - public: - stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt ); - long getHashKey(); - string getKey(); - vector<string> getValue(); - void setValue ( vector<string> value ); - string toString(); - - - }; +class stringInfosHasher +{ +private: + long m_hashKey; + string m_key; + vector<string> m_value; + +public: + stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt ); + long getHashKey(); + string getKey(); + vector<string> getValue(); + void setValue ( vector<string> value ); + string toString(); + + +}; } diff --git a/mert/TER/terAlignment.cpp b/mert/TER/terAlignment.cpp index 6c5d35cc5..ec7bcafb7 100644 --- a/mert/TER/terAlignment.cpp +++ b/mert/TER/terAlignment.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -24,191 +24,163 @@ using namespace std; namespace TERCpp { - terAlignment::terAlignment() - { +terAlignment::terAlignment() +{ // vector<string> ref; // vector<string> hyp; // vector<string> aftershift; - // TERshift[] allshifts = null; + // TERshift[] allshifts = null; - numEdits=0; - numWords=0; - bestRef=""; + numEdits=0; + numWords=0; + bestRef=""; - numIns=0; - numDel=0; - numSub=0; - numSft=0; - numWsf=0; - } - string terAlignment::toString() - { - stringstream s; - s.str ( "" ); - s << "Original Ref: \t" << join ( " ", ref ) << endl; - s << "Original Hyp: \t" << join ( " ", hyp ) <<endl; - s << "Hyp After Shift:\t" << join ( " ", aftershift ); + numIns=0; + numDel=0; + numSub=0; + numSft=0; + numWsf=0; +} +string terAlignment::toString() +{ + stringstream s; + s.str ( "" ); + s << "Original Ref: \t" << join ( " ", ref ) << endl; + s << "Original Hyp: \t" << join ( " ", hyp ) <<endl; + s << "Hyp After Shift:\t" << join ( " ", aftershift ); // s << "Hyp After Shift: " << join ( " ", aftershift ); - s << endl; + s << endl; // string s = "Original Ref: " + join(" ", ref) + "\nOriginal Hyp: " + join(" ", hyp) + "\nHyp After Shift: " + join(" ", aftershift); - if ( ( int ) sizeof ( alignment ) >0 ) - { - s << "Alignment: ("; + if ( ( int ) sizeof ( alignment ) >0 ) { + s << "Alignment: ("; // s += "\nAlignment: ("; - for ( int i = 0; i < ( int ) ( alignment.size() ); i++ ) - { - s << alignment[i]; + for ( int i = 0; i < ( int ) ( alignment.size() ); i++ ) { + s << alignment[i]; // s+=alignment[i]; - } + } // s += ")"; - s << ")"; - } - s << endl; - if ( ( int ) allshifts.size() == 0 ) - { + s << ")"; + } + s << endl; + if ( ( int ) allshifts.size() == 0 ) { // s += "\nNumShifts: 0"; - s << "NumShifts: 0"; - } - else - { + s << "NumShifts: 0"; + } else { // s += "\nNumShifts: " + (int)allshifts.size(); - s << "NumShifts: "<< ( int ) allshifts.size(); - for ( int i = 0; i < ( int ) allshifts.size(); i++ ) - { - s << endl << " " ; - s << ( ( terShift ) allshifts[i] ).toString(); + s << "NumShifts: "<< ( int ) allshifts.size(); + for ( int i = 0; i < ( int ) allshifts.size(); i++ ) { + s << endl << " " ; + s << ( ( terShift ) allshifts[i] ).toString(); // s += "\n " + allshifts[i]; - } - } - s << endl << "Score: " << scoreAv() << " (" << numEdits << "/" << averageWords << ")"; + } + } + s << endl << "Score: " << scoreAv() << " (" << numEdits << "/" << averageWords << ")"; // s += "\nScore: " + score() + " (" + numEdits + "/" + numWords + ")"; - return s.str(); + return s.str(); - } - string terAlignment::join ( string delim, vector<string> arr ) - { - if ( ( int ) arr.size() == 0 ) return ""; +} +string terAlignment::join ( string delim, vector<string> arr ) +{ + if ( ( int ) arr.size() == 0 ) return ""; // if ((int)delim.compare("") == 0) delim = new String(""); // String s = new String(""); - stringstream s; - s.str ( "" ); - for ( int i = 0; i < ( int ) arr.size(); i++ ) - { - if ( i == 0 ) - { - s << arr.at ( i ); - } - else - { - s << delim << arr.at ( i ); - } - } - return s.str(); + stringstream s; + s.str ( "" ); + for ( int i = 0; i < ( int ) arr.size(); i++ ) { + if ( i == 0 ) { + s << arr.at ( i ); + } else { + s << delim << arr.at ( i ); + } + } + return s.str(); // return ""; +} +double terAlignment::score() +{ + if ( ( numWords <= 0.0 ) && ( numEdits > 0.0 ) ) { + return 1.0; + } + if ( numWords <= 0.0 ) { + return 0.0; + } + return ( double ) numEdits / numWords; +} +double terAlignment::scoreAv() +{ + if ( ( averageWords <= 0.0 ) && ( numEdits > 0.0 ) ) { + return 1.0; + } + if ( averageWords <= 0.0 ) { + return 0.0; + } + return ( double ) numEdits / averageWords; +} + +void terAlignment::scoreDetails() +{ + numIns = numDel = numSub = numWsf = numSft = 0; + if((int)allshifts.size()>0) { + for(int i = 0; i < (int)allshifts.size(); ++i) { + numWsf += allshifts[i].size(); } - double terAlignment::score() - { - if ( ( numWords <= 0.0 ) && ( numEdits > 0.0 ) ) - { - return 1.0; - } - if ( numWords <= 0.0 ) - { - return 0.0; - } - return ( double ) numEdits / numWords; + numSft = allshifts.size(); + } + + if((int)alignment.size()>0 ) { + for(int i = 0; i < (int)alignment.size(); ++i) { + switch (alignment[i]) { + case 'S': + case 'T': + numSub++; + break; + case 'D': + numDel++; + break; + case 'I': + numIns++; + break; + } } - double terAlignment::scoreAv() - { - if ( ( averageWords <= 0.0 ) && ( numEdits > 0.0 ) ) - { - return 1.0; - } - if ( averageWords <= 0.0 ) - { - return 0.0; - } - return ( double ) numEdits / averageWords; + } + // if(numEdits != numSft + numDel + numIns + numSub) + // System.out.println("** Error, unmatch edit erros " + numEdits + + // " vs " + (numSft + numDel + numIns + numSub)); +} +string terAlignment::printAlignments() +{ + stringstream to_return; + for(int i = 0; i < (int)alignment.size(); ++i) { + char alignInfo=alignment.at(i); + if (alignInfo == 'A' ) { + alignInfo='A'; } - void terAlignment::scoreDetails() - { - numIns = numDel = numSub = numWsf = numSft = 0; - if((int)allshifts.size()>0) - { - for(int i = 0; i < (int)allshifts.size(); ++i) - { - numWsf += allshifts[i].size(); - } - numSft = allshifts.size(); - } - - if((int)alignment.size()>0 ) - { - for(int i = 0; i < (int)alignment.size(); ++i) - { - switch (alignment[i]) - { - case 'S': - case 'T': - numSub++; - break; - case 'D': - numDel++; - break; - case 'I': - numIns++; - break; - } - } - } - // if(numEdits != numSft + numDel + numIns + numSub) - // System.out.println("** Error, unmatch edit erros " + numEdits + - // " vs " + (numSft + numDel + numIns + numSub)); - } - string terAlignment::printAlignments() - { - stringstream to_return; - for(int i = 0; i < (int)alignment.size(); ++i) - { - char alignInfo=alignment.at(i); - if (alignInfo == 'A' ) - { - alignInfo='A'; - } - - if (i==0) - { - to_return << alignInfo; - } - else - { - to_return << " " << alignInfo; - } - } - return to_return.str(); + if (i==0) { + to_return << alignInfo; + } else { + to_return << " " << alignInfo; + } } + return to_return.str(); +} string terAlignment::printAllShifts() { - stringstream to_return; - if ( ( int ) allshifts.size() == 0 ) - { + stringstream to_return; + if ( ( int ) allshifts.size() == 0 ) { // s += "\nNumShifts: 0"; - to_return << "NbrShifts: 0"; - } - else - { + to_return << "NbrShifts: 0"; + } else { // s += "\nNumShifts: " + (int)allshifts.size(); - to_return << "NbrShifts: "<< ( int ) allshifts.size(); - for ( int i = 0; i < ( int ) allshifts.size(); i++ ) - { - to_return << "\t" ; - to_return << ( ( terShift ) allshifts[i] ).toString(); + to_return << "NbrShifts: "<< ( int ) allshifts.size(); + for ( int i = 0; i < ( int ) allshifts.size(); i++ ) { + to_return << "\t" ; + to_return << ( ( terShift ) allshifts[i] ).toString(); // s += "\n " + allshifts[i]; - } - } - return to_return.str(); + } + } + return to_return.str(); } }
\ No newline at end of file diff --git a/mert/TER/terAlignment.h b/mert/TER/terAlignment.h index 0af86f663..2af0b7490 100644 --- a/mert/TER/terAlignment.h +++ b/mert/TER/terAlignment.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -34,41 +34,41 @@ using namespace std; namespace TERCpp { - class terAlignment - { - private: - public: - - terAlignment(); - string toString(); - void scoreDetails(); - - vector<string> ref; - vector<string> hyp; - vector<string> aftershift; - vector<terShift> allshifts; - vector<int> hyp_int; - vector<int> aftershift_int; - - double numEdits; - double numWords; - double averageWords; - vector<char> alignment; - string bestRef; - - int numIns; - int numDel; - int numSub; - int numSft; - int numWsf; - - - string join ( string delim, vector<string> arr ); - double score(); - double scoreAv(); - string printAlignments(); - string printAllShifts(); - }; +class terAlignment +{ +private: +public: + + terAlignment(); + string toString(); + void scoreDetails(); + + vector<string> ref; + vector<string> hyp; + vector<string> aftershift; + vector<terShift> allshifts; + vector<int> hyp_int; + vector<int> aftershift_int; + + double numEdits; + double numWords; + double averageWords; + vector<char> alignment; + string bestRef; + + int numIns; + int numDel; + int numSub; + int numSft; + int numWsf; + + + string join ( string delim, vector<string> arr ); + double score(); + double scoreAv(); + string printAlignments(); + string printAllShifts(); +}; } #endif
\ No newline at end of file diff --git a/mert/TER/terShift.cpp b/mert/TER/terShift.cpp index c1106db76..440b4d2ce 100644 --- a/mert/TER/terShift.cpp +++ b/mert/TER/terShift.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -42,32 +42,32 @@ namespace TERCpp // numSft=0; // numWsf=0; // } - terShift::terShift () - { - start = 0; - end = 0; - moveto = 0; - newloc = 0; - cost=1.0; - } - terShift::terShift ( int _start, int _end, int _moveto, int _newloc ) - { - start = _start; - end = _end; - moveto = _moveto; - newloc = _newloc; - cost=1.0; - } +terShift::terShift () +{ + start = 0; + end = 0; + moveto = 0; + newloc = 0; + cost=1.0; +} +terShift::terShift ( int _start, int _end, int _moveto, int _newloc ) +{ + start = _start; + end = _end; + moveto = _moveto; + newloc = _newloc; + cost=1.0; +} - terShift::terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted ) - { - start = _start; - end = _end; - moveto = _moveto; - newloc = _newloc; - shifted = _shifted; - cost=1.0; - } +terShift::terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted ) +{ + start = _start; + end = _end; + moveto = _moveto; + newloc = _newloc; + shifted = _shifted; + cost=1.0; +} // string terShift::vectorToString(vector<string> vec) // { // string retour(""); @@ -78,44 +78,38 @@ namespace TERCpp // return retour; // } - string terShift::toString() - { - stringstream s; - s.str ( "" ); - s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]"; - if ( ( int ) shifted.size() > 0 ) - { - s << " (" << vectorToString ( shifted ) << ")"; - } - return s.str(); - } +string terShift::toString() +{ + stringstream s; + s.str ( "" ); + s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]"; + if ( ( int ) shifted.size() > 0 ) { + s << " (" << vectorToString ( shifted ) << ")"; + } + return s.str(); +} - /* The distance of the shift. */ - int terShift::distance() - { - if ( moveto < start ) - { - return start - moveto; - } - else if ( moveto > end ) - { - return moveto - end; - } - else - { - return moveto - start; - } - } +/* The distance of the shift. */ +int terShift::distance() +{ + if ( moveto < start ) { + return start - moveto; + } else if ( moveto > end ) { + return moveto - end; + } else { + return moveto - start; + } +} - bool terShift::leftShift() - { - return ( moveto < start ); - } +bool terShift::leftShift() +{ + return ( moveto < start ); +} - int terShift::size() - { - return ( end - start ) + 1; - } +int terShift::size() +{ + return ( end - start ) + 1; +} // terShift terShift::operator=(terShift t) // { // diff --git a/mert/TER/terShift.h b/mert/TER/terShift.h index ba84a5947..74545e0de 100644 --- a/mert/TER/terShift.h +++ b/mert/TER/terShift.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -34,32 +34,32 @@ using namespace Tools; namespace TERCpp { - class terShift - { - private: - public: +class terShift +{ +private: +public: - terShift(); - terShift ( int _start, int _end, int _moveto, int _newloc ); - terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted ); - string toString(); - int distance() ; - bool leftShift(); - int size(); + terShift(); + terShift ( int _start, int _end, int _moveto, int _newloc ); + terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted ); + string toString(); + int distance() ; + bool leftShift(); + int size(); // terShift operator=(terShift t); // string vectorToString(vector<string> vec); - int start; - int end; - int moveto; - int newloc; - vector<string> shifted; // The words we shifted - vector<char> alignment ; // for pra_more output - vector<string> aftershift; // for pra_more output - // This is used to store the cost of a shift, so we don't have to - // calculate it multiple times. - double cost; - }; + int start; + int end; + int moveto; + int newloc; + vector<string> shifted; // The words we shifted + vector<char> alignment ; // for pra_more output + vector<string> aftershift; // for pra_more output + // This is used to store the cost of a shift, so we don't have to + // calculate it multiple times. + double cost; +}; } #endif
\ No newline at end of file diff --git a/mert/TER/tercalc.cpp b/mert/TER/tercalc.cpp index b7f63772c..c4629c639 100644 --- a/mert/TER/tercalc.cpp +++ b/mert/TER/tercalc.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -35,902 +35,724 @@ using namespace Tools; namespace TERCpp { - terCalc::terCalc() - { - TAILLE_PERMUT_MAX = 50; - infinite = 999999.0; - shift_cost = 1.0; - insert_cost = 1.0; - delete_cost = 1.0; - substitute_cost = 1.0; - match_cost = 0.0; - NBR_SEGS_EVALUATED = 0; - NBR_PERMUTS_CONSID = 0; - NBR_BS_APPELS = 0; - TAILLE_BEAM = 20; - DIST_MAX_PERMUT = 50; - PRINT_DEBUG = false; - hypSpans.clear(); - refSpans.clear(); - } +terCalc::terCalc() +{ + TAILLE_PERMUT_MAX = 50; + infinite = 999999.0; + shift_cost = 1.0; + insert_cost = 1.0; + delete_cost = 1.0; + substitute_cost = 1.0; + match_cost = 0.0; + NBR_SEGS_EVALUATED = 0; + NBR_PERMUTS_CONSID = 0; + NBR_BS_APPELS = 0; + TAILLE_BEAM = 20; + DIST_MAX_PERMUT = 50; + PRINT_DEBUG = false; + hypSpans.clear(); + refSpans.clear(); +} - terAlignment terCalc::WERCalculation ( vector< string > hyp , vector< string > ref ) - { - - return minimizeDistanceEdition ( hyp, ref, hypSpans ); - - } +terAlignment terCalc::WERCalculation ( vector< string > hyp , vector< string > ref ) +{ - terAlignment terCalc::TER ( std::vector< int > hyp, std::vector< int > ref ) - { - stringstream s; - s.str ( "" ); - string stringRef ( "" ); - string stringHyp ( "" ); - for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) - { - if ( l_it == ref.begin() ) - { - s << ( *l_it ); - } - else - { - s << " " << ( *l_it ); - } - } - stringRef = s.str(); - s.str ( "" ); - for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) - { - if ( l_itHyp == hyp.begin() ) - { - s << ( *l_itHyp ); - } - else - { - s << " " << ( *l_itHyp ); - } - } - stringHyp = s.str(); - s.str ( "" ); - return TER ( stringToVector ( stringRef , " " ), stringToVector ( stringHyp , " " ) ); - } + return minimizeDistanceEdition ( hyp, ref, hypSpans ); +} - hashMapInfos terCalc::createConcordMots ( vector<string> hyp, vector<string> ref ) - { - hashMap tempHash; - hashMapInfos retour; - for ( int i = 0; i < ( int ) hyp.size(); i++ ) - { - tempHash.addHasher ( hyp.at ( i ), "" ); - } - bool cor[ref.size() ]; - for ( int i = 0; i < ( int ) ref.size(); i++ ) - { - if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) - { - cor[i] = true; - } - else - { - cor[i] = false; - } - } - for ( int start = 0; start < ( int ) ref.size(); start++ ) - { - if ( cor[start] ) - { - for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) );end++ ) - { - vector<string> ajouter = subVector ( ref, start, end + 1 ); - string ajouterString = vectorToString ( ajouter ); - vector<int> values = retour.getValue ( ajouterString ); - values.push_back ( start ); - if ( values.size() > 1 ) - { - retour.setValue ( ajouterString, values ); - } - else - { - retour.addValue ( ajouterString, values ); - } - } - } - } - return retour; +terAlignment terCalc::TER ( std::vector< int > hyp, std::vector< int > ref ) +{ + stringstream s; + s.str ( "" ); + string stringRef ( "" ); + string stringHyp ( "" ); + for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) { + if ( l_it == ref.begin() ) { + s << ( *l_it ); + } else { + s << " " << ( *l_it ); + } + } + stringRef = s.str(); + s.str ( "" ); + for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) { + if ( l_itHyp == hyp.begin() ) { + s << ( *l_itHyp ); + } else { + s << " " << ( *l_itHyp ); } + } + stringHyp = s.str(); + s.str ( "" ); + return TER ( stringToVector ( stringRef , " " ), stringToVector ( stringHyp , " " ) ); +} + - bool terCalc::trouverIntersection ( vecInt refSpan, vecInt hypSpan ) - { - if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) - { - return true; +hashMapInfos terCalc::createConcordMots ( vector<string> hyp, vector<string> ref ) +{ + hashMap tempHash; + hashMapInfos retour; + for ( int i = 0; i < ( int ) hyp.size(); i++ ) { + tempHash.addHasher ( hyp.at ( i ), "" ); + } + bool cor[ref.size() ]; + for ( int i = 0; i < ( int ) ref.size(); i++ ) { + if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) { + cor[i] = true; + } else { + cor[i] = false; + } + } + for ( int start = 0; start < ( int ) ref.size(); start++ ) { + if ( cor[start] ) { + for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) ); end++ ) { + vector<string> ajouter = subVector ( ref, start, end + 1 ); + string ajouterString = vectorToString ( ajouter ); + vector<int> values = retour.getValue ( ajouterString ); + values.push_back ( start ); + if ( values.size() > 1 ) { + retour.setValue ( ajouterString, values ); + } else { + retour.addValue ( ajouterString, values ); } - return false; + } } + } + return retour; +} +bool terCalc::trouverIntersection ( vecInt refSpan, vecInt hypSpan ) +{ + if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) { + return true; + } + return false; +} - terAlignment terCalc::minimizeDistanceEdition ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans ) - { - double current_best = infinite; - double last_best = infinite; - int first_good = 0; - int current_first_good = 0; - int last_good = -1; - int cur_last_good = 0; - int last_peak = 0; - int cur_last_peak = 0; - int i, j; - double cost, icost, dcost; - double score; - - - - NBR_BS_APPELS++; - - - for ( i = 0; i <= ( int ) ref.size(); i++ ) - { - for ( j = 0; j <= ( int ) hyp.size(); j++ ) - { - S[i][j] = -1.0; - P[i][j] = '0'; - } - } - S[0][0] = 0.0; - for ( j = 0; j <= ( int ) hyp.size(); j++ ) - { - last_best = current_best; - current_best = infinite; - first_good = current_first_good; - current_first_good = -1; - last_good = cur_last_good; - cur_last_good = -1; - last_peak = cur_last_peak; - cur_last_peak = 0; - for ( i = first_good; i <= ( int ) ref.size(); i++ ) - { - if ( i > last_good ) - { - break; - } - if ( S[i][j] < 0 ) - { - continue; - } - score = S[i][j]; - if ( ( j < ( int ) hyp.size() ) && ( score > last_best + TAILLE_BEAM ) ) - { - continue; - } - if ( current_first_good == -1 ) - { - current_first_good = i ; - } - if ( ( i < ( int ) ref.size() ) && ( j < ( int ) hyp.size() ) ) - { - if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) - { - if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) - { - cost = match_cost + score; - if ( ( S[i+1][j+1] == -1 ) || ( cost < S[i+1][j+1] ) ) - { - S[i+1][j+1] = cost; - P[i+1][j+1] = 'A'; - } - if ( cost < current_best ) - { - current_best = cost; - } - if ( current_best == cost ) - { - cur_last_peak = i + 1; - } - } - else - { - cost = substitute_cost + score; - if ( ( S[i+1][j+1] < 0 ) || ( cost < S[i+1][j+1] ) ) - { - S[i+1][j+1] = cost; - P[i+1][j+1] = 'S'; - if ( cost < current_best ) - { - current_best = cost; - } - if ( current_best == cost ) - { - cur_last_peak = i + 1 ; - } - } - } - } - } - cur_last_good = i + 1; - if ( j < ( int ) hyp.size() ) - { - icost = score + insert_cost; - if ( ( S[i][j+1] < 0 ) || ( S[i][j+1] > icost ) ) - { - S[i][j+1] = icost; - P[i][j+1] = 'I'; - if ( ( cur_last_peak < i ) && ( current_best == icost ) ) - { - cur_last_peak = i; - } - } - } - if ( i < ( int ) ref.size() ) - { - dcost = score + delete_cost; - if ( ( S[ i+1][ j] < 0.0 ) || ( S[i+1][j] > dcost ) ) - { - S[i+1][j] = dcost; - P[i+1][j] = 'D'; - if ( i >= last_good ) - { - last_good = i + 1 ; - } - } - } - } - } + +terAlignment terCalc::minimizeDistanceEdition ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans ) +{ + double current_best = infinite; + double last_best = infinite; + int first_good = 0; + int current_first_good = 0; + int last_good = -1; + int cur_last_good = 0; + int last_peak = 0; + int cur_last_peak = 0; + int i, j; + double cost, icost, dcost; + double score; - int tracelength = 0; - i = ref.size(); - j = hyp.size(); - while ( ( i > 0 ) || ( j > 0 ) ) - { - tracelength++; - if ( P[i][j] == 'A' ) - { - i--; - j--; + + NBR_BS_APPELS++; + + + for ( i = 0; i <= ( int ) ref.size(); i++ ) { + for ( j = 0; j <= ( int ) hyp.size(); j++ ) { + S[i][j] = -1.0; + P[i][j] = '0'; + } + } + S[0][0] = 0.0; + for ( j = 0; j <= ( int ) hyp.size(); j++ ) { + last_best = current_best; + current_best = infinite; + first_good = current_first_good; + current_first_good = -1; + last_good = cur_last_good; + cur_last_good = -1; + last_peak = cur_last_peak; + cur_last_peak = 0; + for ( i = first_good; i <= ( int ) ref.size(); i++ ) { + if ( i > last_good ) { + break; + } + if ( S[i][j] < 0 ) { + continue; + } + score = S[i][j]; + if ( ( j < ( int ) hyp.size() ) && ( score > last_best + TAILLE_BEAM ) ) { + continue; + } + if ( current_first_good == -1 ) { + current_first_good = i ; + } + if ( ( i < ( int ) ref.size() ) && ( j < ( int ) hyp.size() ) ) { + if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) { + if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) { + cost = match_cost + score; + if ( ( S[i+1][j+1] == -1 ) || ( cost < S[i+1][j+1] ) ) { + S[i+1][j+1] = cost; + P[i+1][j+1] = 'A'; } - else - if ( P[i][j] == 'S' ) - { - i--; - j--; - } - else - if ( P[i][j] == 'D' ) - { - i--; - } - else - if ( P[i][j] == 'I' ) - { - j--; - } - else - { - cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P[i][j] << endl; - exit ( -1 ); - } - } - vector<char> path ( tracelength ); - i = ref.size(); - j = hyp.size(); - while ( ( i > 0 ) || ( j > 0 ) ) - { - path[--tracelength] = P[i][j]; - if ( P[i][j] == 'A' ) - { - i--; - j--; + if ( cost < current_best ) { + current_best = cost; + } + if ( current_best == cost ) { + cur_last_peak = i + 1; } - else - if ( P[i][j] == 'S' ) - { - i--; - j--; - } - else - if ( P[i][j] == 'D' ) - { - i--; - } - else - if ( P[i][j] == 'I' ) - { - j--; - } + } else { + cost = substitute_cost + score; + if ( ( S[i+1][j+1] < 0 ) || ( cost < S[i+1][j+1] ) ) { + S[i+1][j+1] = cost; + P[i+1][j+1] = 'S'; + if ( cost < current_best ) { + current_best = cost; + } + if ( current_best == cost ) { + cur_last_peak = i + 1 ; + } + } + } } - terAlignment to_return; - to_return.numWords = ref.size(); - to_return.alignment = path; - to_return.numEdits = S[ref.size() ][hyp.size() ]; - to_return.hyp = hyp; - to_return.ref = ref; - to_return.averageWords = (int)ref.size(); - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl; + } + cur_last_good = i + 1; + if ( j < ( int ) hyp.size() ) { + icost = score + insert_cost; + if ( ( S[i][j+1] < 0 ) || ( S[i][j+1] > icost ) ) { + S[i][j+1] = icost; + P[i][j+1] = 'I'; + if ( ( cur_last_peak < i ) && ( current_best == icost ) ) { + cur_last_peak = i; + } } - return to_return; - + } + if ( i < ( int ) ref.size() ) { + dcost = score + delete_cost; + if ( ( S[ i+1][ j] < 0.0 ) || ( S[i+1][j] > dcost ) ) { + S[i+1][j] = dcost; + P[i+1][j] = 'D'; + if ( i >= last_good ) { + last_good = i + 1 ; + } + } + } + } + } + + + int tracelength = 0; + i = ref.size(); + j = hyp.size(); + while ( ( i > 0 ) || ( j > 0 ) ) { + tracelength++; + if ( P[i][j] == 'A' ) { + i--; + j--; + } else if ( P[i][j] == 'S' ) { + i--; + j--; + } else if ( P[i][j] == 'D' ) { + i--; + } else if ( P[i][j] == 'I' ) { + j--; + } else { + cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P[i][j] << endl; + exit ( -1 ); + } + } + vector<char> path ( tracelength ); + i = ref.size(); + j = hyp.size(); + while ( ( i > 0 ) || ( j > 0 ) ) { + path[--tracelength] = P[i][j]; + if ( P[i][j] == 'A' ) { + i--; + j--; + } else if ( P[i][j] == 'S' ) { + i--; + j--; + } else if ( P[i][j] == 'D' ) { + i--; + } else if ( P[i][j] == 'I' ) { + j--; } - terAlignment terCalc::TER ( vector<string> hyp, vector<string> ref ) - { - hashMapInfos rloc = createConcordMots ( hyp, ref ); - terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans ); - vector<string> cur = hyp; - cur_align.hyp = hyp; - cur_align.ref = ref; - cur_align.aftershift = hyp; - double edits = 0; + } + terAlignment to_return; + to_return.numWords = ref.size(); + to_return.alignment = path; + to_return.numEdits = S[ref.size() ][hyp.size() ]; + to_return.hyp = hyp; + to_return.ref = ref; + to_return.averageWords = (int)ref.size(); + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl; + } + return to_return; + +} +terAlignment terCalc::TER ( vector<string> hyp, vector<string> ref ) +{ + hashMapInfos rloc = createConcordMots ( hyp, ref ); + terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans ); + vector<string> cur = hyp; + cur_align.hyp = hyp; + cur_align.ref = ref; + cur_align.aftershift = hyp; + double edits = 0; // int numshifts = 0; - vector<terShift> allshifts; + vector<terShift> allshifts; // cerr << "Initial Alignment:" << endl << cur_align.toString() <<endl; - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl; - } - while ( true ) - { - bestShiftStruct returns; - returns = findBestShift ( cur, hyp, ref, rloc, cur_align ); - if ( returns.m_empty ) - { - break; - } - terShift bestShift = returns.m_best_shift; - cur_align = returns.m_best_align; - edits += bestShift.cost; - bestShift.alignment = cur_align.alignment; - bestShift.aftershift = cur_align.aftershift; - allshifts.push_back ( bestShift ); - cur = cur_align.aftershift; - } - terAlignment to_return; - to_return = cur_align; - to_return.allshifts = allshifts; - to_return.numEdits += edits; - NBR_SEGS_EVALUATED++; - return to_return; + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl; + } + while ( true ) { + bestShiftStruct returns; + returns = findBestShift ( cur, hyp, ref, rloc, cur_align ); + if ( returns.m_empty ) { + break; } - bestShiftStruct terCalc::findBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment med_align ) - { - bestShiftStruct to_return; - bool anygain = false; - bool herr[ ( int ) hyp.size() ]; - bool rerr[ ( int ) ref.size() ]; - int ralign[ ( int ) ref.size() ]; - calculateTerAlignment ( med_align, herr, rerr, ralign ); - vector<vecTerShift> poss_shifts; - - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl; - cerr << "indices: "; - for (int l_i=0; l_i < ( int ) ref.size() ; l_i++) - { - cerr << l_i << "\t"; - } - cerr << endl; - cerr << "hyp : \t"<<vectorToString(hyp ,"\t") << endl; - cerr << "cur : \t"<<vectorToString(cur ,"\t") << endl; - cerr << "ref : \t"<<vectorToString(ref ,"\t") << endl; - cerr << "herr : "<<vectorToString(herr,"\t",( int ) hyp.size()) << " | " << ( int ) hyp.size() <<endl; - cerr << "rerr : "<<vectorToString(rerr,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() <<endl; - cerr << "ralign : "<< vectorToString(ralign,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() << endl; - cerr << "END DEBUG " << endl; - } - poss_shifts = calculerPermutations ( cur, ref, rloc, med_align, herr, rerr, ralign ); - double curerr = med_align.numEdits; - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; - cerr << "Possible Shifts:" << endl; - for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) - { - for ( int j = 0; j < ( int ) ( poss_shifts.at ( i ) ).size(); j++ ) - { - cerr << " [" << i << "] " << ( ( poss_shifts.at ( i ) ).at ( j ) ).toString() << endl; - } - } - cerr << endl; - cerr << "END DEBUG " << endl; - } + terShift bestShift = returns.m_best_shift; + cur_align = returns.m_best_align; + edits += bestShift.cost; + bestShift.alignment = cur_align.alignment; + bestShift.aftershift = cur_align.aftershift; + allshifts.push_back ( bestShift ); + cur = cur_align.aftershift; + } + terAlignment to_return; + to_return = cur_align; + to_return.allshifts = allshifts; + to_return.numEdits += edits; + NBR_SEGS_EVALUATED++; + return to_return; +} +bestShiftStruct terCalc::findBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment med_align ) +{ + bestShiftStruct to_return; + bool anygain = false; + bool herr[ ( int ) hyp.size() ]; + bool rerr[ ( int ) ref.size() ]; + int ralign[ ( int ) ref.size() ]; + calculateTerAlignment ( med_align, herr, rerr, ralign ); + vector<vecTerShift> poss_shifts; + + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl; + cerr << "indices: "; + for (int l_i=0; l_i < ( int ) ref.size() ; l_i++) { + cerr << l_i << "\t"; + } + cerr << endl; + cerr << "hyp : \t"<<vectorToString(hyp ,"\t") << endl; + cerr << "cur : \t"<<vectorToString(cur ,"\t") << endl; + cerr << "ref : \t"<<vectorToString(ref ,"\t") << endl; + cerr << "herr : "<<vectorToString(herr,"\t",( int ) hyp.size()) << " | " << ( int ) hyp.size() <<endl; + cerr << "rerr : "<<vectorToString(rerr,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() <<endl; + cerr << "ralign : "<< vectorToString(ralign,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() << endl; + cerr << "END DEBUG " << endl; + } + poss_shifts = calculerPermutations ( cur, ref, rloc, med_align, herr, rerr, ralign ); + double curerr = med_align.numEdits; + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; + cerr << "Possible Shifts:" << endl; + for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) { + for ( int j = 0; j < ( int ) ( poss_shifts.at ( i ) ).size(); j++ ) { + cerr << " [" << i << "] " << ( ( poss_shifts.at ( i ) ).at ( j ) ).toString() << endl; + } + } + cerr << endl; + cerr << "END DEBUG " << endl; + } // exit(0); - double cur_best_shift_cost = 0.0; - terAlignment cur_best_align = med_align; - terShift cur_best_shift; + double cur_best_shift_cost = 0.0; + terAlignment cur_best_align = med_align; + terShift cur_best_shift; - for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) - { - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; - cerr << "Considering shift of length " << i << " (" << ( poss_shifts.at ( i ) ).size() << ")" << endl; - cerr << "END DEBUG " << endl; - } - /* Consider shifts of length i+1 */ - double curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits ); - double maxfix = ( 2 * ( 1 + i ) ); - if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) - { - break; - } + for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) { + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; + cerr << "Considering shift of length " << i << " (" << ( poss_shifts.at ( i ) ).size() << ")" << endl; + cerr << "END DEBUG " << endl; + } + /* Consider shifts of length i+1 */ + double curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits ); + double maxfix = ( 2 * ( 1 + i ) ); + if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) { + break; + } - for ( int s = 0; s < ( int ) ( poss_shifts.at ( i ) ).size(); s++ ) - { - curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits ); - if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) - { - break; - } - terShift curshift = ( poss_shifts.at ( i ) ).at ( s ); - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; - cerr << "cur : "<< join(" ",cur) << endl; - cerr << "curshift : "<< curshift.toString() << endl; - - } - alignmentStruct shiftReturns = permuter ( cur, curshift ); - vector<string> shiftarr = shiftReturns.nwords; - vector<vecInt> curHypSpans = shiftReturns.aftershift; - - if ( PRINT_DEBUG ) - { - cerr << "shiftarr : "<< join(" ",shiftarr) << endl; + for ( int s = 0; s < ( int ) ( poss_shifts.at ( i ) ).size(); s++ ) { + curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits ); + if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) { + break; + } + terShift curshift = ( poss_shifts.at ( i ) ).at ( s ); + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; + cerr << "cur : "<< join(" ",cur) << endl; + cerr << "curshift : "<< curshift.toString() << endl; + + } + alignmentStruct shiftReturns = permuter ( cur, curshift ); + vector<string> shiftarr = shiftReturns.nwords; + vector<vecInt> curHypSpans = shiftReturns.aftershift; + + if ( PRINT_DEBUG ) { + cerr << "shiftarr : "<< join(" ",shiftarr) << endl; // cerr << "curHypSpans : "<< curHypSpans.toString() << endl; - cerr << "END DEBUG " << endl; - } - terAlignment curalign = minimizeDistanceEdition ( shiftarr, ref, curHypSpans ); - - curalign.hyp = hyp; - curalign.ref = ref; - curalign.aftershift = shiftarr; - - - double gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost ); - - // if (DEBUG) { - // string testeuh=terAlignment join(" ", shiftarr); - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; - cerr << "Gain for " << curshift.toString() << " is " << gain << ". (result: [" << curalign.join ( " ", shiftarr ) << "]" << endl; - cerr << "Details of gains : gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost )"<<endl; - cerr << "Details of gains : gain = ("<<cur_best_align.numEdits << "+" << cur_best_shift_cost << ") - (" << curalign.numEdits << "+" << curshift.cost << ")"<<endl; - cerr << "" << curalign.toString() << "\n" << endl; - cerr << "END DEBUG " << endl; - } - // } - // - if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) - { - anygain = true; - cur_best_shift = curshift; - cur_best_shift_cost = curshift.cost; - cur_best_align = curalign; - // if (DEBUG) - if ( PRINT_DEBUG ) - { - cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; - cerr << "Tmp Choosing shift: " << cur_best_shift.toString() << " gives:\n" << cur_best_align.toString() << "\n" << endl; - cerr << "END DEBUG " << endl; - } - } - } - } - if ( anygain ) - { - to_return.m_best_shift = cur_best_shift; - to_return.m_best_align = cur_best_align; - to_return.m_empty = false; + cerr << "END DEBUG " << endl; + } + terAlignment curalign = minimizeDistanceEdition ( shiftarr, ref, curHypSpans ); + + curalign.hyp = hyp; + curalign.ref = ref; + curalign.aftershift = shiftarr; + + + double gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost ); + + // if (DEBUG) { + // string testeuh=terAlignment join(" ", shiftarr); + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; + cerr << "Gain for " << curshift.toString() << " is " << gain << ". (result: [" << curalign.join ( " ", shiftarr ) << "]" << endl; + cerr << "Details of gains : gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost )"<<endl; + cerr << "Details of gains : gain = ("<<cur_best_align.numEdits << "+" << cur_best_shift_cost << ") - (" << curalign.numEdits << "+" << curshift.cost << ")"<<endl; + cerr << "" << curalign.toString() << "\n" << endl; + cerr << "END DEBUG " << endl; + } + // } + // + if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) { + anygain = true; + cur_best_shift = curshift; + cur_best_shift_cost = curshift.cost; + cur_best_align = curalign; + // if (DEBUG) + if ( PRINT_DEBUG ) { + cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; + cerr << "Tmp Choosing shift: " << cur_best_shift.toString() << " gives:\n" << cur_best_align.toString() << "\n" << endl; + cerr << "END DEBUG " << endl; } - else - { - to_return.m_empty = true; - } - return to_return; + } } + } + if ( anygain ) { + to_return.m_best_shift = cur_best_shift; + to_return.m_best_align = cur_best_align; + to_return.m_empty = false; + } else { + to_return.m_empty = true; + } + return to_return; +} - void terCalc::calculateTerAlignment ( terAlignment align, bool* herr, bool* rerr, int* ralign ) - { - int hpos = -1; - int rpos = -1; - if ( PRINT_DEBUG ) - { +void terCalc::calculateTerAlignment ( terAlignment align, bool* herr, bool* rerr, int* ralign ) +{ + int hpos = -1; + int rpos = -1; + if ( PRINT_DEBUG ) { + + cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl; + cerr << "END DEBUG " << endl; + } + for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) { + herr[i] = false; + rerr[i] = false; + ralign[i] = -1; + } + for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) { + char sym = align.alignment[i]; + if ( sym == 'A' ) { + hpos++; + rpos++; + herr[hpos] = false; + rerr[rpos] = false; + ralign[rpos] = hpos; + } else if ( sym == 'S' ) { + hpos++; + rpos++; + herr[hpos] = true; + rerr[rpos] = true; + ralign[rpos] = hpos; + } else if ( sym == 'I' ) { + hpos++; + herr[hpos] = true; + } else if ( sym == 'D' ) { + rpos++; + rerr[rpos] = true; + ralign[rpos] = hpos+1; + } else { + cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl; + exit ( -1 ); + } + } +} - cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl; - cerr << "END DEBUG " << endl; - } - for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) - { - herr[i] = false; - rerr[i] = false; - ralign[i] = -1; - } - for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) - { - char sym = align.alignment[i]; - if ( sym == 'A' ) - { - hpos++; - rpos++; - herr[hpos] = false; - rerr[rpos] = false; - ralign[rpos] = hpos; - } - else - if ( sym == 'S' ) - { - hpos++; - rpos++; - herr[hpos] = true; - rerr[rpos] = true; - ralign[rpos] = hpos; - } - else - if ( sym == 'I' ) - { - hpos++; - herr[hpos] = true; - } - else - if ( sym == 'D' ) - { - rpos++; - rerr[rpos] = true; - ralign[rpos] = hpos+1; - } - else - { - cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl; - exit ( -1 ); - } - } +vector<vecTerShift> terCalc::calculerPermutations ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign ) +{ + vector<vecTerShift> to_return; + if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) ) { + return to_return; + } + + vector<vecTerShift> allshifts ( TAILLE_PERMUT_MAX + 1 ); + for ( int start = 0; start < ( int ) hyp.size(); start++ ) { + string subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) ); + if ( ! rloc.trouve ( subVectorHypString ) ) { + continue; } - vector<vecTerShift> terCalc::calculerPermutations ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign ) - { - vector<vecTerShift> to_return; - if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) ) - { - return to_return; + bool ok = false; + vector<int> mtiVec = rloc.getValue ( subVectorHypString ); + vector<int>::iterator mti = mtiVec.begin(); + while ( mti != mtiVec.end() && ( ! ok ) ) { + int moveto = ( *mti ); + mti++; + if ( ( start != ralign[moveto] ) && ( ( ralign[moveto] - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign[moveto] - 1 ) <= DIST_MAX_PERMUT ) ) { + ok = true; + } + } + if ( ! ok ) { + continue; + } + ok = true; + for ( int end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ ) { + /* check if cand is good if so, add it */ + vector<string> cand = subVector ( hyp, start, end + 1 ); + ok = false; + if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) { + continue; + } + + bool any_herr = false; + + for ( int i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) { + if ( herr[start+i] ) { + any_herr = true; } + } + if ( any_herr == false ) { + ok = true; + continue; + } + + vector<int> movetoitVec; + movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) ); +// cerr << "CANDIDATE " << ( string ) vectorToString ( cand ) <<" PLACED : " << ( string ) vectorToString ( movetoitVec," ") << endl; + vector<int>::iterator movetoit = movetoitVec.begin(); + while ( movetoit != movetoitVec.end() ) { + int moveto = ( *movetoit ); + movetoit++; + if ( ! ( ( ralign[moveto] != start ) && ( ( ralign[moveto] < start ) || ( ralign[moveto] > end ) ) && ( ( ralign[moveto] - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign[moveto] ) <= DIST_MAX_PERMUT ) ) ) { + continue; + } + ok = true; - vector<vecTerShift> allshifts ( TAILLE_PERMUT_MAX + 1 ); - for ( int start = 0; start < ( int ) hyp.size(); start++ ) - { - string subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) ); - if ( ! rloc.trouve ( subVectorHypString ) ) - { - continue; - } + /* check to see if there are any errors in either string + (only move if this is the case!) + */ + + bool any_rerr = false; + for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) { + if ( rerr[moveto+i] ) { + any_rerr = true; + } + } + if ( ! any_rerr ) { + continue; + } + for ( int roff = -1; roff <= ( end - start ); roff++ ) { + terShift topush; + bool topushNull = true; + if ( ( roff == -1 ) && ( moveto == 0 ) ) { + if ( PRINT_DEBUG ) { - bool ok = false; - vector<int> mtiVec = rloc.getValue ( subVectorHypString ); - vector<int>::iterator mti = mtiVec.begin(); - while ( mti != mtiVec.end() && ( ! ok ) ) - { - int moveto = ( *mti ); - mti++; - if ( ( start != ralign[moveto] ) && ( ( ralign[moveto] - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign[moveto] - 1 ) <= DIST_MAX_PERMUT ) ) - { - ok = true; - } + cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl; } - if ( ! ok ) - { - continue; + terShift t01 ( start, end, -1, -1 ); + topush = t01; + topushNull = false; + } else if ( ( start != ralign[moveto+roff] ) && ( ( roff == 0 ) || ( ralign[moveto+roff] != ralign[moveto] ) ) ) { + int newloc = ralign[moveto+roff]; + if ( PRINT_DEBUG ) { + + cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl; } - ok = true; - for ( int end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ ) - { - /* check if cand is good if so, add it */ - vector<string> cand = subVector ( hyp, start, end + 1 ); - ok = false; - if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) - { - continue; - } - - bool any_herr = false; - - for ( int i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) - { - if ( herr[start+i] ) - { - any_herr = true; - } - } - if ( any_herr == false ) - { - ok = true; - continue; - } - - vector<int> movetoitVec; - movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) ); -// cerr << "CANDIDATE " << ( string ) vectorToString ( cand ) <<" PLACED : " << ( string ) vectorToString ( movetoitVec," ") << endl; - vector<int>::iterator movetoit = movetoitVec.begin(); - while ( movetoit != movetoitVec.end() ) - { - int moveto = ( *movetoit ); - movetoit++; - if ( ! ( ( ralign[moveto] != start ) && ( ( ralign[moveto] < start ) || ( ralign[moveto] > end ) ) && ( ( ralign[moveto] - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign[moveto] ) <= DIST_MAX_PERMUT ) ) ) - { - continue; - } - ok = true; - - /* check to see if there are any errors in either string - (only move if this is the case!) - */ - - bool any_rerr = false; - for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) - { - if ( rerr[moveto+i] ) - { - any_rerr = true; - } - } - if ( ! any_rerr ) - { - continue; - } - for ( int roff = -1; roff <= ( end - start ); roff++ ) - { - terShift topush; - bool topushNull = true; - if ( ( roff == -1 ) && ( moveto == 0 ) ) - { - if ( PRINT_DEBUG ) - { - - cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl; - } - terShift t01 ( start, end, -1, -1 ); - topush = t01; - topushNull = false; - } - else - if ( ( start != ralign[moveto+roff] ) && ( ( roff == 0 ) || ( ralign[moveto+roff] != ralign[moveto] ) ) ) - { - int newloc = ralign[moveto+roff]; - if ( PRINT_DEBUG ) - { - - cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl; - } - terShift t02 ( start, end, moveto + roff, newloc ); - topush = t02; - topushNull = false; - } - if ( !topushNull ) - { - topush.shifted = cand; - topush.cost = shift_cost; - if ( PRINT_DEBUG ) - { - - cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl; - cerr << "start : " << start << endl; - cerr << "end : " << end << endl; - cerr << "end - start : " << end - start << endl; - cerr << "END DEBUG " << endl; - } - ( allshifts.at ( end - start ) ).push_back ( topush ); - } - } - } + terShift t02 ( start, end, moveto + roff, newloc ); + topush = t02; + topushNull = false; + } + if ( !topushNull ) { + topush.shifted = cand; + topush.cost = shift_cost; + if ( PRINT_DEBUG ) { + + cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl; + cerr << "start : " << start << endl; + cerr << "end : " << end << endl; + cerr << "end - start : " << end - start << endl; + cerr << "END DEBUG " << endl; } + ( allshifts.at ( end - start ) ).push_back ( topush ); + } } - to_return.clear(); - for ( int i = 0; i < TAILLE_PERMUT_MAX + 1; i++ ) - { - to_return.push_back ( ( vecTerShift ) allshifts.at ( i ) ); - } - return to_return; + } } + } + to_return.clear(); + for ( int i = 0; i < TAILLE_PERMUT_MAX + 1; i++ ) { + to_return.push_back ( ( vecTerShift ) allshifts.at ( i ) ); + } + return to_return; +} - alignmentStruct terCalc::permuter ( vector<string> words, terShift s ) - { - return permuter ( words, s.start, s.end, s.newloc ); - } +alignmentStruct terCalc::permuter ( vector<string> words, terShift s ) +{ + return permuter ( words, s.start, s.end, s.newloc ); +} - alignmentStruct terCalc::permuter ( vector<string> words, int start, int end, int newloc ) - { - int c = 0; - vector<string> nwords ( words ); - vector<vecInt> spans ( ( int ) hypSpans.size() ); - alignmentStruct to_return; - if ( PRINT_DEBUG ) - { +alignmentStruct terCalc::permuter ( vector<string> words, int start, int end, int newloc ) +{ + int c = 0; + vector<string> nwords ( words ); + vector<vecInt> spans ( ( int ) hypSpans.size() ); + alignmentStruct to_return; + if ( PRINT_DEBUG ) { + + if ( ( int ) hypSpans.size() > 0 ) { + cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ; + } else { + cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ; + } + cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl; + } + if (newloc >= ( int ) words.size()) { + if ( PRINT_DEBUG ) { + cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."<<endl; + } + newloc = ( int ) words.size()-1; + } - if ( ( int ) hypSpans.size() > 0 ) - { - cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ; - } - else - { - cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ; - } - cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl; - } - if (newloc >= ( int ) words.size()) - { - if ( PRINT_DEBUG ) - { - cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."<<endl; - } - newloc = ( int ) words.size()-1; - } - // } - if ( newloc == -1 ) - { - for ( int i = start; i <= end;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = 0; i <= start - 1;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = end + 1; i < ( int ) words.size();i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } + if ( newloc == -1 ) { + for ( int i = start; i <= end; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = 0; i <= start - 1; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = end + 1; i < ( int ) words.size(); i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + } else { + if ( newloc < start ) { + + for ( int i = 0; i < newloc; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); } - else - { - if ( newloc < start ) - { - - for ( int i = 0; i < newloc; i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = start; i <= end;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = newloc ; i < start ;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = end + 1; i < ( int ) words.size();i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - } - else - { - if ( newloc > end ) - { - for ( int i = 0; i <= start - 1; i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = end + 1; i <= newloc;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = start; i <= end;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = newloc + 1; i < ( int ) words.size();i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - } - else - { - // we are moving inside of ourselves - for ( int i = 0; i <= start - 1; i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = start; i <= end;i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size();i++ ) - { - nwords.at ( c++ ) = words.at ( i ); - if ( ( int ) hypSpans.size() > 0 ) - { - spans.at ( c - 1 ) = hypSpans.at ( i ); - } - } - } - } + } + for ( int i = start; i <= end; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); } - NBR_PERMUTS_CONSID++; - - if ( PRINT_DEBUG ) - { - cerr << "nwords" << join(" ",nwords) << endl; -// cerr << "spans" << spans. << endl; - } - - to_return.nwords = nwords; - to_return.aftershift = spans; - return to_return; - } - void terCalc::setDebugMode ( bool b ) - { - PRINT_DEBUG = b; + } + for ( int i = newloc ; i < start ; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = end + 1; i < ( int ) words.size(); i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + } else { + if ( newloc > end ) { + for ( int i = 0; i <= start - 1; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = end + 1; i <= newloc; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = start; i <= end; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + } else { + // we are moving inside of ourselves + for ( int i = 0; i <= start - 1; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = start; i <= end; i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) { + nwords.at ( c++ ) = words.at ( i ); + if ( ( int ) hypSpans.size() > 0 ) { + spans.at ( c - 1 ) = hypSpans.at ( i ); + } + } + } } + } + NBR_PERMUTS_CONSID++; + + if ( PRINT_DEBUG ) { + cerr << "nwords" << join(" ",nwords) << endl; +// cerr << "spans" << spans. << endl; + } + + to_return.nwords = nwords; + to_return.aftershift = spans; + return to_return; +} +void terCalc::setDebugMode ( bool b ) +{ + PRINT_DEBUG = b; +} } diff --git a/mert/TER/tercalc.h b/mert/TER/tercalc.h index 92d9caf2b..778d83395 100644 --- a/mert/TER/tercalc.h +++ b/mert/TER/tercalc.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -41,62 +41,62 @@ namespace TERCpp { // typedef size_t WERelement[2]; // Vecteur d'alignement contenant le hash du mot et son evaluation (0=ok, 1=sub, 2=ins, 3=del) - typedef vector<terShift> vecTerShift; - /** - @author - */ - class terCalc - { - private : +typedef vector<terShift> vecTerShift; +/** + @author +*/ +class terCalc +{ +private : // Vecteur d'alignement contenant le hash du mot et son evaluation (0=ok, 1=sub, 2=ins, 3=del) - WERalignment l_WERalignment; + WERalignment l_WERalignment; // HashMap contenant les valeurs de hash de chaque mot - hashMap bagOfWords; - int TAILLE_PERMUT_MAX; - // Increments internes - int NBR_SEGS_EVALUATED; - int NBR_PERMUTS_CONSID; - int NBR_BS_APPELS; - int DIST_MAX_PERMUT; - bool PRINT_DEBUG; + hashMap bagOfWords; + int TAILLE_PERMUT_MAX; + // Increments internes + int NBR_SEGS_EVALUATED; + int NBR_PERMUTS_CONSID; + int NBR_BS_APPELS; + int DIST_MAX_PERMUT; + bool PRINT_DEBUG; - // Utilisés dans minDistEdit et ils ne sont pas réajustés - double S[1000][1000]; - char P[1000][1000]; - vector<vecInt> refSpans; - vector<vecInt> hypSpans; - int TAILLE_BEAM; + // Utilisés dans minDistEdit et ils ne sont pas réajustés + double S[1000][1000]; + char P[1000][1000]; + vector<vecInt> refSpans; + vector<vecInt> hypSpans; + int TAILLE_BEAM; - public: - int shift_cost; - int insert_cost; - int delete_cost; - int substitute_cost; - int match_cost; - double infinite; - terCalc(); +public: + int shift_cost; + int insert_cost; + int delete_cost; + int substitute_cost; + int match_cost; + double infinite; + terCalc(); // ~terCalc(); // size_t* hashVec ( vector<string> s ); - void setDebugMode ( bool b ); + void setDebugMode ( bool b ); // int WERCalculation ( size_t * ref, size_t * hyp ); // int WERCalculation ( vector<string> ref, vector<string> hyp ); // int WERCalculation ( vector<int> ref, vector<int> hyp ); - terAlignment WERCalculation ( vector<string> hyp, vector<string> ref ); + terAlignment WERCalculation ( vector<string> hyp, vector<string> ref ); // string vectorToString(vector<string> vec); // vector<string> subVector(vector<string> vec, int start, int end); - hashMapInfos createConcordMots ( vector<string> hyp, vector<string> ref ); - terAlignment minimizeDistanceEdition ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans ); - bool trouverIntersection ( vecInt refSpan, vecInt hypSpan ); - terAlignment TER ( vector<string> hyp, vector<string> ref , float avRefLength ); - terAlignment TER ( vector<string> hyp, vector<string> ref ); - terAlignment TER ( vector<int> hyp, vector<int> ref ); - bestShiftStruct findBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment cur_align ); - void calculateTerAlignment ( terAlignment align, bool* herr, bool* rerr, int* ralign ); - vector<vecTerShift> calculerPermutations ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign ); - alignmentStruct permuter ( vector<string> words, terShift s ); - alignmentStruct permuter ( vector<string> words, int start, int end, int newloc ); - }; + hashMapInfos createConcordMots ( vector<string> hyp, vector<string> ref ); + terAlignment minimizeDistanceEdition ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans ); + bool trouverIntersection ( vecInt refSpan, vecInt hypSpan ); + terAlignment TER ( vector<string> hyp, vector<string> ref , float avRefLength ); + terAlignment TER ( vector<string> hyp, vector<string> ref ); + terAlignment TER ( vector<int> hyp, vector<int> ref ); + bestShiftStruct findBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment cur_align ); + void calculateTerAlignment ( terAlignment align, bool* herr, bool* rerr, int* ralign ); + vector<vecTerShift> calculerPermutations ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign ); + alignmentStruct permuter ( vector<string> words, terShift s ); + alignmentStruct permuter ( vector<string> words, int start, int end, int newloc ); +}; } diff --git a/mert/TER/tools.cpp b/mert/TER/tools.cpp index 64e1483b6..8858a7119 100644 --- a/mert/TER/tools.cpp +++ b/mert/TER/tools.cpp @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -25,748 +25,677 @@ using namespace boost::xpressive; namespace Tools { - string vectorToString ( vector<string> vec ) - { - string retour ( "" ); - for ( vector<string>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour += ( *vecIter ); - } - else - { - retour += "\t" + ( *vecIter ); - } - } - return retour; +string vectorToString ( vector<string> vec ) +{ + string retour ( "" ); + for ( vector<string>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour += ( *vecIter ); + } else { + retour += "\t" + ( *vecIter ); } - string vectorToString ( vector<char> vec ) - { - stringstream retour; - retour.str(""); - for ( vector<char>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour << ( *vecIter ); - } - else - { - retour << "\t" << ( *vecIter ); - } - } - return retour.str(); + } + return retour; +} +string vectorToString ( vector<char> vec ) +{ + stringstream retour; + retour.str(""); + for ( vector<char>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour << ( *vecIter ); + } else { + retour << "\t" << ( *vecIter ); } - string vectorToString ( vector<int> vec ) - { - stringstream retour; - retour.str(""); - for ( vector<int>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour << ( *vecIter ); - } - else - { - retour << "\t" << ( *vecIter ); - } - } - return retour.str(); + } + return retour.str(); +} +string vectorToString ( vector<int> vec ) +{ + stringstream retour; + retour.str(""); + for ( vector<int>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour << ( *vecIter ); + } else { + retour << "\t" << ( *vecIter ); } + } + return retour.str(); +} - string vectorToString ( vector< string > vec, string s ) - { - string retour ( "" ); - for ( vector<string>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour += ( *vecIter ); - } - else - { - retour += s + ( *vecIter ); - } - } - return retour; - +string vectorToString ( vector< string > vec, string s ) +{ + string retour ( "" ); + for ( vector<string>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour += ( *vecIter ); + } else { + retour += s + ( *vecIter ); } + } + return retour; - string vectorToString ( vector< char > vec, string s ) - { - stringstream retour; - retour.str(""); - for ( vector<char>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour << ( *vecIter ); - } - else - { - retour << s << ( *vecIter ); - } - } - return retour.str(); +} +string vectorToString ( vector< char > vec, string s ) +{ + stringstream retour; + retour.str(""); + for ( vector<char>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour << ( *vecIter ); + } else { + retour << s << ( *vecIter ); } + } + return retour.str(); - string vectorToString ( vector< int > vec, string s ) - { - stringstream retour; - retour.str(""); - for ( vector<int>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour << ( *vecIter ); - } - else - { - retour << s << ( *vecIter ); - } - } - return retour.str(); +} +string vectorToString ( vector< int > vec, string s ) +{ + stringstream retour; + retour.str(""); + for ( vector<int>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour << ( *vecIter ); + } else { + retour << s << ( *vecIter ); } + } + return retour.str(); - string vectorToString ( vector< bool > vec, string s ) - { - stringstream retour; - retour.str(""); - for ( vector<bool>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ ) - { - if ( vecIter == vec.begin() ) - { - retour << ( *vecIter ); - } - else - { - retour << s << ( *vecIter ); - } - } - return retour.str(); +} +string vectorToString ( vector< bool > vec, string s ) +{ + stringstream retour; + retour.str(""); + for ( vector<bool>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) { + if ( vecIter == vec.begin() ) { + retour << ( *vecIter ); + } else { + retour << s << ( *vecIter ); } - string vectorToString ( char* vec, string s , int taille) - { - stringstream retour; - retour.str(""); - int l_i; - for ( l_i=0; l_i < taille ; l_i++) - { - if ( l_i == 0 ) - { - retour << vec[l_i]; - } - else - { - retour << s << vec[l_i]; - } - } - return retour.str(); + } + return retour.str(); +} +string vectorToString ( char* vec, string s , int taille) +{ + stringstream retour; + retour.str(""); + int l_i; + for ( l_i=0; l_i < taille ; l_i++) { + if ( l_i == 0 ) { + retour << vec[l_i]; + } else { + retour << s << vec[l_i]; } + } + return retour.str(); - string vectorToString ( int* vec, string s , int taille) - { - stringstream retour; - retour.str(""); - int l_i; - for ( l_i=0; l_i < taille ; l_i++) - { - if ( l_i == 0 ) - { - retour << vec[l_i]; - } - else - { - retour << s << vec[l_i]; - } - } - return retour.str(); +} +string vectorToString ( int* vec, string s , int taille) +{ + stringstream retour; + retour.str(""); + int l_i; + for ( l_i=0; l_i < taille ; l_i++) { + if ( l_i == 0 ) { + retour << vec[l_i]; + } else { + retour << s << vec[l_i]; } + } + return retour.str(); - string vectorToString ( bool* vec, string s , int taille) - { - stringstream retour; - retour.str(""); - int l_i; - for ( l_i=0; l_i < taille ; l_i++) - { - if ( l_i == 0 ) - { - retour << vec[l_i]; - } - else - { - retour << s << vec[l_i]; - } - } - return retour.str(); +} +string vectorToString ( bool* vec, string s , int taille) +{ + stringstream retour; + retour.str(""); + int l_i; + for ( l_i=0; l_i < taille ; l_i++) { + if ( l_i == 0 ) { + retour << vec[l_i]; + } else { + retour << s << vec[l_i]; } - - vector<string> subVector ( vector<string> vec, int start, int end ) - { - vector<string> retour; - if ( start > end ) - { - cerr << "ERREUR : TERcalc::subVector : end > start" << endl; - exit ( 0 ); - } - for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) - { - retour.push_back ( vec.at ( i ) ); - } - return retour; - } - - vector<int> subVector ( vector<int> vec, int start, int end ) - { - vector<int> retour; - if ( start > end ) - { - cerr << "ERREUR : TERcalc::subVector : end > start" << endl; - exit ( 0 ); - } - for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) - { - retour.push_back ( vec.at ( i ) ); - } - return retour; - } - - vector<float> subVector ( vector<float> vec, int start, int end ) - { - vector<float> retour; - if ( start > end ) - { - cerr << "ERREUR : TERcalc::subVector : end > start" << endl; - exit ( 0 ); - } - for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) - { - retour.push_back ( vec.at ( i ) ); - } - return retour; - } - - vector<string> copyVector ( vector<string> vec ) - { - vector<string> retour; - for ( int i = 0; i < ( int ) vec.size(); i++ ) - { - retour.push_back ( vec.at ( i ) ); - } - return retour; + } + return retour.str(); + +} + +vector<string> subVector ( vector<string> vec, int start, int end ) +{ + vector<string> retour; + if ( start > end ) { + cerr << "ERREUR : TERcalc::subVector : end > start" << endl; + exit ( 0 ); + } + for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) { + retour.push_back ( vec.at ( i ) ); + } + return retour; +} + +vector<int> subVector ( vector<int> vec, int start, int end ) +{ + vector<int> retour; + if ( start > end ) { + cerr << "ERREUR : TERcalc::subVector : end > start" << endl; + exit ( 0 ); + } + for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) { + retour.push_back ( vec.at ( i ) ); + } + return retour; +} + +vector<float> subVector ( vector<float> vec, int start, int end ) +{ + vector<float> retour; + if ( start > end ) { + cerr << "ERREUR : TERcalc::subVector : end > start" << endl; + exit ( 0 ); + } + for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) { + retour.push_back ( vec.at ( i ) ); + } + return retour; +} + +vector<string> copyVector ( vector<string> vec ) +{ + vector<string> retour; + for ( int i = 0; i < ( int ) vec.size(); i++ ) { + retour.push_back ( vec.at ( i ) ); + } + return retour; +} +vector<int> copyVector ( vector<int> vec ) +{ + vector<int> retour; + for ( int i = 0; i < ( int ) vec.size(); i++ ) { + retour.push_back ( vec.at ( i ) ); + } + return retour; +} +vector<float> copyVector ( vector<float> vec ) +{ + vector<float> retour; + for ( int i = 0; i < ( int ) vec.size(); i++ ) { + retour.push_back ( vec.at ( i ) ); + } + return retour; +} +vector<string> stringToVector ( string s, string tok ) +{ + vector<string> to_return; + string to_push ( "" ); + bool pushed = false; + string::iterator sIt; + for ( sIt = s.begin(); sIt < s.end(); sIt++ ) { + pushed = false; + for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) { + if ( ( *sIt ) == ( *sTok ) ) { + to_return.push_back ( to_push ); + to_push = ""; + pushed = true; + } } - vector<int> copyVector ( vector<int> vec ) - { - vector<int> retour; - for ( int i = 0; i < ( int ) vec.size(); i++ ) - { - retour.push_back ( vec.at ( i ) ); - } - return retour; + if ( !pushed ) { + to_push.push_back ( ( *sIt ) ); } - vector<float> copyVector ( vector<float> vec ) - { - vector<float> retour; - for ( int i = 0; i < ( int ) vec.size(); i++ ) - { - retour.push_back ( vec.at ( i ) ); + } + to_return.push_back ( to_push ); + return to_return; +} +vector<int> stringToVectorInt ( string s, string tok ) +{ + vector<int> to_return; + string to_push ( "" ); + bool pushed = false; + string::iterator sIt; + for ( sIt = s.begin(); sIt < s.end(); sIt++ ) { + pushed = false; + for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) { + if ( ( *sIt ) == ( *sTok ) ) { + if ( ( int ) to_push.length() > 0 ) { + to_return.push_back ( atoi ( to_push.c_str() ) ); } - return retour; + to_push = ""; + pushed = true; + } } - vector<string> stringToVector ( string s, string tok ) - { - vector<string> to_return; - string to_push ( "" ); - bool pushed = false; - string::iterator sIt; - for ( sIt = s.begin(); sIt < s.end(); sIt++ ) - { - pushed = false; - for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) - { - if ( ( *sIt ) == ( *sTok ) ) - { - to_return.push_back ( to_push ); - to_push = ""; - pushed = true; - } - } - if ( !pushed ) - { - to_push.push_back ( ( *sIt ) ); - } - } - to_return.push_back ( to_push ); - return to_return; + if ( !pushed ) { + to_push.push_back ( ( *sIt ) ); } - vector<int> stringToVectorInt ( string s, string tok ) - { - vector<int> to_return; - string to_push ( "" ); - bool pushed = false; - string::iterator sIt; - for ( sIt = s.begin(); sIt < s.end(); sIt++ ) - { - pushed = false; - for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) - { - if ( ( *sIt ) == ( *sTok ) ) - { - if ( ( int ) to_push.length() > 0 ) - { - to_return.push_back ( atoi ( to_push.c_str() ) ); - } - to_push = ""; - pushed = true; - } - } - if ( !pushed ) - { - to_push.push_back ( ( *sIt ) ); - } - } - if ( ( int ) to_push.length() > 0 ) - { - to_return.push_back ( atoi ( to_push.c_str() ) ); + } + if ( ( int ) to_push.length() > 0 ) { + to_return.push_back ( atoi ( to_push.c_str() ) ); + } + return to_return; +} +vector<float> stringToVectorFloat ( string s, string tok ) +{ + vector<float> to_return; + string to_push ( "" ); + bool pushed = false; + string::iterator sIt; + for ( sIt = s.begin(); sIt < s.end(); sIt++ ) { + pushed = false; + for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) { + if ( ( *sIt ) == ( *sTok ) ) { + if ( ( int ) to_push.length() > 0 ) { + to_return.push_back ( atof ( to_push.c_str() ) ); } - return to_return; + to_push = ""; + pushed = true; + } } - vector<float> stringToVectorFloat ( string s, string tok ) - { - vector<float> to_return; - string to_push ( "" ); - bool pushed = false; - string::iterator sIt; - for ( sIt = s.begin(); sIt < s.end(); sIt++ ) - { - pushed = false; - for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) - { - if ( ( *sIt ) == ( *sTok ) ) - { - if ( ( int ) to_push.length() > 0 ) - { - to_return.push_back ( atof ( to_push.c_str() ) ); - } - to_push = ""; - pushed = true; - } - } - if ( !pushed ) - { - to_push.push_back ( ( *sIt ) ); - } - } - if ( ( int ) to_push.length() > 0 ) - { - to_return.push_back ( atoi ( to_push.c_str() ) ); - } - return to_return; + if ( !pushed ) { + to_push.push_back ( ( *sIt ) ); } + } + if ( ( int ) to_push.length() > 0 ) { + to_return.push_back ( atoi ( to_push.c_str() ) ); + } + return to_return; +} - string lowerCase ( string str ) - { - for ( int i = 0;i < ( int ) str.size();i++ ) - { - if ( ( str[i] >= 0x41 ) && ( str[i] <= 0x5A ) ) - { - str[i] = str[i] + 0x20; - } - } - return str; +string lowerCase ( string str ) +{ + for ( int i = 0; i < ( int ) str.size(); i++ ) { + if ( ( str[i] >= 0x41 ) && ( str[i] <= 0x5A ) ) { + str[i] = str[i] + 0x20; } - string removePunctTercom ( string str ) - { - string str_mod = str; - sregex rex; - string replace; + } + return str; +} +string removePunctTercom ( string str ) +{ + string str_mod = str; + sregex rex; + string replace; - rex = sregex::compile ( "^[ ]+" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "^[ ]+" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\"]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\"]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[,]" ); - replace = " "; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[,]" ); + replace = " "; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); - replace = ( "$1 $3" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); + replace = ( "$1 $3" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); - replace = ( "$1 $3" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); + replace = ( "$1 $3" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); - replace = ( "$1 $3" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); + replace = ( "$1 $3" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([\\.]$)" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([\\.]$)" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\?]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\?]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\;]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\;]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\:]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\:]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\!]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\!]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\(]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\(]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\)]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\)]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[ ]+" ); - replace = " "; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[ ]+" ); + replace = " "; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[ ]+$" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[ ]+$" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - return str_mod; - } - string removePunct ( string str ) - { - string str_mod = str; - sregex rex; - string replace; + return str_mod; +} +string removePunct ( string str ) +{ + string str_mod = str; + sregex rex; + string replace; - rex = sregex::compile ( "^[ ]+" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "^[ ]+" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\"]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\"]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[,]" ); - replace = " "; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[,]" ); + replace = " "; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); - replace = ( "$1 $3" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); + replace = ( "$1 $3" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); - replace = ( "$1 $3" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); + replace = ( "$1 $3" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); - replace = ( "$1 $3" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" ); + replace = ( "$1 $3" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "([\\.]$)" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "([\\.]$)" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\?]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\?]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\;]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\;]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\:]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\:]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\!]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\!]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\(]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\(]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\)]" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\)]" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[ ]+" ); - replace = " "; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[ ]+" ); + replace = " "; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[ ]+$" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[ ]+$" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "^[ ]+" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "^[ ]+" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - return str_mod; - } - string tokenizePunct ( string str ) - { - string str_mod = str; - sregex rex = sregex::compile ( "(([^0-9])([\\,])([^0-9]))" ); - string replace ( "$2 $3 $4" ); - str_mod = regex_replace ( str_mod, rex, replace ); + return str_mod; +} +string tokenizePunct ( string str ) +{ + string str_mod = str; + sregex rex = sregex::compile ( "(([^0-9])([\\,])([^0-9]))" ); + string replace ( "$2 $3 $4" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(([^0-9])([\\.])([^0-9]))" ); - replace = ( "$2 $3 $4" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(([^0-9])([\\.])([^0-9]))" ); + replace = ( "$2 $3 $4" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.]) )" ); - replace = ( " $2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.]) )" ); + replace = ( " $2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.])$)" ); - replace = ( " $2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.])$)" ); + replace = ( " $2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^([A-Z]|[a-z]) ([\\.]) )" ); - replace = ( " $2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^([A-Z]|[a-z]) ([\\.]) )" ); + replace = ( " $2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(([A-Z]|[a-z])([\\.]) ([A-Z]|[a-z])([\\.]) )" ); - replace = ( "$2.$4. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(([A-Z]|[a-z])([\\.]) ([A-Z]|[a-z])([\\.]) )" ); + replace = ( "$2.$4. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\?]" ); - replace = ( " ? " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\?]" ); + replace = ( " ? " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\;]" ); - replace = ( " ; " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\;]" ); + replace = ( " ; " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(([^0-9])([\\:])([^0-9]))" ); - replace = ( "$2 $3 $4" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(([^0-9])([\\:])([^0-9]))" ); + replace = ( "$2 $3 $4" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\!]" ); - replace = ( " ! " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\!]" ); + replace = ( " ! " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\(]" ); - replace = ( " ( " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\(]" ); + replace = ( " ( " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\\)]" ); - replace = ( " ) " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\\)]" ); + replace = ( " ) " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[\"]" ); - replace = ( " \" " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[\"]" ); + replace = ( " \" " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(num_ \\( ([^\\)]+) \\))" ); - replace = ( "num_($2)" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(num_ \\( ([^\\)]+) \\))" ); + replace = ( "num_($2)" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(ordinal_ \\( ([^\\)]*) \\))" ); - replace = ( "ordinal_($2)" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(ordinal_ \\( ([^\\)]*) \\))" ); + replace = ( "ordinal_($2)" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^([Mm]) \\.)" ); - replace = ( "$2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^([Mm]) \\.)" ); + replace = ( "$2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([Mm]) \\.)" ); - replace = ( " $2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([Mm]) \\.)" ); + replace = ( " $2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^([Dd]r) \\.)" ); - replace = ( "$2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^([Dd]r) \\.)" ); + replace = ( "$2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([Dd]r) \\.)" ); - replace = ( " $2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([Dd]r) \\.)" ); + replace = ( " $2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^([Mm]r) \\.)" ); - replace = ( "$2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^([Mm]r) \\.)" ); + replace = ( "$2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([Mm]r) \\.)" ); - replace = ( " $2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([Mm]r) \\.)" ); + replace = ( " $2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^([Mm]rs) \\.)" ); - replace = ( "$2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^([Mm]rs) \\.)" ); + replace = ( "$2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([Mm]rs) \\.)" ); - replace = ( " $2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([Mm]rs) \\.)" ); + replace = ( " $2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^([Nn]o) \\.)" ); - replace = ( "$2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^([Nn]o) \\.)" ); + replace = ( "$2." ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( ([Nn]o) \\.)" ); - replace = ( " $2." ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( ([Nn]o) \\.)" ); + replace = ( " $2." ); + str_mod = regex_replace ( str_mod, rex, replace ); // rex = sregex::compile ( "(^(([Jj]an)|([Ff]ev)|([Mm]ar)|([Aa]pr)|([Jj]un)|([Jj]ul)|([Aa]ug)|([Ss]ept)|([Oo]ct)|([Nn]ov)|([Dd]ec)) \\.)" ); // replace = ( "$2." ); // str_mod = regex_replace ( str_mod, rex, replace ); -// +// // rex = sregex::compile ( "( (([Jj]an)|([Ff]ev)|([Mm]ar)|([Aa]pr)|([Jj]un)|([Jj]ul)|([Aa]ug)|([Ss]ept)|([Oo]ct)|([Nn]ov)|([Dd]ec)) \\.)" ); // replace = ( " $2." ); // str_mod = regex_replace ( str_mod, rex, replace ); -// +// // rex = sregex::compile ( "(^(([Gg]en)|([Cc]ol)) \\.)" ); // replace = ( "$2." ); // str_mod = regex_replace ( str_mod, rex, replace ); -// +// // rex = sregex::compile ( "( (([Gg]en)|([Cc]ol)) \\.)" ); // replace = ( " $2." ); // str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^(([A-Z][a-z])) \\. )" ); - replace = ( "$2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^(([A-Z][a-z])) \\. )" ); + replace = ( "$2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( (([A-Z][a-z])) \\. )" ); - replace = ( " $2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( (([A-Z][a-z])) \\. )" ); + replace = ( " $2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "(^(([A-Z][a-z][a-z])) \\. )" ); - replace = ( "$2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "(^(([A-Z][a-z][a-z])) \\. )" ); + replace = ( "$2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "( (([A-Z][a-z][a-z])) \\. )" ); - replace = ( " $2. " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "( (([A-Z][a-z][a-z])) \\. )" ); + replace = ( " $2. " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[ ]+" ); - replace = " "; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[ ]+" ); + replace = " "; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "^[ ]+" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "^[ ]+" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "[ ]+$" ); - replace = ""; - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "[ ]+$" ); + replace = ""; + str_mod = regex_replace ( str_mod, rex, replace ); - return str_mod; - } + return str_mod; +} - string normalizeStd ( string str ) - { - string str_mod = str; - sregex rex = sregex::compile ( "(<skipped>)" ); - string replace ( "" ); - str_mod = regex_replace ( str_mod, rex, replace ); +string normalizeStd ( string str ) +{ + string str_mod = str; + sregex rex = sregex::compile ( "(<skipped>)" ); + string replace ( "" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "-\n" ); - replace = ( "" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "-\n" ); + replace = ( "" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "\n" ); - replace = ( " " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "\n" ); + replace = ( " " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( """ ); - replace = ( "\"" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( """ ); + replace = ( "\"" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "&" ); - replace = ( "& " ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "&" ); + replace = ( "& " ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( "<" ); - replace = ( "<" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( "<" ); + replace = ( "<" ); + str_mod = regex_replace ( str_mod, rex, replace ); - rex = sregex::compile ( ">" ); - replace = ( ">" ); - str_mod = regex_replace ( str_mod, rex, replace ); + rex = sregex::compile ( ">" ); + replace = ( ">" ); + str_mod = regex_replace ( str_mod, rex, replace ); - return str_mod; - } + return str_mod; +} - param copyParam ( param p ) - { - param to_return; - to_return.caseOn = p.caseOn; - to_return.noPunct = p.noPunct; - to_return.debugMode = p.debugMode; - to_return.debugLevel = p.debugLevel; - to_return.hypothesisFile = p.hypothesisFile; - to_return.referenceFile = p.referenceFile; - to_return.normalize = p.normalize; - to_return.noTxtIds = p.noTxtIds; - to_return.outputFileExtension = p.outputFileExtension; - to_return.outputFileName = p.outputFileName; - to_return.sgmlInputs = p.sgmlInputs; - to_return.tercomLike = p.tercomLike; - to_return.printAlignments = p.printAlignments; - to_return.WER=p.WER; - return to_return; - } - string printParams ( param p ) - { - stringstream s; - s << "caseOn = " << p.caseOn << endl; - s << "noPunct = " << p.noPunct << endl; - s << "debugMode = " << p.debugMode << endl; - s << "debugLevel = " << p.debugLevel << endl; - s << "hypothesisFile = " << p.hypothesisFile << endl; - s << "referenceFile = " << p.referenceFile << endl; - s << "normalize = " << p.normalize << endl; - s << "noTxtIds = " << p.noTxtIds << endl; - s << "outputFileExtension = " << p.outputFileExtension << endl; - s << "outputFileName = " << p.outputFileName << endl; - s << "sgmlInputs = " << p.sgmlInputs << endl; - s << "tercomLike = " << p.tercomLike << endl; - return s.str(); +param copyParam ( param p ) +{ + param to_return; + to_return.caseOn = p.caseOn; + to_return.noPunct = p.noPunct; + to_return.debugMode = p.debugMode; + to_return.debugLevel = p.debugLevel; + to_return.hypothesisFile = p.hypothesisFile; + to_return.referenceFile = p.referenceFile; + to_return.normalize = p.normalize; + to_return.noTxtIds = p.noTxtIds; + to_return.outputFileExtension = p.outputFileExtension; + to_return.outputFileName = p.outputFileName; + to_return.sgmlInputs = p.sgmlInputs; + to_return.tercomLike = p.tercomLike; + to_return.printAlignments = p.printAlignments; + to_return.WER=p.WER; + return to_return; +} +string printParams ( param p ) +{ + stringstream s; + s << "caseOn = " << p.caseOn << endl; + s << "noPunct = " << p.noPunct << endl; + s << "debugMode = " << p.debugMode << endl; + s << "debugLevel = " << p.debugLevel << endl; + s << "hypothesisFile = " << p.hypothesisFile << endl; + s << "referenceFile = " << p.referenceFile << endl; + s << "normalize = " << p.normalize << endl; + s << "noTxtIds = " << p.noTxtIds << endl; + s << "outputFileExtension = " << p.outputFileExtension << endl; + s << "outputFileName = " << p.outputFileName << endl; + s << "sgmlInputs = " << p.sgmlInputs << endl; + s << "tercomLike = " << p.tercomLike << endl; + return s.str(); - } - string join ( string delim, vector<string> arr ) - { - if ( ( int ) arr.size() == 0 ) return ""; +} +string join ( string delim, vector<string> arr ) +{ + if ( ( int ) arr.size() == 0 ) return ""; // if ((int)delim.compare("") == 0) delim = new String(""); // String s = new String(""); - stringstream s; - s.str ( "" ); - for ( int i = 0; i < ( int ) arr.size(); i++ ) - { - if ( i == 0 ) - { - s << arr.at ( i ); - } - else - { - s << delim << arr.at ( i ); - } - } - return s.str(); -// return ""; + stringstream s; + s.str ( "" ); + for ( int i = 0; i < ( int ) arr.size(); i++ ) { + if ( i == 0 ) { + s << arr.at ( i ); + } else { + s << delim << arr.at ( i ); } + } + return s.str(); +// return ""; +} } diff --git a/mert/TER/tools.h b/mert/TER/tools.h index 0a85e7b4b..157b739a5 100644 --- a/mert/TER/tools.h +++ b/mert/TER/tools.h @@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it -under the terms of the GNU Lesser General Public License as published by +under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. @@ -35,32 +35,31 @@ using namespace std; namespace Tools { - typedef vector<double> vecDouble; - typedef vector<char> vecChar; - typedef vector<int> vecInt; - typedef vector<float> vecFloat; - typedef vector<size_t> vecSize_t; - typedef vector<string> vecString; - typedef vector<string> alignmentElement; - typedef vector<alignmentElement> WERalignment; +typedef vector<double> vecDouble; +typedef vector<char> vecChar; +typedef vector<int> vecInt; +typedef vector<float> vecFloat; +typedef vector<size_t> vecSize_t; +typedef vector<string> vecString; +typedef vector<string> alignmentElement; +typedef vector<alignmentElement> WERalignment; -struct param -{ - bool debugMode; - string referenceFile; // path to the resources - string hypothesisFile; // path to the configuration files - string outputFileExtension; - string outputFileName; - bool noPunct; - bool caseOn; - bool normalize; - bool tercomLike; - bool sgmlInputs; - bool noTxtIds; - bool printAlignments; - bool WER; - int debugLevel; +struct param { + bool debugMode; + string referenceFile; // path to the resources + string hypothesisFile; // path to the configuration files + string outputFileExtension; + string outputFileName; + bool noPunct; + bool caseOn; + bool normalize; + bool tercomLike; + bool sgmlInputs; + bool noTxtIds; + bool printAlignments; + bool WER; + int debugLevel; }; // param = { false, "","","","" }; @@ -68,35 +67,35 @@ struct param // private: // public: - string vectorToString ( vector<string> vec ); - string vectorToString ( vector<char> vec ); - string vectorToString ( vector<int> vec ); - string vectorToString ( vector<string> vec, string s ); - string vectorToString ( vector<char> vec, string s ); - string vectorToString ( vector<int> vec, string s ); - string vectorToString ( vector<bool> vec, string s ); - string vectorToString ( char* vec, string s, int taille ); - string vectorToString ( int* vec, string s , int taille ); - string vectorToString ( bool* vec, string s , int taille ); - vector<string> subVector ( vector<string> vec, int start, int end ); - vector<int> subVector ( vector<int> vec, int start, int end ); - vector<float> subVector ( vector<float> vec, int start, int end ); - vector<string> copyVector ( vector<string> vec ); - vector<int> copyVector ( vector<int> vec ); - vector<float> copyVector ( vector<float> vec ); - vector<string> stringToVector ( string s, string tok ); - vector<string> stringToVector ( char s, string tok ); - vector<string> stringToVector ( int s, string tok ); - vector<int> stringToVectorInt ( string s, string tok ); - vector<float> stringToVectorFloat ( string s, string tok ); - string lowerCase(string str); - string removePunct(string str); - string tokenizePunct(string str); - string removePunctTercom(string str); - string normalizeStd(string str); - string printParams(param p); - string join ( string delim, vector<string> arr ); +string vectorToString ( vector<string> vec ); +string vectorToString ( vector<char> vec ); +string vectorToString ( vector<int> vec ); +string vectorToString ( vector<string> vec, string s ); +string vectorToString ( vector<char> vec, string s ); +string vectorToString ( vector<int> vec, string s ); +string vectorToString ( vector<bool> vec, string s ); +string vectorToString ( char* vec, string s, int taille ); +string vectorToString ( int* vec, string s , int taille ); +string vectorToString ( bool* vec, string s , int taille ); +vector<string> subVector ( vector<string> vec, int start, int end ); +vector<int> subVector ( vector<int> vec, int start, int end ); +vector<float> subVector ( vector<float> vec, int start, int end ); +vector<string> copyVector ( vector<string> vec ); +vector<int> copyVector ( vector<int> vec ); +vector<float> copyVector ( vector<float> vec ); +vector<string> stringToVector ( string s, string tok ); +vector<string> stringToVector ( char s, string tok ); +vector<string> stringToVector ( int s, string tok ); +vector<int> stringToVectorInt ( string s, string tok ); +vector<float> stringToVectorFloat ( string s, string tok ); +string lowerCase(string str); +string removePunct(string str); +string tokenizePunct(string str); +string removePunctTercom(string str); +string normalizeStd(string str); +string printParams(param p); +string join ( string delim, vector<string> arr ); // }; - param copyParam(param p); +param copyParam(param p); } #endif diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp index caae07684..7ab03c7eb 100644 --- a/mert/evaluator.cpp +++ b/mert/evaluator.cpp @@ -43,7 +43,8 @@ private: }; // load hypothesis from candidate output -vector<ScoreStats> EvaluatorUtil::loadCand(const string& candFile) { +vector<ScoreStats> EvaluatorUtil::loadCand(const string& candFile) +{ ifstream cand(candFile.c_str()); if (!cand.good()) throw runtime_error("Error opening candidate file"); @@ -61,7 +62,8 @@ vector<ScoreStats> EvaluatorUtil::loadCand(const string& candFile) { } // load 1-best hypothesis from n-best file (useful if relying on alignment/tree information) -vector<ScoreStats> EvaluatorUtil::loadNBest(const string& nBestFile) { +vector<ScoreStats> EvaluatorUtil::loadNBest(const string& nBestFile) +{ vector<ScoreStats> entries; Data data(g_scorer); @@ -81,8 +83,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i if (nbest_input) { entries = loadNBest(candFile); - } - else { + } else { entries = loadCand(candFile); } diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp index 0abce8af4..5a119e875 100644 --- a/mert/kbmira.cpp +++ b/mert/kbmira.cpp @@ -77,7 +77,7 @@ int main(int argc, char** argv) bool model_bg = false; // Use model for background corpus bool verbose = false; // Verbose updates bool safe_hope = false; // Model score cannot have more than BLEU_RATIO times more influence than BLEU - size_t hgPruning = 50; //prune hypergraphs to have this many edges per reference word + size_t hgPruning = 50; //prune hypergraphs to have this many edges per reference word // Command-line processing follows pro.cpp po::options_description desc("Allowed options"); @@ -157,7 +157,7 @@ int main(int argc, char** argv) do { size_t equals = buffer.find_last_of("="); UTIL_THROW_IF(equals == buffer.npos, util::Exception, "Incorrect format in dense feature file: '" - << buffer << "'"); + << buffer << "'"); string name = buffer.substr(0,equals); names.push_back(name); initParams.push_back(boost::lexical_cast<ValType>(buffer.substr(equals+2))); @@ -183,7 +183,7 @@ int main(int argc, char** argv) //Make sure that SparseVector encodes dense feature names as 0..n-1. for (size_t i = 0; i < names.size(); ++i) { size_t id = SparseVector::encode(names[i]); - assert(id == i); + assert(id == i); if (verbose) cerr << names[i] << " " << initParams[i] << endl; } @@ -246,12 +246,12 @@ int main(int argc, char** argv) int iNumUpdates = 0; ValType totalLoss = 0.0; size_t sentenceIndex = 0; - for(decoder->reset();!decoder->finished(); decoder->next()) { + for(decoder->reset(); !decoder->finished(); decoder->next()) { HopeFearData hfd; decoder->HopeFear(bg,wv,&hfd); - + // Update weights - if (!hfd.hopeFearEqual && hfd.hopeBleu > hfd.fearBleu) { + if (!hfd.hopeFearEqual && hfd.hopeBleu > hfd.fearBleu) { // Vector difference MiraFeatureVector diff = hfd.hopeFeatures - hfd.fearFeatures; // Bleu difference |