/***********************************************************************
  Moses - factored phrase-based language decoder
  Copyright (C) 2010 University of Edinburgh

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/

#include "SentenceAlignment.h"

// NOTE(review): the names of the three standard headers originally listed
// here were not recoverable; these are the standard headers this file uses
// directly. Confirm against version control.
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include "tables-core.h"

using namespace std;

namespace MosesTraining
{

SentenceAlignment::~SentenceAlignment() {}

/**
 * Wrap a token sequence in boundary markers: one marker is inserted before
 * the first token and one is appended after the last.
 *
 * NOTE(review): both markers are the literal "~~" here. Moses conventionally
 * uses distinct sentence-start / sentence-end markers; confirm that these
 * literals are what downstream consumers expect.
 *
 * @param phrase token sequence, modified in place (grows by two elements).
 */
void addBoundaryWords(vector<string> &phrase)
{
  phrase.insert(phrase.begin(), "~~");
  phrase.push_back("~~");
}

/**
 * Tokenize the target sentence into the `target` member.
 *
 * @param targetString  raw target-side sentence.
 * @param (unnamed int) sentence ID; unused in this implementation.
 * @param boundaryRules when true, boundary markers are added around the tokens.
 * @return always true in this implementation.
 */
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
  target = tokenize(targetString);
  if (boundaryRules) {
    addBoundaryWords(target);
  }
  return true;
}

/**
 * Tokenize the source sentence into the `source` member.
 *
 * @param sourceString  raw source-side sentence.
 * @param (unnamed int) sentence ID; unused in this implementation.
 * @param boundaryRules when true, boundary markers are added around the tokens.
 * @return always true in this implementation.
 */
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
  source = tokenize(sourceString);
  if (boundaryRules) {
    addBoundaryWords(source);
  }
  return true;
}

/**
 * Populate this object from one sentence pair and its word alignment.
 *
 * Stores the sentence ID and weight string, tokenizes both sentences, then
 * reads the alignment points into `alignedToT` (for each target position, the
 * source positions aligned to it) and `alignedCountS` (for each source
 * position, the number of alignment points touching it).
 *
 * @param targetString    raw target sentence.
 * @param sourceString    raw source sentence.
 * @param alignmentString whitespace-separated alignment points, each written
 *                        as "<source index>-<target index>".
 * @param weightString    copied verbatim into the `weightString` member.
 * @param sentenceID      identifier stored on the object and used in diagnostics.
 * @param boundaryRules   when true, boundary markers are added to both
 *                        sentences, every alignment point is shifted by one to
 *                        account for the leading marker, and the two marker
 *                        pairs (first-first, last-last) are aligned to each other.
 * @return false (after writing a diagnostic to stderr) if either sentence is
 *         empty, an alignment point cannot be parsed, or an alignment point is
 *         out of bounds; true otherwise.
 */
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
{
  this->sentenceID = sentenceID;
  this->weightString = std::string(weightString);

  // process sentence strings and store in target and source members.
  if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
    return false;
  }
  if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
    return false;
  }

  // check if sentences are empty
  if (target.size() == 0 || source.size() == 0) {
    cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words in sentence " << sentenceID << endl;
    cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
    return false;
  }

  // prepare data structures for alignments: append one zero counter per
  // source word and one empty list of aligned source positions per target word
  alignedCountS.resize(alignedCountS.size() + source.size(), 0);
  alignedToT.resize(alignedToT.size() + target.size());

  // reading in alignments
  const vector<string> alignmentSequence = tokenize( alignmentString );
  for (size_t i = 0; i < alignmentSequence.size(); ++i) {
    int s = 0;
    int t = 0;

    // Both indices must be present; a partially parsed point would otherwise
    // leave the target index without a meaningful value.
    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return false;
    }

    // The leading boundary marker occupies position 0, so every real word
    // (and therefore every alignment point) moves one position to the right.
    if (boundaryRules) {
      ++s;
      ++t;
    }

    // Negative indices are rejected explicitly; the size comparison then
    // rejects anything past the end of either sentence.
    if (s < 0 || t < 0 || (size_t)t >= target.size() || (size_t)s >= source.size()) {
      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return false;
    }

    alignedToT[t].push_back( s );
    alignedCountS[s]++;
  }

  // Align the boundary markers with each other: first with first, last with last.
  if (boundaryRules) {
    alignedToT[0].push_back(0);
    alignedCountS[0]++;
    alignedToT.back().push_back(alignedCountS.size() - 1);
    alignedCountS.back()++;
  }

  return true;
}

}