diff options
author | hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-14 14:23:14 +0400 |
---|---|---|
committer | hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-14 14:23:14 +0400 |
commit | 149208ecbaed011a4555c55fcd5b828b4a9a8cc0 (patch) | |
tree | 355d3446d370a38777d2866c1c1d0c7aea526da2 /scripts | |
parent | d68274d2178196a4d1f9ffa3347336eec0e50f93 (diff) |
print out span widths of non-terms. Extra argument --OutputNTLengths
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/training/phrase-extract/PhraseAlignment.cpp | 77 | ||||
-rw-r--r-- | scripts/training/phrase-extract/PhraseAlignment.h | 4 | ||||
-rw-r--r-- | scripts/training/phrase-extract/score.cpp | 19 |
3 files changed, 97 insertions, 3 deletions
```diff
diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp
index c6e6fb92c..6ff7cc97a 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.cpp
+++ b/scripts/training/phrase-extract/PhraseAlignment.cpp
@@ -7,6 +7,7 @@
  *
  */
 
+#include <sstream>
 #include "PhraseAlignment.h"
 #include "SafeGetline.h"
 #include "tables-core.h"
@@ -19,6 +20,59 @@ extern Vocabulary vcbS;
 
 extern bool hierarchicalFlag;
 
+//! convert string to variable of type T. Used to reading floats, int etc from files
+template<typename T>
+inline T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  T ret;
+  stream >> ret;
+  return ret;
+}
+
+
+//! speeded up version of above
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
+{
+  output.resize(input.size());
+  for (size_t i = 0 ; i < input.size() ; i++)
+  {
+    output[i] = Scan<T>( input[i] );
+  }
+}
+
+
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  // Skip delimiters at beginning.
+  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
+  // Find first "non-delimiter".
+  std::string::size_type pos = str.find_first_of(delimiters, lastPos);
+
+  while (std::string::npos != pos || std::string::npos != lastPos) {
+    // Found a token, add it to the vector.
+    output.push_back(str.substr(lastPos, pos - lastPos));
+    // Skip delimiters. Note the "not_of"
+    lastPos = str.find_first_not_of(delimiters, pos);
+    // Find next "non-delimiter"
+    pos = str.find_first_of(delimiters, lastPos);
+  }
+}
+
+// speeded up version of above
+template<typename T>
+inline void Tokenize( std::vector<T> &output
+                      , const std::string &input
+                      , const std::string& delimiters = " \t")
+{
+  std::vector<std::string> stringVector;
+  Tokenize(stringVector, input, delimiters);
+  return Scan<T>(output, stringVector );
+}
+
 // read in a phrase pair and store it
 void PhraseAlignment::create( char line[], int lineID )
 {
@@ -37,7 +91,6 @@ void PhraseAlignment::create( char line[], int lineID )
     else if (item == 2) { // target phrase
       phraseT.push_back( vcbT.storeIfNew( token[j] ) );
     }
-
     else if (item == 3) { // alignment
       int s,t;
       sscanf(token[j].c_str(), "%d-%d", &s, &t);
@@ -56,6 +109,9 @@ void PhraseAlignment::create( char line[], int lineID )
     } else if (item == 4) { // count
       sscanf(token[j].c_str(), "%f", &count);
     }
+    else if (item == 5) { // non-term lengths
+      addNTLength(token[j]);
+    }
   }
 
   createAlignVec(phraseS.size(), phraseT.size());
@@ -63,11 +119,28 @@ void PhraseAlignment::create( char line[], int lineID )
   if (item == 3) {
     count = 1.0;
   }
-  if (item < 3 || item > 4) {
+  if (item < 3 || item > 5) {
     cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
   }
 }
 
+void PhraseAlignment::addNTLength(const std::string &tok)
+{
+  vector< string > tokens;
+
+  Tokenize(tokens, tok, "=");
+  assert(tokens.size() == 2);
+
+  size_t sourcePos = Scan<size_t>(tokens[0]);
+  assert(sourcePos < phraseS.size());
+
+  vector< size_t > ntLengths;
+  Tokenize<size_t>(ntLengths, tokens[1], ",");
+  assert(ntLengths.size() == 2);
+
+  m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);
+}
+
 void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
 {
   // in case of no align info. always need align info, even if blank
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index d3557e4a8..68b82563a 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -11,6 +11,7 @@
 
 #include <vector>
 #include <set>
+#include <map>
 
 // data structure for a single phrase pair
 class PhraseAlignment
@@ -19,7 +20,10 @@ protected:
   PHRASE phraseS;
   PHRASE phraseT;
 
+  std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
+
   void createAlignVec(size_t sourceSize, size_t targetSize);
+  void addNTLength(const std::string &tok);
 public:
   float count;
   std::vector< std::set<size_t> > alignedToT;
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index fbb27b944..5ae714aa4 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -65,6 +65,7 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment
 set<string> functionWordList;
 void loadFunctionWords( const char* fileNameFunctionWords );
 double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePair, map<size_t, pair<float, float> > &ntLengthsProb);
 
 LexicalTable lexTable;
 bool inverseFlag = false;
@@ -78,6 +79,7 @@ int negLogProb = 1;
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
+bool outputNTLengths = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
@@ -148,6 +150,8 @@ int main(int argc, char* argv[])
       minCountHierarchical = atof(argv[++i]);
       cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
       minCountHierarchical -= 0.00001; // account for rounding
+    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
+      outputNTLengths = true;
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
@@ -325,12 +329,17 @@ PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
   return bestAlignment;
 }
 
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePair, map<size_t, pair<float, float> > &ntLengthsProb)
+{
+
+}
+
 void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
 {
   if (phrasePair.size() == 0) return;
 
   PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
-  
+
   // compute count
   float count = 0;
   for(size_t i=0; i<phrasePair.size(); i++) {
@@ -433,6 +442,14 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount,
   phraseTableFile << " ||| " << totalCount << " " << count;
   if (kneserNeyFlag)
     phraseTableFile << " " << distinctCount;
+
+  // nt lengths
+  map<size_t, pair<float, float> > ntLengthsProb;
+  if (outputNTLengths)
+  {
+    calcNTLengthProb(phrasePair, ntLengthsProb);
+  }
+
   phraseTableFile << endl;
 }
 
```