Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLexi Birch <lexi.birch@gmail.com>2015-06-08 19:13:00 +0300
committerLexi Birch <lexi.birch@gmail.com>2015-06-08 19:13:00 +0300
commitb76194a16b3e2c070522751ff40762c3f8870bce (patch)
tree3d31e58d922981289fe1b825a057ee16f002f0e6 /phrase-extract
parent501c51947b192e8559fa35d820ebd951566bebba (diff)
parentc306715e828f23ffceefebde8e227fc1bd7ff4d0 (diff)
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/DomainFeature.cpp5
-rw-r--r--phrase-extract/DomainFeature.h6
-rw-r--r--phrase-extract/ExtractionPhrasePair.cpp2
-rw-r--r--phrase-extract/ExtractionPhrasePair.h2
-rw-r--r--phrase-extract/InternalStructFeature.h8
-rw-r--r--phrase-extract/OutputFileStream.cpp45
-rw-r--r--phrase-extract/OutputFileStream.h35
-rw-r--r--phrase-extract/PropertiesConsolidator.cpp24
-rw-r--r--phrase-extract/SentenceAlignment.cpp7
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.cpp8
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.h6
-rw-r--r--phrase-extract/SyntaxNode.h43
-rw-r--r--phrase-extract/SyntaxNodeCollection.cpp131
-rw-r--r--phrase-extract/SyntaxNodeCollection.h79
-rw-r--r--phrase-extract/SyntaxTree.cpp188
-rw-r--r--phrase-extract/SyntaxTree.h126
-rw-r--r--phrase-extract/XmlTree.cpp56
-rw-r--r--phrase-extract/XmlTree.h84
-rw-r--r--phrase-extract/consolidate-direct-main.cpp5
-rw-r--r--phrase-extract/consolidate-main.cpp288
-rw-r--r--phrase-extract/consolidate-reverse-main.cpp10
-rw-r--r--phrase-extract/extract-ghkm/Alignment.cpp4
-rw-r--r--phrase-extract/extract-ghkm/Alignment.h4
-rw-r--r--phrase-extract/extract-ghkm/AlignmentGraph.cpp36
-rw-r--r--phrase-extract/extract-ghkm/AlignmentGraph.h17
-rw-r--r--phrase-extract/extract-ghkm/ComposedRule.cpp12
-rw-r--r--phrase-extract/extract-ghkm/ComposedRule.h8
-rw-r--r--phrase-extract/extract-ghkm/Exception.h4
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.cpp352
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.h51
-rw-r--r--phrase-extract/extract-ghkm/Jamfile2
-rw-r--r--phrase-extract/extract-ghkm/Main.cpp2
-rw-r--r--phrase-extract/extract-ghkm/Node.cpp4
-rw-r--r--phrase-extract/extract-ghkm/Node.h8
-rw-r--r--phrase-extract/extract-ghkm/Options.h11
-rw-r--r--phrase-extract/extract-ghkm/ParseTree.cpp56
-rw-r--r--phrase-extract/extract-ghkm/ParseTree.h97
-rw-r--r--phrase-extract/extract-ghkm/PhraseOrientation.cpp4
-rw-r--r--phrase-extract/extract-ghkm/PhraseOrientation.h18
-rw-r--r--phrase-extract/extract-ghkm/Rule.cpp4
-rw-r--r--phrase-extract/extract-ghkm/Rule.h4
-rw-r--r--phrase-extract/extract-ghkm/ScfgRule.cpp38
-rw-r--r--phrase-extract/extract-ghkm/ScfgRule.h27
-rw-r--r--phrase-extract/extract-ghkm/ScfgRuleWriter.cpp59
-rw-r--r--phrase-extract/extract-ghkm/ScfgRuleWriter.h11
-rw-r--r--phrase-extract/extract-ghkm/Span.cpp4
-rw-r--r--phrase-extract/extract-ghkm/Span.h4
-rw-r--r--phrase-extract/extract-ghkm/StsgRule.cpp9
-rw-r--r--phrase-extract/extract-ghkm/StsgRule.h8
-rw-r--r--phrase-extract/extract-ghkm/StsgRuleWriter.cpp12
-rw-r--r--phrase-extract/extract-ghkm/StsgRuleWriter.h8
-rw-r--r--phrase-extract/extract-ghkm/Subgraph.cpp32
-rw-r--r--phrase-extract/extract-ghkm/Subgraph.h13
-rw-r--r--phrase-extract/extract-ghkm/XmlTreeParser.cpp94
-rw-r--r--phrase-extract/extract-ghkm/XmlTreeParser.h68
-rw-r--r--phrase-extract/extract-main.cpp13
-rw-r--r--phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp16
-rw-r--r--phrase-extract/extract-mixed-syntax/Rule.cpp9
-rw-r--r--phrase-extract/extract-mixed-syntax/pugiconfig.hpp2
-rw-r--r--phrase-extract/extract-mixed-syntax/pugixml.cpp16
-rw-r--r--phrase-extract/extract-mixed-syntax/pugixml.hpp2167
-rw-r--r--phrase-extract/extract-rules-main.cpp37
-rw-r--r--phrase-extract/filter-rule-table/FilterRuleTable.cpp25
-rw-r--r--phrase-extract/filter-rule-table/FilterRuleTable.h24
-rw-r--r--phrase-extract/filter-rule-table/ForestTsgFilter.h1
-rw-r--r--phrase-extract/filter-rule-table/TreeCfgFilter.cpp2
-rw-r--r--phrase-extract/filter-rule-table/TreeCfgFilter.h10
-rw-r--r--phrase-extract/filter-rule-table/TreeTsgFilter.cpp18
-rw-r--r--phrase-extract/filter-rule-table/TreeTsgFilter.h9
-rw-r--r--phrase-extract/pcfg-common/Jamfile1
-rw-r--r--phrase-extract/pcfg-common/pcfg.h63
-rw-r--r--phrase-extract/pcfg-common/pcfg_tree.h79
-rw-r--r--phrase-extract/pcfg-common/syntax_tree.h93
-rw-r--r--phrase-extract/pcfg-common/tool.cc82
-rw-r--r--phrase-extract/pcfg-common/tool.h93
-rw-r--r--phrase-extract/pcfg-common/typedef.h39
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.cc88
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.h58
-rw-r--r--phrase-extract/pcfg-common/xml_tree_writer.h135
-rw-r--r--phrase-extract/pcfg-extract/Jamfile2
-rw-r--r--phrase-extract/pcfg-extract/main.cc9
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.cc54
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.h2
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.cc23
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.h2
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.cc35
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.h8
-rw-r--r--phrase-extract/pcfg-score/Jamfile2
-rw-r--r--phrase-extract/pcfg-score/main.cc9
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.cc53
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.h2
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.cc87
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.h14
-rw-r--r--phrase-extract/postprocess-egret-forests/Forest.h46
-rw-r--r--phrase-extract/postprocess-egret-forests/ForestParser.cpp151
-rw-r--r--phrase-extract/postprocess-egret-forests/ForestParser.h87
-rw-r--r--phrase-extract/postprocess-egret-forests/ForestWriter.cpp105
-rw-r--r--phrase-extract/postprocess-egret-forests/ForestWriter.h36
-rw-r--r--phrase-extract/postprocess-egret-forests/Jamfile1
-rw-r--r--phrase-extract/postprocess-egret-forests/Main.cpp9
-rw-r--r--phrase-extract/postprocess-egret-forests/Options.h22
-rw-r--r--phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp151
-rw-r--r--phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h38
-rw-r--r--phrase-extract/postprocess-egret-forests/SplitPoint.cpp111
-rw-r--r--phrase-extract/postprocess-egret-forests/SplitPoint.h27
-rw-r--r--phrase-extract/postprocess-egret-forests/SplitPointFileParser.cpp86
-rw-r--r--phrase-extract/postprocess-egret-forests/SplitPointFileParser.h51
-rw-r--r--phrase-extract/postprocess-egret-forests/Symbol.h48
-rw-r--r--phrase-extract/postprocess-egret-forests/TopologicalSorter.cpp56
-rw-r--r--phrase-extract/postprocess-egret-forests/TopologicalSorter.h34
-rw-r--r--phrase-extract/relax-parse-main.cpp103
-rw-r--r--phrase-extract/relax-parse.h14
-rw-r--r--phrase-extract/score-main.cpp170
-rw-r--r--phrase-extract/score-stsg/ScoreStsg.cpp26
-rw-r--r--phrase-extract/score-stsg/ScoreStsg.h15
-rw-r--r--phrase-extract/score.h2
-rw-r--r--phrase-extract/statistics-main.cpp13
-rw-r--r--phrase-extract/syntax-common/pcfg.cc (renamed from phrase-extract/pcfg-common/pcfg.cc)21
-rw-r--r--phrase-extract/syntax-common/pcfg.h38
-rw-r--r--phrase-extract/syntax-common/tool.cc57
-rw-r--r--phrase-extract/syntax-common/tool.h53
-rw-r--r--phrase-extract/syntax-common/tree-inl.h130
-rw-r--r--phrase-extract/syntax-common/tree.h28
-rw-r--r--phrase-extract/syntax-common/tree_test.cc80
-rw-r--r--phrase-extract/syntax-common/vocabulary.h13
-rw-r--r--phrase-extract/syntax-common/xml_tree_parser.cc72
-rw-r--r--phrase-extract/syntax-common/xml_tree_parser.h44
-rw-r--r--phrase-extract/syntax-common/xml_tree_writer.cc82
-rw-r--r--phrase-extract/syntax-common/xml_tree_writer.h27
-rw-r--r--phrase-extract/tables-core.cpp31
-rw-r--r--phrase-extract/tables-core.h6
131 files changed, 4173 insertions, 3536 deletions
diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp
index 899eb9f1c..d5138ba9b 100644
--- a/phrase-extract/DomainFeature.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -2,6 +2,7 @@
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
string line;
while(getline(*fileP, line)) {
// read
- vector< string > domainSpecLine = tokenize( line.c_str() );
+ const vector< string > domainSpecLine = util::tokenize( line );
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
@@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
exit(1);
}
// store
- string &name = domainSpecLine[1];
+ const string &name = domainSpecLine[1];
spec.push_back( make_pair( lineNumber, name ));
if (name2id.find( name ) == name2id.end()) {
name2id[ name ] = list.size();
diff --git a/phrase-extract/DomainFeature.h b/phrase-extract/DomainFeature.h
index bcb2e63a2..95babb6c2 100644
--- a/phrase-extract/DomainFeature.h
+++ b/phrase-extract/DomainFeature.h
@@ -5,8 +5,8 @@
#include <iostream>
#include <fstream>
-#include <assert.h>
-#include <stdlib.h>
+#include <cassert>
+#include <cstdlib>
#include <string>
#include <queue>
#include <map>
@@ -14,8 +14,6 @@
#include "ScoreFeature.h"
-extern std::vector<std::string> tokenize( const char*);
-
namespace MosesTraining
{
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index cde712ac6..57821fe44 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -242,7 +242,7 @@ void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, f
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
if (keyValue.size() == 2) {
AddProperty(keyValue[0], keyValue[1], count);
- }
+ }
}
}
diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h
index 3fa380d4f..58935a727 100644
--- a/phrase-extract/ExtractionPhrasePair.h
+++ b/phrase-extract/ExtractionPhrasePair.h
@@ -146,7 +146,7 @@ public:
void AddProperty(const std::string &key, const std::string &value, float count) {
std::map<std::string,
- std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
+ std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
if ( iter == m_properties.end() ) {
// key not found: insert property key and value
PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
diff --git a/phrase-extract/InternalStructFeature.h b/phrase-extract/InternalStructFeature.h
index bd44f61fb..66d61c6f9 100644
--- a/phrase-extract/InternalStructFeature.h
+++ b/phrase-extract/InternalStructFeature.h
@@ -1,7 +1,7 @@
#include <iostream>
#include <fstream>
-#include <assert.h>
-#include <stdlib.h>
+#include <cassert>
+#include <cstdlib>
#include <string>
#include <queue>
#include <map>
@@ -10,10 +10,6 @@
#include "ScoreFeature.h"
#include "extract-ghkm/Node.h"
-using namespace MosesTraining;
-using namespace Moses;
-using namespace GHKM;
-
namespace MosesTraining
{
diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp
index 15c2bd73e..d7874b06f 100644
--- a/phrase-extract/OutputFileStream.cpp
+++ b/phrase-extract/OutputFileStream.cpp
@@ -19,6 +19,7 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#include <iostream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include "OutputFileStream.h"
@@ -32,11 +33,13 @@ namespace Moses
OutputFileStream::OutputFileStream()
:boost::iostreams::filtering_ostream()
,m_outFile(NULL)
+ ,m_open(false)
{
}
OutputFileStream::OutputFileStream(const std::string &filePath)
- : m_outFile(NULL)
+ :m_outFile(NULL)
+ ,m_open(false)
{
Open(filePath);
}
@@ -48,32 +51,38 @@ OutputFileStream::~OutputFileStream()
bool OutputFileStream::Open(const std::string &filePath)
{
- m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
- if (m_outFile->fail()) {
- return false;
+ assert(!m_open);
+ if (filePath == std::string("-")) {
+ // Write to standard output. Leave m_outFile null.
+ this->push(std::cout);
+ } else {
+ m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
+ if (m_outFile->fail()) {
+ return false;
+ }
+
+ if (ends_with(filePath, ".gz")) {
+ this->push(boost::iostreams::gzip_compressor());
+ }
+ this->push(*m_outFile);
}
- if (ends_with(filePath, ".gz")) {
- this->push(boost::iostreams::gzip_compressor());
- }
- this->push(*m_outFile);
-
+ m_open = true;
return true;
}
void OutputFileStream::Close()
{
- if (m_outFile == NULL) {
- return;
- }
-
+ if (!m_open) return;
this->flush();
- this->pop(); // file
+ if (m_outFile) {
+ this->pop(); // file
- m_outFile->close();
- delete m_outFile;
- m_outFile = NULL;
- return;
+ m_outFile->close();
+ delete m_outFile;
+ m_outFile = NULL;
+ }
+ m_open = false;
}
diff --git a/phrase-extract/OutputFileStream.h b/phrase-extract/OutputFileStream.h
index f52e36d76..b77741a73 100644
--- a/phrase-extract/OutputFileStream.h
+++ b/phrase-extract/OutputFileStream.h
@@ -30,19 +30,50 @@
namespace Moses
{
-/** Used in place of std::istream, can read zipped files if it ends in .gz
+/** Version of std::ostream with transparent compression.
+ *
+ * Transparently compresses output when writing to a file whose name ends in
+ * ".gz". Or, writes to stdout instead of a file when given a filename
+ * consisting of just a dash ("-").
*/
class OutputFileStream : public boost::iostreams::filtering_ostream
{
-protected:
+private:
+ /** File that needs flushing & closing when we close this stream.
+ *
+ * Is NULL when no file is opened, e.g. when writing to standard output.
+ */
std::ofstream *m_outFile;
+
+ /// Is this stream open?
+ bool m_open;
+
public:
+ /** Create an unopened OutputFileStream.
+ *
+ * Until it's been opened, nothing can be done with this stream.
+ */
OutputFileStream();
+ /// Create an OutputFileStream, and open it by calling Open().
OutputFileStream(const std::string &filePath);
virtual ~OutputFileStream();
+ // TODO: Can we please just always throw an exception when this fails?
+ /** Open stream.
+ *
+ * If filePath is "-" (just a dash), this opens the stream for writing to
+ * standard output. Otherwise, it opens the given file. If the filename
+ * has the ".gz" suffix, output will be transparently compressed.
+ *
+ * Call Close() to close the file.
+ *
+ * Returns whether opening the file was successful. It may also throw an
+ * exception on failure.
+ */
bool Open(const std::string &filePath);
+
+ /// Flush and close stream. After this, the stream can be opened again.
void Close();
};
diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp
index 59c56b54b..94b6ea13a 100644
--- a/phrase-extract/PropertiesConsolidator.cpp
+++ b/phrase-extract/PropertiesConsolidator.cpp
@@ -116,18 +116,18 @@ void PropertiesConsolidator::ProcessPropertiesString(const std::string &properti
} else if ( !keyValue[0].compare("POS") ) {
-/* DO NOTHING (property is not registered in the decoder at the moment)
- if ( m_partsOfSpeechFlag ) {
-
- // POS property: replace strings with vocabulary indices
- out << " {{" << keyValue[0];
- ProcessPOSPropertyValue(keyValue[1], out);
- out << "}}";
-
- } else { // don't process POS property
- out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
- }
-*/
+ /* DO NOTHING (property is not registered in the decoder at the moment)
+ if ( m_partsOfSpeechFlag ) {
+
+ // POS property: replace strings with vocabulary indices
+ out << " {{" << keyValue[0];
+ ProcessPOSPropertyValue(keyValue[1], out);
+ out << "}}";
+
+ } else { // don't process POS property
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+ }
+ */
} else {
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index ee7f27ed9..21c1a1dbd 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -24,6 +24,7 @@
#include <string>
#include "tables-core.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
- target = tokenize(targetString);
+ target = util::tokenize(targetString);
if (boundaryRules)
addBoundaryWords(target);
return true;
@@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
- source = tokenize(sourceString);
+ source = util::tokenize(sourceString);
if (boundaryRules)
addBoundaryWords(source);
return true;
@@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
}
// reading in alignments
- vector<string> alignmentSequence = tokenize( alignmentString );
+ vector<string> alignmentSequence = util::tokenize( alignmentString );
for(size_t i=0; i<alignmentSequence.size(); i++) {
int s,t;
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 1b4ed7c88..4fd2355ae 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -26,6 +26,7 @@
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
<< sentenceID << ": " << e.getMsg() << std::endl;
return false;
}
- target = tokenize(targetStringCPP.c_str());
+ target = util::tokenize(targetStringCPP);
return true;
}
@@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
<< sentenceID << ": " << e.getMsg() << std::endl;
return false;
}
- source = tokenize(sourceStringCPP.c_str());
+ source = util::tokenize(sourceStringCPP);
return true;
}
} // namespace
-
-
-
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 8b9088770..604b6d0e2 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -28,7 +28,7 @@
#include "RuleExtractionOptions.h"
#include "SentenceAlignment.h"
-#include "SyntaxTree.h"
+#include "SyntaxNodeCollection.h"
namespace MosesTraining
{
@@ -36,8 +36,8 @@ namespace MosesTraining
class SentenceAlignmentWithSyntax : public SentenceAlignment
{
public:
- SyntaxTree targetTree;
- SyntaxTree sourceTree;
+ SyntaxNodeCollection targetTree;
+ SyntaxNodeCollection sourceTree;
std::set<std::string> & m_targetLabelCollection;
std::set<std::string> & m_sourceLabelCollection;
std::map<std::string, int> & m_targetTopLabelCollection;
diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h
new file mode 100644
index 000000000..49e2eb695
--- /dev/null
+++ b/phrase-extract/SyntaxNode.h
@@ -0,0 +1,43 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <map>
+#include <string>
+
+namespace MosesTraining
+{
+
+struct SyntaxNode {
+ typedef std::map<std::string, std::string> AttributeMap;
+
+ SyntaxNode(const std::string &label_, int start_, int end_)
+ : label(label_)
+ , start(start_)
+ , end(end_) {
+ }
+
+ std::string label;
+ int start;
+ int end;
+ AttributeMap attributes;
+};
+
+} // namespace MosesTraining
diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
new file mode 100644
index 000000000..70f52317e
--- /dev/null
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -0,0 +1,131 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+
+#include "SyntaxNodeCollection.h"
+
+#include <cassert>
+#include <iostream>
+
+namespace MosesTraining
+{
+
+SyntaxNodeCollection::~SyntaxNodeCollection()
+{
+ Clear();
+}
+
+void SyntaxNodeCollection::Clear()
+{
+ // loop through all m_nodes, delete them
+ for(size_t i=0; i<m_nodes.size(); i++) {
+ delete m_nodes[i];
+ }
+ m_nodes.clear();
+ m_index.clear();
+}
+
+SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
+ const std::string &label)
+{
+ SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
+ m_nodes.push_back( newNode );
+ m_index[ startPos ][ endPos ].push_back( newNode );
+ m_numWords = std::max(endPos+1, m_numWords);
+ return newNode;
+}
+
+bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
+{
+ return GetNodes( startPos, endPos).size() > 0;
+}
+
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
+ int startPos, int endPos ) const
+{
+ NodeIndex::const_iterator startIndex = m_index.find( startPos );
+ if (startIndex == m_index.end() )
+ return m_emptyNode;
+
+ InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
+ if (endIndex == startIndex->second.end())
+ return m_emptyNode;
+
+ return endIndex->second;
+}
+
+std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
+{
+ std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
+
+ // Create a SyntaxTree object for each SyntaxNode.
+ for (std::vector<SyntaxNode*>::const_iterator p = m_nodes.begin();
+ p != m_nodes.end(); ++p) {
+ nodeToTree[*p] = new SyntaxTree(**p);
+ }
+
+ // Connect the SyntaxTrees.
+ typedef NodeIndex::const_iterator OuterIterator;
+ typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
+
+ SyntaxTree *root = 0;
+ SyntaxNode *prevNode = 0;
+ SyntaxTree *prevTree = 0;
+ // Iterate over all start indices from lowest to highest.
+ for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
+ const InnerNodeIndex &inner = p->second;
+ // Iterate over all end indices from highest to lowest.
+ for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
+ const std::vector<SyntaxNode*> &nodes = q->second;
+ // Iterate over all nodes that cover the same span in order of tree
+ // depth, top-most first.
+ for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
+ r != nodes.rend(); ++r) {
+ SyntaxNode *node = *r;
+ SyntaxTree *tree = nodeToTree[node];
+ if (!prevNode) {
+ // node is the root.
+ root = tree;
+ tree->parent() = 0;
+ } else if (prevNode->start == node->start) {
+ // prevNode is the parent of node.
+ assert(prevNode->end >= node->end);
+ tree->parent() = prevTree;
+ prevTree->children().push_back(tree);
+ } else {
+ // prevNode is a descendant of node's parent. The lowest common
+ // ancestor of prevNode and node will be node's parent.
+ SyntaxTree *ancestor = prevTree->parent();
+ while (ancestor->value().end < tree->value().end) {
+ ancestor = ancestor->parent();
+ }
+ assert(ancestor);
+ tree->parent() = ancestor;
+ ancestor->children().push_back(tree);
+ }
+ prevNode = node;
+ prevTree = tree;
+ }
+ }
+ }
+
+ return std::auto_ptr<SyntaxTree>(root);
+}
+
+} // namespace MosesTraining
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
new file mode 100644
index 000000000..405a77c5f
--- /dev/null
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -0,0 +1,79 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "SyntaxNode.h"
+#include "SyntaxTree.h"
+
+namespace MosesTraining
+{
+
+/** A collection of SyntaxNodes organized by start and end position.
+ *
+ */
+class SyntaxNodeCollection
+{
+public:
+ SyntaxNodeCollection() : m_numWords(0) {}
+
+ ~SyntaxNodeCollection();
+
+ //! Construct and insert a new SyntaxNode.
+ SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
+
+ //! Return true iff there are one or more SyntaxNodes with the given span.
+ bool HasNode( int startPos, int endPos ) const;
+
+ //! Lookup the SyntaxNodes for a given span.
+ const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
+
+ //! Get a vector of pointers to all SyntaxNodes (unordered).
+ const std::vector< SyntaxNode* >& GetAllNodes() {
+ return m_nodes;
+ };
+
+ size_t GetNumWords() const {
+ return m_numWords;
+ }
+ void Clear();
+
+ std::auto_ptr<SyntaxTree> ExtractTree();
+
+private:
+ typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
+ typedef std::map< int, InnerNodeIndex > NodeIndex;
+
+ // Not copyable.
+ SyntaxNodeCollection(const SyntaxNodeCollection &);
+ SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
+
+ std::vector< SyntaxNode* > m_nodes;
+ NodeIndex m_index;
+ int m_numWords;
+ std::vector< SyntaxNode* > m_emptyNode;
+};
+
+} // namespace MosesTraining
diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxTree.cpp
deleted file mode 100644
index c50693e0d..000000000
--- a/phrase-extract/SyntaxTree.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2009 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-
-#include "SyntaxTree.h"
-
-#include <cassert>
-#include <iostream>
-
-namespace MosesTraining
-{
-
-SyntaxTree::~SyntaxTree()
-{
- Clear();
-}
-
-void SyntaxTree::Clear()
-{
- m_top = 0;
- // loop through all m_nodes, delete them
- for(size_t i=0; i<m_nodes.size(); i++) {
- delete m_nodes[i];
- }
- m_nodes.clear();
- m_index.clear();
-}
-
-SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
-{
- SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
- m_nodes.push_back( newNode );
- m_index[ startPos ][ endPos ].push_back( newNode );
- m_size = std::max(endPos+1, m_size);
- return newNode;
-}
-
-ParentNodes SyntaxTree::Parse()
-{
- ParentNodes parents;
-
- // looping through all spans of size >= 2
- for( int length=2; length<=m_size; length++ ) {
- for( int startPos = 0; startPos <= m_size-length; startPos++ ) {
- if (HasNode( startPos, startPos+length-1 )) {
- // processing one (parent) span
-
- //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
- SplitPoints splitPoints;
- splitPoints.push_back( startPos );
- //std::cerr << " " << startPos;
-
- int first = 1;
- int covered = 0;
- int found_somehing = 1; // break loop if nothing found
- while( covered < length && found_somehing ) {
- // find largest covering subspan (child)
- // starting at last covered position
- found_somehing = 0;
- for( int midPos=length-first; midPos>covered; midPos-- ) {
- if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
- covered = midPos;
- splitPoints.push_back( startPos+covered );
- // std::cerr << " " << ( startPos+covered );
- first = 0;
- found_somehing = 1;
- }
- }
- }
- // std::cerr << std::endl;
- parents.push_back( splitPoints );
- }
- }
- }
- return parents;
-}
-
-bool SyntaxTree::HasNode( int startPos, int endPos ) const
-{
- return GetNodes( startPos, endPos).size() > 0;
-}
-
-const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const
-{
- SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
- if (startIndex == m_index.end() )
- return m_emptyNode;
-
- SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos );
- if (endIndex == startIndex->second.end())
- return m_emptyNode;
-
- return endIndex->second;
-}
-
-// for printing out tree
-std::string SyntaxTree::ToString() const
-{
- std::stringstream out;
- out << *this;
- return out.str();
-}
-
-void SyntaxTree::ConnectNodes()
-{
- typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
-
- SyntaxNode *prev = 0;
- // Iterate over all start indices from lowest to highest.
- for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
- const SyntaxTreeIndex2 &inner = p->second;
- // Iterate over all end indices from highest to lowest.
- for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
- const std::vector<SyntaxNode*> &nodes = q->second;
- // Iterate over all nodes that cover the same span in order of tree
- // depth, top-most first.
- for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
- r != nodes.rend(); ++r) {
- SyntaxNode *node = *r;
- if (!prev) {
- // node is the root.
- m_top = node;
- node->SetParent(0);
- } else if (prev->GetStart() == node->GetStart()) {
- // prev is the parent of node.
- assert(prev->GetEnd() >= node->GetEnd());
- node->SetParent(prev);
- prev->AddChild(node);
- } else {
- // prev is a descendant of node's parent. The lowest common
- // ancestor of prev and node will be node's parent.
- SyntaxNode *ancestor = prev->GetParent();
- while (ancestor->GetEnd() < node->GetEnd()) {
- ancestor = ancestor->GetParent();
- }
- assert(ancestor);
- node->SetParent(ancestor);
- ancestor->AddChild(node);
- }
- prev = node;
- }
- }
- }
-}
-
-std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
-{
- size_t size = t.m_index.size();
- for(size_t length=1; length<=size; length++) {
- for(size_t space=0; space<length; space++) {
- os << " ";
- }
- for(size_t start=0; start<=size-length; start++) {
-
- if (t.HasNode( start, start+(length-1) )) {
- std::string label = t.GetNodes( start, start+(length-1) )[0]->GetLabel() + "#######";
-
- os << label.substr(0,7) << " ";
- } else {
- os << "------- ";
- }
- }
- os << std::endl;
- }
- return os;
-}
-
-}
-
diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h
index 6ffb5da34..c2132fda3 100644
--- a/phrase-extract/SyntaxTree.h
+++ b/phrase-extract/SyntaxTree.h
@@ -1,128 +1,12 @@
-// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2009 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-
#pragma once
-#include <string>
-#include <vector>
-#include <map>
-#include <sstream>
-
-namespace MosesTraining
-{
-
-class SyntaxNode
-{
-protected:
- int m_start, m_end;
- std::string m_label;
- std::vector< SyntaxNode* > m_children;
- SyntaxNode* m_parent;
- float m_pcfgScore;
-public:
- SyntaxNode( int startPos, int endPos, std::string label )
- :m_start(startPos)
- ,m_end(endPos)
- ,m_label(label)
- ,m_parent(0)
- ,m_pcfgScore(0.0f) {
- }
- int GetStart() const {
- return m_start;
- }
- int GetEnd() const {
- return m_end;
- }
- std::string GetLabel() const {
- return m_label;
- }
- float GetPcfgScore() const {
- return m_pcfgScore;
- }
- void SetPcfgScore(float score) {
- m_pcfgScore = score;
- }
- SyntaxNode *GetParent() {
- return m_parent;
- }
- void SetParent(SyntaxNode *parent) {
- m_parent = parent;
- }
- void AddChild(SyntaxNode* child) {
- m_children.push_back(child);
- }
- const std::vector< SyntaxNode* > &GetChildren() const {
- return m_children;
- }
-};
+#include "syntax-common/tree.h"
-typedef std::vector< int > SplitPoints;
-typedef std::vector< SplitPoints > ParentNodes;
+#include "SyntaxNode.h"
-class SyntaxTree
+namespace MosesTraining
{
-protected:
- std::vector< SyntaxNode* > m_nodes;
- SyntaxNode* m_top;
-
- typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
- typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
- typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
- typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
- SyntaxTreeIndex m_index;
- int m_size;
- std::vector< SyntaxNode* > m_emptyNode;
-
- friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
-
-public:
- SyntaxTree()
- : m_top(0) // m_top doesn't get set unless ConnectNodes is called.
- , m_size(0) {}
-
- ~SyntaxTree();
-
- SyntaxNode *AddNode( int startPos, int endPos, std::string label );
-
- SyntaxNode *GetTop() {
- return m_top;
- }
-
- ParentNodes Parse();
- bool HasNode( int startPos, int endPos ) const;
- const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
- const std::vector< SyntaxNode* >& GetAllNodes() {
- return m_nodes;
- };
- size_t GetNumWords() const {
- return m_size;
- }
- void ConnectNodes();
- void Clear();
- std::string ToString() const;
-};
-
-std::ostream& operator<<(std::ostream&, const SyntaxTree&);
-}
+typedef Syntax::Tree<SyntaxNode> SyntaxTree;
+} // namespace MosesTraining
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
index 6efa1bf5c..d8b77b6e6 100644
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@@ -1,6 +1,3 @@
-// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
@@ -27,7 +24,8 @@
#include <iostream>
#include <cstdlib>
#include <sstream>
-#include "SyntaxTree.h"
+
+#include "SyntaxNodeCollection.h"
#include "XmlException.h"
using namespace std;
@@ -82,6 +80,39 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName)
return tag.substr(contentsStart,contentsEnd-contentsStart);
}
+// TODO Special handling of "label" attribute
+// s should be a sequence of name=attribute pairs separated by whitespace.
+// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
+void ParseXmlTagAttributes(const std::string &s,
+ std::map<std::string, std::string> &attributes)
+{
+ std::size_t begin = 0;
+ while (true) {
+ std::size_t pos = s.find('=', begin);
+ if (pos == std::string::npos) {
+ return;
+ }
+ std::string name = Trim(s.substr(begin, pos-begin));
+ begin = s.find('"', pos+1);
+ if (begin == std::string::npos) {
+ throw XmlException("invalid tag content");
+ }
+ pos = s.find('"', begin+1);
+ if (pos == std::string::npos) {
+ throw XmlException("invalid tag content");
+ }
+ while (s[pos-1] == '\\') {
+ pos = s.find('"', pos+1);
+ if (pos == std::string::npos) {
+ throw XmlException("invalid tag content");
+ }
+ }
+ // TODO unescape \"
+ attributes[name] = s.substr(begin+1, pos-begin-1);
+ begin = pos+1;
+ }
+}
+
/**
* Remove "<" and ">" from XML tag
*
@@ -228,7 +259,10 @@ vector<string> TokenizeXml(const string& str)
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
-bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars )
+bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
+ set< string > &labelCollection,
+ map< string, int > &topLabelCollection,
+ bool unescapeSpecialChars )
{
//parse XML markup in translation line
@@ -364,18 +398,14 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
string label = ParseXmlTagAttribute(tagContent,"label");
labelCollection.insert( label );
- string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
- float pcfgScore = pcfgString == "" ? 0.0f
- : std::atof(pcfgString.c_str());
-
// report what we have processed so far
if (0) {
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
}
- SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
- node->SetPcfgScore(pcfgScore);
+ SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
+ ParseXmlTagAttributes(tagContent, node->attributes);
}
}
}
@@ -386,10 +416,10 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
}
// collect top labels
- const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 );
+ const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
SyntaxNode *n = *node;
- const string &label = n->GetLabel();
+ const string &label = n->label;
if (topLabelCollection.find( label ) == topLabelCollection.end())
topLabelCollection[ label ] = 0;
topLabelCollection[ label ]++;
diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h
index a8c6888d6..3b5afd4dd 100644
--- a/phrase-extract/XmlTree.h
+++ b/phrase-extract/XmlTree.h
@@ -1,43 +1,41 @@
-// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2006 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#pragma once
-#include <string>
-#include <vector>
-#include <set>
-#include <map>
-#include "SyntaxTree.h"
-
-namespace MosesTraining
-{
-
-std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
-std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
-std::string TrimXml(const std::string& str);
-bool isXmlTag(const std::string& tag);
-std::vector<std::string> TokenizeXml(const std::string& str);
-bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
-std::string unescape(const std::string &str);
-
-
-} // namespace
-
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+
+#include "SyntaxNodeCollection.h"
+
+namespace MosesTraining
+{
+
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
+std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
+std::string TrimXml(const std::string& str);
+bool isXmlTag(const std::string& tag);
+std::vector<std::string> TokenizeXml(const std::string& str);
+bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
+std::string unescape(const std::string &str);
+
+
+} // namespace MosesTraining
diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index 423a3909b..d25197372 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -25,11 +25,10 @@
#include <cstdlib>
#include "InputFileStream.h"
#include "OutputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
-std::vector<std::string> tokenize( const char [] );
-
vector< string > splitLine(const char *line)
{
vector< string > item;
@@ -109,7 +108,7 @@ int main(int argc, char* argv[])
if (! getLine(fileDirectP, itemDirect ))
break;
- vector< string > count = tokenize( itemDirect[4].c_str() );
+ const vector< string > count = util::tokenize( itemDirect[4] );
float countEF = atof(count[0].c_str());
float countF = atof(count[1].c_str());
float prob = countF/countEF;
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 7ef146f64..5964bf686 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -17,137 +17,143 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#include <cstdio>
-#include <iostream>
-#include <fstream>
#include <vector>
#include <string>
-#include <cstdlib>
-#include <cstring>
+#include "util/exception.hh"
#include "moses/Util.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "PropertiesConsolidator.h"
-using namespace std;
+bool countsProperty = false;
+bool goodTuringFlag = false;
bool hierarchicalFlag = false;
+bool kneserNeyFlag = false;
+bool logProbFlag = false;
+bool lowCountFlag = false;
bool onlyDirectFlag = false;
+bool partsOfSpeechFlag = false;
bool phraseCountFlag = false;
-bool lowCountFlag = false;
-bool goodTuringFlag = false;
-bool kneserNeyFlag = false;
bool sourceLabelsFlag = false;
-bool partsOfSpeechFlag = false;
-bool logProbFlag = false;
+bool sparseCountBinFeatureFlag = false;
+
+std::vector< int > countBin;
float minScore0 = 0;
float minScore2 = 0;
+std::vector< float > countOfCounts;
+std::vector< float > goodTuringDiscount;
+float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
+
+
+void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
+void loadCountOfCounts( const std::string& );
+void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse );
+bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item );
+
+
inline float maybeLogProb( float a )
{
return logProbFlag ? std::log(a) : a;
}
+
inline bool isNonTerminal( const std::string &word )
{
return (word.length()>=3 && word[0] == '[' && word[word.length()-1] == ']');
}
-void processFiles( char*, char*, char*, char*, char*, char* );
-void loadCountOfCounts( char* );
-void breakdownCoreAndSparse( string combined, string &core, string &sparse );
-bool getLine( istream &fileP, vector< string > &item );
-vector< string > splitLine(const char *line);
-vector< int > countBin;
-bool sparseCountBinFeatureFlag = false;
int main(int argc, char* argv[])
{
- cerr << "Consolidate v2.0 written by Philipp Koehn\n"
- << "consolidating direct and indirect rule tables\n";
+ std::cerr << "Consolidate v2.0 written by Philipp Koehn" << std::endl
+ << "consolidating direct and indirect rule tables" << std::endl;
if (argc < 4) {
- cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]\n";
+ std::cerr <<
+ "syntax: "
+ "consolidate phrase-table.direct "
+ "phrase-table.indirect "
+ "phrase-table.consolidated "
+ "[--Hierarchical] [--OnlyDirect] [--PhraseCount] "
+ "[--GoodTuring counts-of-counts-file] "
+ "[--KneserNey counts-of-counts-file] [--LowCountFeature] "
+ "[--SourceLabels source-labels-file] "
+ "[--PartsOfSpeech parts-of-speech-file] "
+ "[--MinScore id:threshold[,id:threshold]*]"
+ << std::endl;
exit(1);
}
- char* &fileNameDirect = argv[1];
- char* &fileNameIndirect = argv[2];
- char* &fileNameConsolidated = argv[3];
- char* fileNameCountOfCounts = 0;
- char* fileNameSourceLabelSet = 0;
- char* fileNamePartsOfSpeechVocabulary = 0;
+ const std::string fileNameDirect = argv[1];
+ const std::string fileNameIndirect = argv[2];
+ const std::string fileNameConsolidated = argv[3];
+ std::string fileNameCountOfCounts;
+ std::string fileNameSourceLabelSet;
+ std::string fileNamePartsOfSpeechVocabulary;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
hierarchicalFlag = true;
- cerr << "processing hierarchical rules\n";
+ std::cerr << "processing hierarchical rules" << std::endl;
} else if (strcmp(argv[i],"--OnlyDirect") == 0) {
onlyDirectFlag = true;
- cerr << "only including direct translation scores p(e|f)\n";
+ std::cerr << "only including direct translation scores p(e|f)" << std::endl;
} else if (strcmp(argv[i],"--PhraseCount") == 0) {
phraseCountFlag = true;
- cerr << "including the phrase count feature\n";
+ std::cerr << "including the phrase count feature" << std::endl;
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
- if (i+1==argc) {
- cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
- exit(1);
- }
+ UTIL_THROW_IF2(i+1==argc, "specify count of count files for Good Turing discounting!");
fileNameCountOfCounts = argv[++i];
- cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+ std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
- if (i+1==argc) {
- cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
- exit(1);
- }
+ UTIL_THROW_IF2(i+1==argc, "specify count of count files for Kneser Ney discounting!");
fileNameCountOfCounts = argv[++i];
- cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+ std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
} else if (strcmp(argv[i],"--LowCountFeature") == 0) {
lowCountFlag = true;
- cerr << "including the low count feature\n";
+ std::cerr << "including the low count feature" << std::endl;
} else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
strcmp(argv[i],"--SparseCountBinFeature") == 0) {
if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
sparseCountBinFeatureFlag = true;
- cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
+ std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
int prev = 0;
while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
- int binCount = atoi(argv[++i]);
+ int binCount = Moses::Scan<int>(argv[++i]);
countBin.push_back( binCount );
if (prev+1 == binCount) {
- cerr << " " << binCount;
+ std::cerr << " " << binCount;
} else {
- cerr << " " << (prev+1) << "-" << binCount;
+ std::cerr << " " << (prev+1) << "-" << binCount;
}
prev = binCount;
}
- cerr << " " << (prev+1) << "+\n";
+ std::cerr << " " << (prev+1) << "+" << std::endl;
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
- cerr << "using log-probabilities\n";
+ std::cerr << "using log-probabilities" << std::endl;
+ } else if (strcmp(argv[i],"--Counts") == 0) {
+ countsProperty = true;
+ std::cerr << "output counts as a property" << std::endl;;
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
sourceLabelsFlag = true;
- if (i+1==argc) {
- cerr << "ERROR: specify source label set file!\n";
- exit(1);
- }
+ UTIL_THROW_IF2(i+1==argc, "specify source label set file!");
fileNameSourceLabelSet = argv[++i];
- cerr << "processing source labels property\n";
+ std::cerr << "processing source labels property" << std::endl;
} else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
partsOfSpeechFlag = true;
- if (i+1==argc) {
- cerr << "ERROR: specify parts-of-speech file!\n";
- exit(1);
- }
+ UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
fileNamePartsOfSpeechVocabulary = argv[++i];
- cerr << "processing parts-of-speech property\n";
+ std::cerr << "processing parts-of-speech property" << std::endl;
} else if (strcmp(argv[i],"--MinScore") == 0) {
- string setting = argv[++i];
+ std::string setting = argv[++i];
bool done = false;
while (!done) {
- string single_setting;
+ std::string single_setting;
size_t pos;
if ((pos = setting.find(",")) != std::string::npos) {
single_setting = setting.substr(0, pos);
@@ -156,52 +162,42 @@ int main(int argc, char* argv[])
single_setting = setting;
done = true;
}
- if ((pos = single_setting.find(":")) == std::string::npos) {
- cerr << "ERROR: faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'" << endl;
- exit(1);
- }
- unsigned int field = atoi( single_setting.substr(0,pos).c_str() );
- float threshold = atof( single_setting.substr(pos+1).c_str() );
+ pos = single_setting.find(":");
+ UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'");
+ unsigned int field = Moses::Scan<unsigned int>( single_setting.substr(0,pos) );
+ float threshold = Moses::Scan<float>( single_setting.substr(pos+1) );
if (field == 0) {
minScore0 = threshold;
- cerr << "setting minScore0 to " << threshold << endl;
+ std::cerr << "setting minScore0 to " << threshold << std::endl;
} else if (field == 2) {
minScore2 = threshold;
- cerr << "setting minScore2 to " << threshold << endl;
+ std::cerr << "setting minScore2 to " << threshold << std::endl;
} else {
- cerr << "ERROR: MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities" << endl;
- exit(1);
+ UTIL_THROW2("MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities");
}
}
} else {
- cerr << "ERROR: unknown option " << argv[i] << endl;
- exit(1);
+ UTIL_THROW2("unknown option " << argv[i]);
}
}
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
}
-vector< float > countOfCounts;
-vector< float > goodTuringDiscount;
-float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
-void loadCountOfCounts( char* fileNameCountOfCounts )
+
+void loadCountOfCounts( const std::string& fileNameCountOfCounts )
{
Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
- if (fileCountOfCounts.fail()) {
- cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
- exit(1);
- }
- istream &fileP = fileCountOfCounts;
+ UTIL_THROW_IF2(fileCountOfCounts.fail(), "could not open count of counts file " << fileNameCountOfCounts);
countOfCounts.push_back(0.0);
- string line;
- while (getline(fileP, line)) {
+ std::string line;
+ while (getline(fileCountOfCounts, line)) {
if (totalCount < 0)
- totalCount = atof(line.c_str()); // total number of distinct phrase pairs
+ totalCount = Moses::Scan<float>(line); // total number of distinct phrase pairs
else
- countOfCounts.push_back( atof(line.c_str()) );
+ countOfCounts.push_back( Moses::Scan<float>(line) );
}
fileCountOfCounts.Close();
@@ -228,34 +224,27 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
}
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet, char* fileNamePartsOfSpeechVocabulary )
+
+void processFiles( const std::string& fileNameDirect,
+ const std::string& fileNameIndirect,
+ const std::string& fileNameConsolidated,
+ const std::string& fileNameCountOfCounts,
+ const std::string& fileNameSourceLabelSet,
+ const std::string& fileNamePartsOfSpeechVocabulary )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
// open input files
Moses::InputFileStream fileDirect(fileNameDirect);
+ UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect);
Moses::InputFileStream fileIndirect(fileNameIndirect);
-
- if (fileDirect.fail()) {
- cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
- exit(1);
- }
- istream &fileDirectP = fileDirect;
-
- if (fileIndirect.fail()) {
- cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
- exit(1);
- }
- istream &fileIndirectP = fileIndirect;
+ UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect);
// open output file: consolidated phrase table
Moses::OutputFileStream fileConsolidated;
bool success = fileConsolidated.Open(fileNameConsolidated);
- if (!success) {
- cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
- exit(1);
- }
+ UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated);
// create properties consolidator
// (in case any additional phrase property requires further processing)
@@ -271,43 +260,38 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
int i=0;
while(true) {
i++;
- if (i%100000 == 0) cerr << "." << flush;
+ if (i%100000 == 0) std::cerr << "." << std::flush;
- vector< string > itemDirect, itemIndirect;
- if (! getLine(fileIndirectP,itemIndirect) ||
- ! getLine(fileDirectP, itemDirect ))
+ std::vector< std::string > itemDirect, itemIndirect;
+ if (! getLine(fileIndirect, itemIndirect) ||
+ ! getLine(fileDirect, itemDirect))
break;
// direct: target source alignment probabilities
// indirect: source target probabilities
// consistency checks
- if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
- cerr << "ERROR: target phrase does not match in line " << i << ": '"
- << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
- exit(1);
- }
-
- if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
- cerr << "ERROR: source phrase does not match in line " << i << ": '"
- << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
- exit(1);
- }
+ UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0,
+ "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
+ UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0,
+ "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'");
// SCORES ...
- string directScores, directSparseScores, indirectScores, indirectSparseScores;
+ std::string directScores, directSparseScores, indirectScores, indirectSparseScores;
breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );
- vector<string> directCounts = Moses::Tokenize(itemDirect[4]);
- vector<string> indirectCounts = Moses::Tokenize(itemIndirect[4]);
- float countF = atof(directCounts[0].c_str());
- float countE = atof(indirectCounts[0].c_str());
- float countEF = atof(indirectCounts[1].c_str());
+ std::vector<std::string> directCounts;
+ Moses::Tokenize( directCounts, itemDirect[4] );
+ std::vector<std::string> indirectCounts;
+ Moses::Tokenize( indirectCounts, itemIndirect[4] );
+ float countF = Moses::Scan<float>(directCounts[0]);
+ float countE = Moses::Scan<float>(indirectCounts[0]);
+ float countEF = Moses::Scan<float>(indirectCounts[1]);
float n1_F, n1_E;
if (kneserNeyFlag) {
- n1_F = atof(directCounts[2].c_str());
- n1_E = atof(indirectCounts[2].c_str());
+ n1_F = Moses::Scan<float>(directCounts[2]);
+ n1_E = Moses::Scan<float>(indirectCounts[2]);
}
// Good Turing discounting
@@ -344,7 +328,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (partsOfSpeechFlag) {
// write POS factor from property
- std::vector<std::string> targetTokens = Moses::Tokenize(itemDirect[1]);
+ std::vector<std::string> targetTokens;
+ Moses::Tokenize( targetTokens, itemDirect[1] );
std::vector<std::string> propertyValuePOS;
propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
size_t targetTerminalIndex = 0;
@@ -401,10 +386,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
}
// alignment
- fileConsolidated << " ||| " << itemDirect[2];
+ fileConsolidated << " |||";
+ if (!itemDirect[2].empty()) {
+ fileConsolidated << " " << itemDirect[2];;
+ }
// counts, for debugging
- fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
+ fileConsolidated << " ||| " << countE << " " << countF << " " << countEF;
// sparse features
fileConsolidated << " |||";
@@ -412,6 +400,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " " << directSparseScores;
if (indirectSparseScores.compare("") != 0)
fileConsolidated << " " << indirectSparseScores;
+
// count bin feature (as a sparse feature)
if (sparseCountBinFeatureFlag) {
bool foundBin = false;
@@ -437,18 +426,25 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated);
}
- fileConsolidated << endl;
+ if (countsProperty) {
+ fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}";
+ }
+
+ fileConsolidated << std::endl;
}
+
fileDirect.Close();
fileIndirect.Close();
fileConsolidated.Close();
}
-void breakdownCoreAndSparse( string combined, string &core, string &sparse )
+
+void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse )
{
core = "";
sparse = "";
- vector<string> score = Moses::Tokenize( combined );
+ std::vector<std::string> score;
+ Moses::Tokenize( score, combined );
for(size_t i=0; i<score.size(); i++) {
if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
core += " " + score[i];
@@ -461,38 +457,18 @@ void breakdownCoreAndSparse( string combined, string &core, string &sparse )
if (sparse.size() > 0 ) sparse = sparse.substr(1);
}
-bool getLine( istream &fileP, vector< string > &item )
+
+bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item )
{
- if (fileP.eof())
+ if (file.eof())
return false;
- string line;
- if (!getline(fileP, line))
+ std::string line;
+ if (!getline(file, line))
return false;
- item = splitLine(line.c_str());
+ Moses::TokenizeMultiCharSeparator(item, line, " ||| ");
return true;
}
-vector< string > splitLine(const char *line)
-{
- vector< string > item;
- int start=0;
- int i=0;
- for(; line[i] != '\0'; i++) {
- if (line[i] == ' ' &&
- line[i+1] == '|' &&
- line[i+2] == '|' &&
- line[i+3] == '|' &&
- line[i+4] == ' ') {
- if (start > i) start = i; // empty item
- item.push_back( string( line+start, i-start ) );
- start = i+5;
- i += 3;
- }
- }
- item.push_back( string( line+start, i-start ) );
-
- return item;
-}
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index e2b0ad473..bce496a0c 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -28,6 +28,7 @@
#include "tables-core.h"
#include "InputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
// counts, for debugging
- vector<string> directCounts = tokenize(itemDirect[4].c_str());
- vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+ const vector<string> directCounts = util::tokenize(itemDirect[4]);
+ const vector<string> indirectCounts = util::tokenize(itemIndirect[4]);
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
// output rule count if present in either file
if (indirectCounts.size() > 1) {
@@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
vector< string > splitLine(const char *line)
{
vector< string > item;
- bool betweenWords = true;
int start=0;
int i=0;
for(; line[i] != '\0'; i++) {
@@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
{
stringstream ret("");
- vector<string> alignToks = tokenize(alignments.c_str());
+ const vector<string> alignToks = util::tokenize(alignments);
for (size_t i = 0; i < alignToks.size(); ++i) {
- string &alignPair = alignToks[i];
+ const string &alignPair = alignToks[i];
vector<string> alignPoints;
Tokenize(alignPoints, alignPair, "-");
assert(alignPoints.size() == 2);
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp
index 7e084e495..6f946fe5a 100644
--- a/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/phrase-extract/extract-ghkm/Alignment.cpp
@@ -25,7 +25,7 @@
#include <cassert>
#include <cstdlib>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -70,4 +70,4 @@ void FlipAlignment(Alignment &a)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h
index e8381a602..154e1fc4f 100644
--- a/phrase-extract/extract-ghkm/Alignment.h
+++ b/phrase-extract/extract-ghkm/Alignment.h
@@ -23,7 +23,7 @@
#include <utility>
#include <vector>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -35,5 +35,5 @@ void ReadAlignment(const std::string &, Alignment &);
void FlipAlignment(Alignment &);
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 974188dbd..9dba71331 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -19,23 +19,25 @@
#include "AlignmentGraph.h"
-#include "ComposedRule.h"
-#include "Node.h"
-#include "Options.h"
-#include "ParseTree.h"
-#include "Subgraph.h"
-
#include <algorithm>
#include <cassert>
+#include <cstdlib>
#include <memory>
#include <stack>
-namespace Moses
+#include "SyntaxTree.h"
+
+#include "ComposedRule.h"
+#include "Node.h"
+#include "Options.h"
+#include "Subgraph.h"
+
+namespace MosesTraining
{
namespace GHKM
{
-AlignmentGraph::AlignmentGraph(const ParseTree *t,
+AlignmentGraph::AlignmentGraph(const SyntaxTree *t,
const std::vector<std::string> &s,
const Alignment &a)
{
@@ -208,20 +210,26 @@ void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options)
}
}
-Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
+Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
{
NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE;
- std::auto_ptr<Node> n(new Node(root->GetLabel(), nodeType));
+ std::auto_ptr<Node> n(new Node(root->value().label, nodeType));
if (nodeType == TREE) {
- n->SetPcfgScore(root->GetPcfgScore());
+ float score = 0.0f;
+ SyntaxNode::AttributeMap::const_iterator p =
+ root->value().attributes.find("pcfg");
+ if (p != root->value().attributes.end()) {
+ score = std::atof(p->second.c_str());
+ }
+ n->SetPcfgScore(score);
}
- const std::vector<ParseTree *> &children = root->GetChildren();
+ const std::vector<SyntaxTree *> &children = root->children();
std::vector<Node *> childNodes;
childNodes.reserve(children.size());
- for (std::vector<ParseTree *>::const_iterator p(children.begin());
+ for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
p != children.end(); ++p) {
Node *child = CopyParseTree(*p);
child->AddParent(n.get());
@@ -385,4 +393,4 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h
index cf26b8c27..032b946f0 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.h
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.h
@@ -21,26 +21,27 @@
#ifndef EXTRACT_GHKM_ALIGNMENT_GRAPH_H_
#define EXTRACT_GHKM_ALIGNMENT_GRAPH_H_
-#include "Alignment.h"
-#include "Options.h"
-
#include <set>
#include <string>
#include <vector>
-namespace Moses
+#include "SyntaxTree.h"
+
+#include "Alignment.h"
+#include "Options.h"
+
+namespace MosesTraining
{
namespace GHKM
{
class Node;
-class ParseTree;
class Subgraph;
class AlignmentGraph
{
public:
- AlignmentGraph(const ParseTree *,
+ AlignmentGraph(const SyntaxTree *,
const std::vector<std::string> &,
const Alignment &);
@@ -61,7 +62,7 @@ private:
AlignmentGraph(const AlignmentGraph &);
AlignmentGraph &operator=(const AlignmentGraph &);
- Node *CopyParseTree(const ParseTree *);
+ Node *CopyParseTree(const SyntaxTree *);
void ComputeFrontierSet(Node *, const Options &, std::set<Node *> &) const;
void CalcComplementSpans(Node *);
void GetTargetTreeLeaves(Node *, std::vector<Node *> &);
@@ -77,6 +78,6 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp
index e9fc826b7..d322a255f 100644
--- a/phrase-extract/extract-ghkm/ComposedRule.cpp
+++ b/phrase-extract/extract-ghkm/ComposedRule.cpp
@@ -19,15 +19,15 @@
#include "ComposedRule.h"
-#include "Node.h"
-#include "Options.h"
-#include "Subgraph.h"
-
#include <set>
#include <vector>
#include <queue>
-namespace Moses
+#include "Node.h"
+#include "Options.h"
+#include "Subgraph.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -128,4 +128,4 @@ Subgraph ComposedRule::CreateSubgraph()
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h
index b5f72a492..d456fd27c 100644
--- a/phrase-extract/extract-ghkm/ComposedRule.h
+++ b/phrase-extract/extract-ghkm/ComposedRule.h
@@ -21,12 +21,12 @@
#ifndef EXTRACT_GHKM_COMPOSED_RULE_H_
#define EXTRACT_GHKM_COMPOSED_RULE_H_
-#include "Subgraph.h"
-
#include <vector>
#include <queue>
-namespace Moses
+#include "Subgraph.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -67,6 +67,6 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h
index a1e623cd1..99e1067f4 100644
--- a/phrase-extract/extract-ghkm/Exception.h
+++ b/phrase-extract/extract-ghkm/Exception.h
@@ -23,7 +23,7 @@
#include <string>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -41,6 +41,6 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index b64dc7aec..c2ee43767 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -19,44 +19,50 @@
#include "ExtractGHKM.h"
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include "syntax-common/xml_tree_parser.h"
+
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+#include "SyntaxNode.h"
+#include "SyntaxNodeCollection.h"
+#include "SyntaxTree.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
#include "Alignment.h"
#include "AlignmentGraph.h"
#include "Exception.h"
-#include "InputFileStream.h"
#include "Node.h"
-#include "OutputFileStream.h"
#include "Options.h"
-#include "ParseTree.h"
#include "PhraseOrientation.h"
#include "ScfgRule.h"
#include "ScfgRuleWriter.h"
#include "Span.h"
#include "StsgRule.h"
#include "StsgRuleWriter.h"
-#include "SyntaxTree.h"
-#include "tables-core.h"
-#include "XmlException.h"
-#include "XmlTree.h"
-#include "XmlTreeParser.h"
-
-#include <boost/program_options.hpp>
-#include <cassert>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <string>
-#include <sstream>
-#include <vector>
-
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
int ExtractGHKM::Main(int argc, char *argv[])
{
+ using Moses::InputFileStream;
+ using Moses::OutputFileStream;
+
// Process command-line options.
Options options;
ProcessOptions(argc, argv, options);
@@ -113,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[])
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
}
- // Target label sets for producing glue grammar.
- std::set<std::string> targetLabelSet;
- std::map<std::string, int> targetTopLabelSet;
-
- // Source label sets for producing glue grammar.
- std::set<std::string> sourceLabelSet;
- std::map<std::string, int> sourceTopLabelSet;
-
// Word count statistics for producing unknown word labels.
std::map<std::string, int> targetWordCount;
std::map<std::string, std::string> targetWordLabel;
@@ -133,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
- XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
-// XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
+ Syntax::XmlTreeParser targetXmlTreeParser;
+ Syntax::XmlTreeParser sourceXmlTreeParser;
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
@@ -158,7 +156,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::cerr << "skipping line " << lineNum << " with empty target tree\n";
continue;
}
- std::auto_ptr<ParseTree> targetParseTree;
+ std::auto_ptr<SyntaxTree> targetParseTree;
try {
targetParseTree = targetXmlTreeParser.Parse(targetLine);
assert(targetParseTree.get());
@@ -171,38 +169,14 @@ int ExtractGHKM::Main(int argc, char *argv[])
Error(oss.str());
}
-
- // Parse source tree and construct a SyntaxTree object.
- MosesTraining::SyntaxTree sourceSyntaxTree;
- MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL;
-
- if (options.sourceLabels) {
- try {
- if (!ProcessAndStripXMLTags(sourceLine, sourceSyntaxTree, sourceLabelSet, sourceTopLabelSet, false)) {
- throw Exception("");
- }
- sourceSyntaxTree.ConnectNodes();
- sourceSyntaxTreeRoot = sourceSyntaxTree.GetTop();
- assert(sourceSyntaxTreeRoot);
- } catch (const Exception &e) {
- std::ostringstream oss;
- oss << "Failed to parse source XML tree at line " << lineNum;
- if (!e.GetMsg().empty()) {
- oss << ": " << e.GetMsg();
- }
- Error(oss.str());
- }
- }
-
- // Read source tokens.
- std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
-
- // Construct a source ParseTree object from the SyntaxTree object.
- std::auto_ptr<ParseTree> sourceParseTree;
-
- if (options.sourceLabels) {
+ // Read source tokens (and parse tree if using source labels).
+ std::vector<std::string> sourceTokens;
+ std::auto_ptr<SyntaxTree> sourceParseTree;
+ if (!options.sourceLabels) {
+ sourceTokens = ReadTokens(sourceLine);
+ } else {
try {
- sourceParseTree = XmlTreeParser::ConvertTree(*sourceSyntaxTreeRoot, sourceTokens);
+ sourceParseTree = sourceXmlTreeParser.Parse(sourceLine);
assert(sourceParseTree.get());
} catch (const Exception &e) {
std::ostringstream oss;
@@ -212,9 +186,9 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
Error(oss.str());
}
+ sourceTokens = sourceXmlTreeParser.words();
}
-
// Read word alignments.
try {
ReadAlignment(alignmentLine, alignment);
@@ -234,12 +208,14 @@ int ExtractGHKM::Main(int argc, char *argv[])
// Record word counts.
if (!options.targetUnknownWordFile.empty()) {
- CollectWordLabelCounts(*targetParseTree, options, targetWordCount, targetWordLabel);
+ CollectWordLabelCounts(*targetParseTree, options, targetWordCount,
+ targetWordLabel);
}
// Record word counts: source side.
if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
- CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, sourceWordLabel);
+ CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount,
+ sourceWordLabel);
}
// Form an alignment graph from the target tree, source words, and
@@ -255,7 +231,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
// Initialize phrase orientation scoring object
- PhraseOrientation phraseOrientation( sourceTokens.size(), targetXmlTreeParser.GetWords().size(), alignment);
+ PhraseOrientation phraseOrientation(sourceTokens.size(),
+ targetXmlTreeParser.words().size(), alignment);
// Write the rules, subject to scope pruning.
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
@@ -264,12 +241,12 @@ int ExtractGHKM::Main(int argc, char *argv[])
const std::vector<const Subgraph *> &rules = (*p)->GetRules();
- Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN;
+ PhraseOrientation::REO_CLASS l2rOrientation=PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=PhraseOrientation::REO_CLASS_UNKNOWN;
if (options.phraseOrientation && !rules.empty()) {
int sourceSpanBegin = *((*p)->GetSpan().begin());
int sourceSpanEnd = *((*p)->GetSpan().rbegin());
- l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_L2R);
- r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_R2L);
+ l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_L2R);
+ r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_R2L);
// std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl;
// std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl;
}
@@ -287,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
// SCFG output.
ScfgRule *r = 0;
if (options.sourceLabels) {
- r = new ScfgRule(**q, &sourceSyntaxTree);
+ r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
} else {
r = new ScfgRule(**q);
}
@@ -310,8 +287,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
fwdExtractStream << " ";
phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation);
fwdExtractStream << "}}";
- phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_L2R,l2rOrientation,1);
- phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_R2L,r2lOrientation,1);
+ phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_L2R,l2rOrientation,1);
+ phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,r2lOrientation,1);
}
fwdExtractStream << std::endl;
invExtractStream << std::endl;
@@ -330,21 +307,36 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::map<std::string,size_t> sourceLabels;
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
-
- sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
- sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
- sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar
- sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
+ std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
+ extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side)
+ extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side)
+ extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar
+ extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar
size_t index = 0;
- for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
- iter!=sourceLabelSet.end(); ++iter, ++index) {
+ for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
+ iter!=extendedLabelSet.end(); ++iter, ++index) {
sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
}
WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
}
+ std::set<std::string> strippedTargetLabelSet;
+ std::map<std::string, int> strippedTargetTopLabelSet;
+ if (options.stripBitParLabels &&
+ (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
+ StripBitParLabels(targetXmlTreeParser.label_set(),
+ targetXmlTreeParser.top_label_set(),
+ strippedTargetLabelSet, strippedTargetTopLabelSet);
+ }
+
if (!options.glueGrammarFile.empty()) {
- WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
+ if (options.stripBitParLabels) {
+ WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
+ } else {
+ WriteGlueGrammar(targetXmlTreeParser.label_set(),
+ targetXmlTreeParser.top_label_set(),
+ sourceLabels, options, glueGrammarStream);
+ }
}
if (!options.targetUnknownWordFile.empty()) {
@@ -356,45 +348,17 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
if (!options.unknownWordSoftMatchesFile.empty()) {
- WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
+ if (options.stripBitParLabels) {
+ WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
+ } else {
+ WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
+ unknownWordSoftMatchesStream);
+ }
}
return 0;
}
-void ExtractGHKM::OpenInputFileOrDie(const std::string &filename,
- std::ifstream &stream)
-{
- stream.open(filename.c_str());
- if (!stream) {
- std::ostringstream msg;
- msg << "failed to open input file: " << filename;
- Error(msg.str());
- }
-}
-
-void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename,
- std::ofstream &stream)
-{
- stream.open(filename.c_str());
- if (!stream) {
- std::ostringstream msg;
- msg << "failed to open output file: " << filename;
- Error(msg.str());
- }
-}
-
-void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename,
- OutputFileStream &stream)
-{
- bool ret = stream.Open(filename);
- if (!ret) {
- std::ostringstream msg;
- msg << "failed to open output file: " << filename;
- Error(msg.str());
- }
-}
-
void ExtractGHKM::ProcessOptions(int argc, char *argv[],
Options &options) const
{
@@ -404,7 +368,7 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
// Construct the 'top' of the usage message: the bit that comes before the
// options list.
std::ostringstream usageTop;
- usageTop << "Usage: " << GetName()
+ usageTop << "Usage: " << name()
<< " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n"
<< "SCFG rule extractor based on the GHKM algorithm described in\n"
<< "Galley et al. (2004).\n\n"
@@ -415,11 +379,22 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
usageBottom << "\nImplementation Notes:\n"
<< "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n"
<< "\n"
- << "For the composed rule constraints: rule depth is the maximum distance from the\nrule's root node to a sink node, not counting preterminal expansions or word\nalignments. Rule size is the measure defined in DeNeefe et al (2007): the\nnumber of non-part-of-speech, non-leaf constituent labels in the target tree.\nNode count is the number of target tree nodes (excluding target words).\n"
+ << "For the composed rule constraints: rule depth is the "
+ "maximum distance from the\nrule's root node to a sink "
+ "node, not counting preterminal expansions or word\n"
+ "alignments. Rule size is the measure defined in DeNeefe "
+ "et al (2007): the\nnumber of non-part-of-speech, non-leaf "
+ "constituent labels in the target tree.\nNode count is the "
+ "number of target tree nodes (excluding target words).\n"
<< "\n"
<< "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n"
<< "\n"
- << "Unaligned source words are attached to the tree using the following heuristic:\nif there are aligned source words to both the left and the right of an unaligned\nsource word then it is attached to the lowest common ancestor of its nearest\nsuch left and right neighbours. Otherwise, it is attached to the root of the\nparse tree.\n"
+ << "Unaligned source words are attached to the tree using the "
+ "following heuristic:\nif there are aligned source words to "
+ "both the left and the right of an unaligned\nsource word "
+ "then it is attached to the lowest common ancestor of its "
+ "nearest\nsuch left and right neighbours. Otherwise, it is "
+ "attached to the root of the\nparse tree.\n"
<< "\n"
<< "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. (2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n"
<< "\n"
@@ -466,11 +441,15 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
("Minimal",
"extract minimal rules only")
("PartsOfSpeech",
- "output parts-of-speech information (preterminals from the parse tree)")
+ "output parts-of-speech as property (preterminals from the parse tree)")
+ ("PartsOfSpeechFactor",
+ "output parts-of-speech as factor (preterminals from the parse tree)")
("PCFG",
"include score based on PCFG scores in target corpus")
("PhraseOrientation",
"output phrase orientation information")
+ ("StripBitParLabels",
+ "strip suffix starting with a hyphen symbol (\"-\") from non-terminal labels")
("STSG",
"output STSG rules (default is SCFG)")
("T2S",
@@ -535,11 +514,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
// Process the command-line.
po::variables_map vm;
- const int optionStyle = cls::allow_long
- | cls::long_allow_adjacent
- | cls::long_allow_next;
try {
- po::store(po::command_line_parser(argc, argv).style(optionStyle).
+ po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
options(cmdLineOptions).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
@@ -582,12 +558,18 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("PartsOfSpeech")) {
options.partsOfSpeech = true;
}
+ if (vm.count("PartsOfSpeechFactor")) {
+ options.partsOfSpeechFactor = true;
+ }
if (vm.count("PCFG")) {
options.pcfg = true;
}
if (vm.count("PhraseOrientation")) {
options.phraseOrientation = true;
}
+ if (vm.count("StripBitParLabels")) {
+ options.stripBitParLabels = true;
+ }
if (vm.count("STSG")) {
options.stsg = true;
}
@@ -617,12 +599,6 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
}
}
-void ExtractGHKM::Error(const std::string &msg) const
-{
- std::cerr << GetName() << ": " << msg << std::endl;
- std::exit(1);
-}
-
std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s) const
{
std::vector<std::string> tokens;
@@ -657,7 +633,7 @@ void ExtractGHKM::WriteGlueGrammar(
const std::map<std::string, int> &topLabelSet,
const std::map<std::string,size_t> &sourceLabels,
const Options &options,
- std::ostream &out)
+ std::ostream &out) const
{
// choose a top label that is not already a label
std::string topLabel = "QQQQQQ";
@@ -672,17 +648,31 @@ void ExtractGHKM::WriteGlueGrammar(
const size_t sourceLabelGlueX = 1;
const size_t sourceLabelSentenceStart = 2;
const size_t sourceLabelSentenceEnd = 3;
- const size_t partOfSpeechSentenceStart = 0;
- const size_t partOfSpeechSentenceEnd = 1;
- std::string sentenceStart = "<s>";
- std::string sentenceEnd = "</s>";
+// const size_t partOfSpeechSentenceStart = 0;
+// const size_t partOfSpeechSentenceEnd = 1;
+
+#ifndef BOS_
+#define BOS_ "<s>" //Beginning of sentence symbol
+#endif
+#ifndef EOS_
+#define EOS_ "</s>" //End of sentence symbol
+#endif
+
+ std::string sentenceStartSource = BOS_;
+ std::string sentenceEndSource = EOS_;
+ std::string sentenceStartTarget = BOS_;
+ std::string sentenceEndTarget = EOS_;
if (options.partsOfSpeech) {
- sentenceStart = sentenceStart + "|" + sentenceStart;
- sentenceEnd = sentenceEnd + "|" + sentenceEnd;
+ sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
+ sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
+ }
+ if (options.partsOfSpeechFactor) {
+ sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
+ sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
}
// basic rules
- out << sentenceStart << " [X] ||| " << sentenceStart << " [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| |||";
+ out << sentenceStartSource << " [X] ||| " << sentenceStartTarget << " [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| |||";
if (options.treeFragments) {
out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
}
@@ -697,7 +687,7 @@ void ExtractGHKM::WriteGlueGrammar(
}
out << std::endl;
- out << "[X][" << topLabel << "] " << sentenceEnd << " [X] ||| [X][" << topLabel << "] " << sentenceEnd << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 ||| ||| |||";
+ out << "[X][" << topLabel << "] " << sentenceEndSource << " [X] ||| [X][" << topLabel << "] " << sentenceEndTarget << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 ||| ||| |||";
if (options.treeFragments) {
out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
}
@@ -715,7 +705,7 @@ void ExtractGHKM::WriteGlueGrammar(
// top rules
for (std::map<std::string, int>::const_iterator i = topLabelSet.begin();
i != topLabelSet.end(); ++i) {
- out << sentenceStart << " [X][" << i->first << "] " << sentenceEnd << " [X] ||| " << sentenceStart << " [X][" << i->first << "] " << sentenceEnd << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2 ||| ||| |||";
+ out << sentenceStartSource << " [X][" << i->first << "] " << sentenceEndSource << " [X] ||| " << sentenceStartTarget << " [X][" << i->first << "] " << sentenceEndTarget << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2 ||| ||| |||";
if (options.treeFragments) {
out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
}
@@ -763,7 +753,7 @@ void ExtractGHKM::WriteGlueGrammar(
void ExtractGHKM::WriteSourceLabelSet(
const std::map<std::string,size_t> &sourceLabels,
- std::ostream &out)
+ std::ostream &out) const
{
out << sourceLabels.size() << std::endl;
for (std::map<std::string,size_t>::const_iterator iter=sourceLabels.begin();
@@ -773,43 +763,39 @@ void ExtractGHKM::WriteSourceLabelSet(
}
void ExtractGHKM::CollectWordLabelCounts(
- ParseTree &root,
+ SyntaxTree &root,
const Options &options,
std::map<std::string, int> &wordCount,
std::map<std::string, std::string> &wordLabel)
{
- std::vector<const ParseTree*> leaves;
- root.GetLeaves(std::back_inserter(leaves));
- for (std::vector<const ParseTree *>::const_iterator p = leaves.begin();
- p != leaves.end(); ++p) {
- const ParseTree &leaf = **p;
- const std::string &word = leaf.GetLabel();
- const ParseTree *ancestor = leaf.GetParent();
+ for (SyntaxTree::ConstLeafIterator p(root);
+ p != SyntaxTree::ConstLeafIterator(); ++p) {
+ const SyntaxTree &leaf = *p;
+ const std::string &word = leaf.value().label;
+ const SyntaxTree *ancestor = leaf.parent();
// If unary rule elimination is enabled and this word is at the end of a
// chain of unary rewrites, e.g.
// PN-SB -> NE -> word
// then record the constituent label at the top of the chain instead of
// the part-of-speech label.
while (!options.allowUnary &&
- ancestor->GetParent() &&
- ancestor->GetParent()->GetChildren().size() == 1) {
- ancestor = ancestor->GetParent();
+ ancestor->parent() &&
+ ancestor->parent()->children().size() == 1) {
+ ancestor = ancestor->parent();
}
- const std::string &label = ancestor->GetLabel();
+ const std::string &label = ancestor->value().label;
++wordCount[word];
wordLabel[word] = label;
}
}
-std::vector<std::string> ExtractGHKM::ReadTokens(const ParseTree &root) const
+std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
{
std::vector<std::string> tokens;
- std::vector<const ParseTree*> leaves;
- root.GetLeaves(std::back_inserter(leaves));
- for (std::vector<const ParseTree *>::const_iterator p = leaves.begin();
- p != leaves.end(); ++p) {
- const ParseTree &leaf = **p;
- const std::string &word = leaf.GetLabel();
+ for (SyntaxTree::ConstLeafIterator p(root);
+ p != SyntaxTree::ConstLeafIterator(); ++p) {
+ const SyntaxTree &leaf = *p;
+ const std::string &word = leaf.value().label;
tokens.push_back(word);
}
return tokens;
@@ -820,7 +806,7 @@ void ExtractGHKM::WriteUnknownWordLabel(
const std::map<std::string, std::string> &wordLabel,
const Options &options,
std::ostream &out,
- bool writeCounts)
+ bool writeCounts) const
{
if (!options.unknownWordSoftMatchesFile.empty()) {
out << "UNK 1" << std::endl;
@@ -836,7 +822,16 @@ void ExtractGHKM::WriteUnknownWordLabel(
std::map<std::string, std::string>::const_iterator q =
wordLabel.find(p->first);
assert(q != wordLabel.end());
- ++labelCount[q->second];
+ if (options.stripBitParLabels) {
+ size_t pos = q->second.find('-');
+ if (pos == std::string::npos) {
+ ++labelCount[q->second];
+ } else {
+ ++labelCount[q->second.substr(0,pos)];
+ }
+ } else {
+ ++labelCount[q->second];
+ }
++total;
}
}
@@ -859,7 +854,7 @@ void ExtractGHKM::WriteUnknownWordLabel(
void ExtractGHKM::WriteUnknownWordSoftMatches(
const std::set<std::string> &labelSet,
- std::ostream &out)
+ std::ostream &out) const
{
for (std::set<std::string>::const_iterator p = labelSet.begin(); p != labelSet.end(); ++p) {
std::string label = *p;
@@ -867,5 +862,38 @@ void ExtractGHKM::WriteUnknownWordSoftMatches(
}
}
+void ExtractGHKM::StripBitParLabels(
+ const std::set<std::string> &labelSet,
+ const std::map<std::string, int> &topLabelSet,
+ std::set<std::string> &outLabelSet,
+ std::map<std::string, int> &outTopLabelSet) const
+{
+ for (std::set<std::string>::const_iterator it=labelSet.begin();
+ it!=labelSet.end(); ++it) {
+ size_t pos = it->find('-');
+ if (pos == std::string::npos) {
+ outLabelSet.insert(*it);
+ } else {
+ outLabelSet.insert(it->substr(0,pos));
+ }
+ }
+ for (std::map<std::string,int>::const_iterator it=topLabelSet.begin();
+ it!=topLabelSet.end(); ++it) {
+ size_t pos = it->first.find('-');
+ std::string stripped;
+ if (pos == std::string::npos) {
+ stripped = it->first;
+ } else {
+ stripped = it->first.substr(0,pos);
+ }
+ std::map<std::string, int>::iterator found=outTopLabelSet.find(stripped);
+ if (found != outTopLabelSet.end()) {
+ found->second += it->second;
+ } else {
+ outTopLabelSet.insert(std::pair<std::string,int>(stripped,it->second));
+ }
+ }
+}
+
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h
index df54ed250..0d0fa8bf1 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.h
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.h
@@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
-#ifndef EXTRACT_GHKM_EXTRACT_GHKM_H_
-#define EXTRACT_GHKM_EXTRACT_GHKM_H_
#include <map>
#include <ostream>
@@ -27,32 +25,28 @@
#include <string>
#include <vector>
-namespace Moses
-{
+#include "OutputFileStream.h"
+#include "SyntaxTree.h"
-class OutputFileStream;
+#include "syntax-common/tool.h"
+namespace MosesTraining
+{
namespace GHKM
{
struct Options;
-class ParseTree;
-class ExtractGHKM
+class ExtractGHKM : public Syntax::Tool
{
public:
- ExtractGHKM() : m_name("extract-ghkm") {}
- const std::string &GetName() const {
- return m_name;
- }
- int Main(int argc, char *argv[]);
+ ExtractGHKM() : Tool("extract-ghkm") {}
+
+ virtual int Main(int argc, char *argv[]);
+
private:
- void Error(const std::string &) const;
- void OpenInputFileOrDie(const std::string &, std::ifstream &);
- void OpenOutputFileOrDie(const std::string &, std::ofstream &);
- void OpenOutputFileOrDie(const std::string &, OutputFileStream &);
- void RecordTreeLabels(const ParseTree &, std::set<std::string> &);
- void CollectWordLabelCounts(ParseTree &,
+ void RecordTreeLabels(const SyntaxTree &, std::set<std::string> &);
+ void CollectWordLabelCounts(SyntaxTree &,
const Options &,
std::map<std::string, int> &,
std::map<std::string, std::string> &);
@@ -60,25 +54,26 @@ private:
const std::map<std::string, std::string> &,
const Options &,
std::ostream &,
- bool writeCounts=false);
+ bool writeCounts=false) const;
void WriteUnknownWordSoftMatches(const std::set<std::string> &,
- std::ostream &);
+ std::ostream &) const;
void WriteGlueGrammar(const std::set<std::string> &,
const std::map<std::string, int> &,
const std::map<std::string,size_t> &,
const Options &,
- std::ostream &);
+ std::ostream &) const;
void WriteSourceLabelSet(const std::map<std::string,size_t> &,
- std::ostream &);
+ std::ostream &) const;
+ void StripBitParLabels(const std::set<std::string> &labelSet,
+ const std::map<std::string, int> &topLabelSet,
+ std::set<std::string> &outLabelSet,
+ std::map<std::string, int> &outTopLabelSet) const;
+
std::vector<std::string> ReadTokens(const std::string &) const;
- std::vector<std::string> ReadTokens(const ParseTree &root) const;
+ std::vector<std::string> ReadTokens(const SyntaxTree &root) const;
void ProcessOptions(int, char *[], Options &) const;
-
- std::string m_name;
};
} // namespace GHKM
-} // namespace Moses
-
-#endif
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Jamfile b/phrase-extract/extract-ghkm/Jamfile
index f2d1ac5a8..4692937de 100644
--- a/phrase-extract/extract-ghkm/Jamfile
+++ b/phrase-extract/extract-ghkm/Jamfile
@@ -1 +1 @@
-exe extract-ghkm : [ glob *.cpp ] ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;
+exe extract-ghkm : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;
diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp
index 14064406b..64b3e0f00 100644
--- a/phrase-extract/extract-ghkm/Main.cpp
+++ b/phrase-extract/extract-ghkm/Main.cpp
@@ -21,6 +21,6 @@
int main(int argc, char *argv[])
{
- Moses::GHKM::ExtractGHKM tool;
+ MosesTraining::GHKM::ExtractGHKM tool;
return tool.Main(argc, argv);
}
diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp
index e14d8c050..384db3306 100644
--- a/phrase-extract/extract-ghkm/Node.cpp
+++ b/phrase-extract/extract-ghkm/Node.cpp
@@ -21,7 +21,7 @@
#include "Subgraph.h"
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -70,4 +70,4 @@ void Node::GetTargetWords(std::vector<std::string> &targetWords) const
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h
index 2eed01311..71a24b28e 100644
--- a/phrase-extract/extract-ghkm/Node.h
+++ b/phrase-extract/extract-ghkm/Node.h
@@ -21,14 +21,14 @@
#ifndef EXTRACT_GHKM_NODE_H_
#define EXTRACT_GHKM_NODE_H_
-#include "Span.h"
-
#include <cassert>
#include <iterator>
#include <string>
#include <vector>
-namespace Moses
+#include "Span.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -215,6 +215,6 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h
index 33d59e290..f694fb55c 100644
--- a/phrase-extract/extract-ghkm/Options.h
+++ b/phrase-extract/extract-ghkm/Options.h
@@ -18,12 +18,10 @@
***********************************************************************/
#pragma once
-#ifndef EXTRACT_GHKM_OPTIONS_H_
-#define EXTRACT_GHKM_OPTIONS_H_
#include <string>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -41,10 +39,12 @@ public:
, maxScope(3)
, minimal(false)
, partsOfSpeech(false)
+ , partsOfSpeechFactor(false)
, pcfg(false)
, phraseOrientation(false)
, sentenceOffset(0)
, sourceLabels(false)
+ , stripBitParLabels(false)
, stsg(false)
, t2s(false)
, treeFragments(false)
@@ -70,12 +70,14 @@ public:
int maxScope;
bool minimal;
bool partsOfSpeech;
+ bool partsOfSpeechFactor;
bool pcfg;
bool phraseOrientation;
int sentenceOffset;
bool sourceLabels;
std::string sourceLabelSetFile;
std::string sourceUnknownWordFile;
+ bool stripBitParLabels;
bool stsg;
bool t2s;
std::string targetUnknownWordFile;
@@ -87,6 +89,5 @@ public:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
-#endif
diff --git a/phrase-extract/extract-ghkm/ParseTree.cpp b/phrase-extract/extract-ghkm/ParseTree.cpp
deleted file mode 100644
index f86486487..000000000
--- a/phrase-extract/extract-ghkm/ParseTree.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2011 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include "ParseTree.h"
-
-namespace Moses
-{
-namespace GHKM
-{
-
-ParseTree::~ParseTree()
-{
- for (std::vector<ParseTree*>::iterator p(m_children.begin());
- p != m_children.end(); ++p) {
- delete *p;
- }
-}
-
-void ParseTree::SetChildren(const std::vector<ParseTree*> &children)
-{
- m_children = children;
-}
-
-void ParseTree::SetParent(ParseTree *parent)
-{
- m_parent = parent;
-}
-
-void ParseTree::AddChild(ParseTree *child)
-{
- m_children.push_back(child);
-}
-
-bool ParseTree::IsLeaf() const
-{
- return m_children.empty();
-}
-
-} // namespace GHKM
-} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h
deleted file mode 100644
index 694286c9d..000000000
--- a/phrase-extract/extract-ghkm/ParseTree.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2011 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef EXTRACT_GHKM_PARSE_TREE_H_
-#define EXTRACT_GHKM_PARSE_TREE_H_
-
-#include <string>
-#include <vector>
-
-namespace Moses
-{
-namespace GHKM
-{
-
-class ParseTree
-{
-public:
- ParseTree(const std::string &label)
- : m_label(label)
- , m_parent(0)
- , m_pcfgScore(0.0) {}
-
- ~ParseTree();
-
- const std::string &GetLabel() const {
- return m_label;
- }
- const std::vector<ParseTree*> &GetChildren() const {
- return m_children;
- }
- const ParseTree *GetParent() const {
- return m_parent;
- }
- float GetPcfgScore() const {
- return m_pcfgScore;
- }
-
- void SetParent(ParseTree *);
- void SetChildren(const std::vector<ParseTree*> &);
- void SetPcfgScore(float score) {
- m_pcfgScore = score;
- }
-
- void AddChild(ParseTree *);
-
- bool IsLeaf() const;
-
- template<typename OutputIterator>
- void GetLeaves(OutputIterator) const;
-
-private:
- // Disallow copying
- ParseTree(const ParseTree &);
- ParseTree &operator=(const ParseTree &);
-
- std::string m_label;
- std::vector<ParseTree*> m_children;
- ParseTree *m_parent;
- float m_pcfgScore; // log probability
-};
-
-template<typename OutputIterator>
-void ParseTree::GetLeaves(OutputIterator result) const
-{
- if (IsLeaf()) {
- *result++ = this;
- } else {
- std::vector<ParseTree *>::const_iterator p = m_children.begin();
- std::vector<ParseTree *>::const_iterator end = m_children.end();
- while (p != end) {
- ParseTree &child = **p++;
- child.GetLeaves(result);
- }
- }
-}
-
-} // namespace GHKM
-} // namespace Moses
-
-#endif
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
index 8f1356cb3..57952d580 100644
--- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
@@ -26,7 +26,7 @@
#include <boost/assign/list_of.hpp>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -469,5 +469,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h
index d826c127c..572124e61 100644
--- a/phrase-extract/extract-ghkm/PhraseOrientation.h
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.h
@@ -1,4 +1,3 @@
-
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
@@ -20,16 +19,18 @@
#pragma once
-#include "Alignment.h"
-#include "moses/AlignmentInfo.h"
-
#include <map>
#include <set>
#include <string>
#include <vector>
+
#include <boost/unordered_map.hpp>
-namespace Moses
+#include "moses/AlignmentInfo.h"
+
+#include "Alignment.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -53,8 +54,8 @@ public:
PhraseOrientation(int sourceSize,
int targetSize,
- const AlignmentInfo &alignTerm,
- const AlignmentInfo &alignNonTerm);
+ const Moses::AlignmentInfo &alignTerm,
+ const Moses::AlignmentInfo &alignNonTerm);
REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const;
REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const;
@@ -119,5 +120,4 @@ private:
};
} // namespace GHKM
-} // namespace Moses
-
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Rule.cpp b/phrase-extract/extract-ghkm/Rule.cpp
index da6b2ff23..1b7207c3c 100644
--- a/phrase-extract/extract-ghkm/Rule.cpp
+++ b/phrase-extract/extract-ghkm/Rule.cpp
@@ -3,7 +3,7 @@
#include "Node.h"
#include "Subgraph.h"
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -38,4 +38,4 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Rule.h b/phrase-extract/extract-ghkm/Rule.h
index 36e24c799..b87934735 100644
--- a/phrase-extract/extract-ghkm/Rule.h
+++ b/phrase-extract/extract-ghkm/Rule.h
@@ -7,7 +7,7 @@
#include "Alignment.h"
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -54,6 +54,6 @@ protected:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp
index 5a196698e..1a49c862e 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -19,23 +19,25 @@
#include "ScfgRule.h"
+#include <algorithm>
+
#include "Node.h"
#include "Subgraph.h"
-#include "SyntaxTree.h"
-
-#include <algorithm>
+#include "SyntaxNode.h"
+#include "SyntaxNodeCollection.h"
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
ScfgRule::ScfgRule(const Subgraph &fragment,
- const MosesTraining::SyntaxTree *sourceSyntaxTree)
- : m_sourceLHS("X", NonTerminal)
+ const SyntaxNodeCollection *sourceNodeCollection)
+ : m_graphFragment(fragment)
+ , m_sourceLHS("X", NonTerminal)
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
, m_pcfgScore(fragment.GetPcfgScore())
- , m_hasSourceLabels(sourceSyntaxTree)
+ , m_hasSourceLabels(sourceNodeCollection)
{
// Source RHS
@@ -80,9 +82,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment,
}
}
}
- if (sourceSyntaxTree) {
+ if (sourceNodeCollection) {
// Source syntax label
- PushSourceLabel(sourceSyntaxTree,&sinkNode,"XRHS");
+ PushSourceLabel(sourceNodeCollection,&sinkNode,"XRHS");
}
}
@@ -123,26 +125,26 @@ ScfgRule::ScfgRule(const Subgraph &fragment,
}
}
- if (sourceSyntaxTree) {
- // Source syntax label for root node (if sourceSyntaxTree available)
- PushSourceLabel(sourceSyntaxTree,fragment.GetRoot(),"XLHS");
+ if (sourceNodeCollection) {
+ // Source syntax label for root node (if sourceNodeCollection available)
+ PushSourceLabel(sourceNodeCollection,fragment.GetRoot(),"XLHS");
// All non-terminal spans (including the LHS) should have obtained a label
// (a source-side syntactic constituent label if the span matches, "XLHS" otherwise)
// assert(m_sourceLabels.size() == m_numberOfNonTerminals+1);
}
}
-void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
+void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection,
const Node *node,
const std::string &nonMatchingLabel)
{
ContiguousSpan span = Closure(node->GetSpan());
- if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span?
- std::vector<MosesTraining::SyntaxNode*> sourceLabels =
- sourceSyntaxTree->GetNodes(span.first,span.second);
+ if (sourceNodeCollection->HasNode(span.first,span.second)) { // does a source constituent match the span?
+ std::vector<SyntaxNode*> sourceLabels =
+ sourceNodeCollection->GetNodes(span.first,span.second);
if (!sourceLabels.empty()) {
// store the topmost matching label from the source syntax tree
- m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
+ m_sourceLabels.push_back(sourceLabels.back()->label);
}
} else {
// no matching source-side syntactic constituent: store nonMatchingLabel
@@ -195,4 +197,4 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h
index 1b210e0d2..439c19fd7 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/phrase-extract/extract-ghkm/ScfgRule.h
@@ -18,12 +18,6 @@
***********************************************************************/
#pragma once
-#ifndef EXTRACT_GHKM_SCFG_RULE_H_
-#define EXTRACT_GHKM_SCFG_RULE_H_
-
-#include "Alignment.h"
-#include "Rule.h"
-#include "SyntaxTree.h"
#include <string>
#include <vector>
@@ -31,7 +25,11 @@
#include <memory>
#include <iostream>
-namespace Moses
+#include "Alignment.h"
+#include "Rule.h"
+#include "SyntaxNodeCollection.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -43,8 +41,11 @@ class ScfgRule : public Rule
{
public:
ScfgRule(const Subgraph &fragment,
- const MosesTraining::SyntaxTree *sourceSyntaxTree = 0);
+ const SyntaxNodeCollection *sourceNodeCollection = 0);
+ const Subgraph &GetGraphFragment() const {
+ return m_graphFragment;
+ }
const Symbol &GetSourceLHS() const {
return m_sourceLHS;
}
@@ -77,10 +78,10 @@ public:
}
private:
- void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
- const Node *node,
- const std::string &nonMatchingLabel);
+ void PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection,
+ const Node *node, const std::string &nonMatchingLabel);
+ const Subgraph& m_graphFragment;
Symbol m_sourceLHS;
Symbol m_targetLHS;
std::vector<Symbol> m_sourceRHS;
@@ -92,6 +93,4 @@ private:
};
} // namespace GHKM
-} // namespace Moses
-
-#endif
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index 4d75c0dc7..b513ecdaf 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -19,10 +19,6 @@
#include "ScfgRuleWriter.h"
-#include "Alignment.h"
-#include "Options.h"
-#include "ScfgRule.h"
-
#include <cassert>
#include <cmath>
#include <ostream>
@@ -30,7 +26,11 @@
#include <sstream>
#include <vector>
-namespace Moses
+#include "Alignment.h"
+#include "Options.h"
+#include "ScfgRule.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -121,6 +121,13 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
}
}
+ // If parts-of-speech as a factor requested: retrieve preterminals from graph fragment
+ std::vector<std::string> partsOfSpeech;
+ if (m_options.partsOfSpeechFactor) {
+ const Subgraph &graphFragment = rule.GetGraphFragment();
+ graphFragment.GetPartsOfSpeech(partsOfSpeech);
+ }
+
// Write the source side of the rule to sourceSS.
int i = 0;
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
@@ -140,6 +147,7 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
// Write the target side of the rule to targetSS.
i = 0;
+ int targetTerminalIndex = 0;
for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
p != targetRHS.end(); ++p, ++i) {
if (p->GetType() == NonTerminal) {
@@ -147,6 +155,12 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
WriteSymbol(sourceRHS[sourceIndex], targetSS);
}
WriteSymbol(*p, targetSS);
+ // If parts-of-speech as a factor requested: write part-of-speech
+ if (m_options.partsOfSpeechFactor && (p->GetType() != NonTerminal)) {
+ assert(targetTerminalIndex<partsOfSpeech.size());
+ targetSS << "|" << partsOfSpeech[targetTerminalIndex];
+ ++targetTerminalIndex;
+ }
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
@@ -159,10 +173,16 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
+ // If parts-of-speech as a factor requested: retrieve preterminals from graph fragment
+ std::vector<std::string> partsOfSpeech;
+ if (m_options.partsOfSpeechFactor) {
+ const Subgraph &graphFragment = rule.GetGraphFragment();
+ graphFragment.GetPartsOfSpeech(partsOfSpeech);
+ }
+
// Write the source side of the rule to sourceSS.
- int i = 0;
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
- p != sourceRHS.end(); ++p, ++i) {
+ p != sourceRHS.end(); ++p) {
WriteSymbol(*p, sourceSS);
sourceSS << " ";
}
@@ -173,10 +193,16 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
}
// Write the target side of the rule to targetSS.
- i = 0;
+ int targetTerminalIndex = 0;
for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
- p != targetRHS.end(); ++p, ++i) {
+ p != targetRHS.end(); ++p) {
WriteSymbol(*p, targetSS);
+ // If parts-of-speech as a factor requested: write part-of-speech
+ if (m_options.partsOfSpeechFactor && (p->GetType() != NonTerminal)) {
+ assert(targetTerminalIndex<partsOfSpeech.size());
+ targetSS << "|" << partsOfSpeech[targetTerminalIndex];
+ ++targetTerminalIndex;
+ }
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
@@ -185,11 +211,22 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
{
if (symbol.GetType() == NonTerminal) {
- out << "[" << symbol.GetValue() << "]";
+ out << "[";
+ if (m_options.stripBitParLabels) {
+ size_t pos = symbol.GetValue().find('-');
+ if (pos == std::string::npos) {
+ out << symbol.GetValue();
+ } else {
+ out << symbol.GetValue().substr(0,pos);
+ }
+ } else {
+ out << symbol.GetValue();
+ }
+ out << "]";
} else {
out << symbol.GetValue();
}
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index e3edd6ee6..31358c57d 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -18,14 +18,12 @@
***********************************************************************/
#pragma once
-#ifndef EXTRACT_GHKM_RULE_WRITER_H_
-#define EXTRACT_GHKM_RULE_WRITER_H_
-
-#include "Subgraph.h"
#include <ostream>
-namespace Moses
+#include "Subgraph.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -59,6 +57,5 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
-#endif
diff --git a/phrase-extract/extract-ghkm/Span.cpp b/phrase-extract/extract-ghkm/Span.cpp
index d637ec3d2..f6636cebb 100644
--- a/phrase-extract/extract-ghkm/Span.cpp
+++ b/phrase-extract/extract-ghkm/Span.cpp
@@ -19,7 +19,7 @@
#include "Span.h"
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -45,4 +45,4 @@ ContiguousSpan Closure(const Span &s)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Span.h b/phrase-extract/extract-ghkm/Span.h
index c4d146c4e..90bed416a 100644
--- a/phrase-extract/extract-ghkm/Span.h
+++ b/phrase-extract/extract-ghkm/Span.h
@@ -24,7 +24,7 @@
#include <map>
#include <set>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -36,7 +36,7 @@ bool SpansIntersect(const Span &, const ContiguousSpan &);
ContiguousSpan Closure(const Span &);
-} // namespace Moses
+} // namespace MosesTraining
} // namespace GHKM
#endif
diff --git a/phrase-extract/extract-ghkm/StsgRule.cpp b/phrase-extract/extract-ghkm/StsgRule.cpp
index 83398f80a..10368e4c0 100644
--- a/phrase-extract/extract-ghkm/StsgRule.cpp
+++ b/phrase-extract/extract-ghkm/StsgRule.cpp
@@ -1,12 +1,11 @@
#include "StsgRule.h"
+#include <algorithm>
+
#include "Node.h"
#include "Subgraph.h"
-#include "SyntaxTree.h"
-
-#include <algorithm>
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -92,4 +91,4 @@ StsgRule::StsgRule(const Subgraph &fragment)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/StsgRule.h b/phrase-extract/extract-ghkm/StsgRule.h
index b14695c5c..a037a8d91 100644
--- a/phrase-extract/extract-ghkm/StsgRule.h
+++ b/phrase-extract/extract-ghkm/StsgRule.h
@@ -2,12 +2,12 @@
#ifndef EXTRACT_GHKM_STSG_RULE_H_
#define EXTRACT_GHKM_STSG_RULE_H_
+#include <vector>
+
#include "Rule.h"
#include "Subgraph.h"
-#include <vector>
-
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -39,6 +39,6 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp
index a9596b65c..32953bf68 100644
--- a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp
+++ b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp
@@ -1,9 +1,5 @@
#include "StsgRuleWriter.h"
-#include "Alignment.h"
-#include "Options.h"
-#include "StsgRule.h"
-
#include <cassert>
#include <cmath>
#include <ostream>
@@ -11,7 +7,11 @@
#include <sstream>
#include <vector>
-namespace Moses
+#include "Alignment.h"
+#include "Options.h"
+#include "StsgRule.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -92,4 +92,4 @@ void StsgRuleWriter::Write(const StsgRule &rule)
}
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.h b/phrase-extract/extract-ghkm/StsgRuleWriter.h
index efba44d2c..3f215a5c9 100644
--- a/phrase-extract/extract-ghkm/StsgRuleWriter.h
+++ b/phrase-extract/extract-ghkm/StsgRuleWriter.h
@@ -2,11 +2,11 @@
#ifndef EXTRACT_GHKM_STSG_RULE_WRITER_H_
#define EXTRACT_GHKM_STSG_RULE_WRITER_H_
-#include "Subgraph.h"
-
#include <ostream>
-namespace Moses
+#include "Subgraph.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -36,6 +36,6 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/extract-ghkm/Subgraph.cpp b/phrase-extract/extract-ghkm/Subgraph.cpp
index ae7a68b4d..f04c6982c 100644
--- a/phrase-extract/extract-ghkm/Subgraph.cpp
+++ b/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -18,10 +18,11 @@
***********************************************************************/
#include <iostream>
-#include "Subgraph.h"
+
#include "Node.h"
+#include "Subgraph.h"
-namespace Moses
+namespace MosesTraining
{
namespace GHKM
{
@@ -168,5 +169,30 @@ void Subgraph::RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) c
}
}
-} // namespace Moses
+void Subgraph::GetPartsOfSpeech(std::vector<std::string> &out) const
+{
+ out.clear();
+ RecursivelyGetPartsOfSpeech(m_root,out);
+}
+
+void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::string> &out) const
+{
+ NodeType nodeType = n->GetType();
+ if (nodeType == TREE) {
+ if (m_leaves.find(n) == m_leaves.end()) {
+ const std::vector<Node *> &children = n->GetChildren();
+ for (std::vector<Node *>::const_iterator p(children.begin());
+ p != children.end(); ++p) {
+ Node *child = *p;
+ if (child->GetType() == TARGET) {
+ out.push_back(n->GetLabel());
+ } else {
+ RecursivelyGetPartsOfSpeech(child,out);
+ }
+ }
+ }
+ }
+}
+
+} // namespace MosesTraining
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/Subgraph.h b/phrase-extract/extract-ghkm/Subgraph.h
index 815b4f968..a9c6dac48 100644
--- a/phrase-extract/extract-ghkm/Subgraph.h
+++ b/phrase-extract/extract-ghkm/Subgraph.h
@@ -18,15 +18,13 @@
***********************************************************************/
#pragma once
-#ifndef EXTRACT_GHKM_SUBGRAPH_H_
-#define EXTRACT_GHKM_SUBGRAPH_H_
-
-#include "Node.h"
#include <set>
#include <vector>
-namespace Moses
+#include "Node.h"
+
+namespace MosesTraining
{
namespace GHKM
{
@@ -118,6 +116,7 @@ public:
void GetTargetLeaves(std::vector<const Node *> &) const;
void PrintTree(std::ostream &out) const;
void PrintPartsOfSpeech(std::ostream &out) const;
+ void GetPartsOfSpeech(std::vector<std::string> &out) const;
private:
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
@@ -127,6 +126,7 @@ private:
int CountNodes(const Node *) const;
void RecursivelyPrintTree(const Node *n, std::ostream &out) const;
void RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const;
+ void RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::string> &out) const;
const Node *m_root;
std::set<const Node *> m_leaves;
@@ -137,6 +137,5 @@ private:
};
} // namespace GHKM
-} // namespace Moses
+} // namespace MosesTraining
-#endif
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
deleted file mode 100644
index 2f28c3244..000000000
--- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2011 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include "XmlTreeParser.h"
-
-#include "ParseTree.h"
-#include "tables-core.h"
-#include "XmlException.h"
-#include "XmlTree.h"
-
-#include <cassert>
-#include <vector>
-
-using namespace MosesTraining;
-
-namespace Moses
-{
-namespace GHKM
-{
-
-XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
- std::map<std::string, int> &topLabelSet)
- : m_labelSet(labelSet)
- , m_topLabelSet(topLabelSet)
-{
-}
-
-std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
-{
- m_line = line;
- m_tree.Clear();
- try {
- if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet,
- false)) {
- throw Exception("");
- }
- } catch (const XmlException &e) {
- throw Exception(e.getMsg());
- }
- m_tree.ConnectNodes();
- SyntaxNode *root = m_tree.GetTop();
- assert(root);
- m_words = tokenize(m_line.c_str());
- return ConvertTree(*root, m_words);
-}
-
-// Converts a SyntaxNode tree to a Moses::GHKM::ParseTree.
-std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
- const SyntaxNode &tree,
- const std::vector<std::string> &words)
-{
- std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel()));
- root->SetPcfgScore(tree.GetPcfgScore());
- const std::vector<SyntaxNode*> &children = tree.GetChildren();
- if (children.empty()) {
- if (tree.GetStart() != tree.GetEnd()) {
- std::ostringstream msg;
- msg << "leaf node covers multiple words (" << tree.GetStart()
- << "-" << tree.GetEnd() << "): this is currently unsupported";
- throw Exception(msg.str());
- }
- std::auto_ptr<ParseTree> leaf(new ParseTree(words[tree.GetStart()]));
- leaf->SetParent(root.get());
- root->AddChild(leaf.release());
- } else {
- for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
- p != children.end(); ++p) {
- assert(*p);
- std::auto_ptr<ParseTree> child = ConvertTree(**p, words);
- child->SetParent(root.get());
- root->AddChild(child.release());
- }
- }
- return root;
-}
-
-} // namespace GHKM
-} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h
deleted file mode 100644
index ff0baeace..000000000
--- a/phrase-extract/extract-ghkm/XmlTreeParser.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2011 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_
-#define EXTRACT_GHKM_XML_TREE_PARSER_H_
-
-#include "Exception.h"
-
-#include "SyntaxTree.h"
-
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-namespace Moses
-{
-namespace GHKM
-{
-
-class ParseTree;
-
-// Parses a string in Moses' XML parse tree format and returns a ParseTree
-// object.
-class XmlTreeParser
-{
-public:
- XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
- std::auto_ptr<ParseTree> Parse(const std::string &);
-
- static std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
- const std::vector<std::string> &);
-
- const std::vector<std::string>& GetWords() {
- return m_words;
- };
-
-private:
-
- std::set<std::string> &m_labelSet;
- std::map<std::string, int> &m_topLabelSet;
- std::string m_line;
- MosesTraining::SyntaxTree m_tree;
- std::vector<std::string> m_words;
-};
-
-} // namespace GHKM
-} // namespace Moses
-
-#endif
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 46c029eff..70d4cad35 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -11,8 +11,8 @@
#include <fstream>
#include <vector>
#include <string>
-#include <stdlib.h>
-#include <assert.h>
+#include <cstdlib>
+#include <cassert>
#include <cstring>
#include <sstream>
#include <map>
@@ -86,7 +86,14 @@ namespace MosesTraining
class ExtractTask
{
public:
- ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation, Moses::OutputFileStream &extractFileContext, Moses::OutputFileStream &extractFileContextInv):
+ ExtractTask(
+ size_t id, SentenceAlignment &sentence,
+ PhraseExtractionOptions &initoptions,
+ Moses::OutputFileStream &extractFile,
+ Moses::OutputFileStream &extractFileInv,
+ Moses::OutputFileStream &extractFileOrientation,
+ Moses::OutputFileStream &extractFileContext,
+ Moses::OutputFileStream &extractFileContextInv):
m_sentence(sentence),
m_options(initoptions),
m_extractFile(extractFile),
diff --git a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
index f03a61840..b1d64fc54 100644
--- a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
@@ -19,10 +19,10 @@ ConsistentPhrases::ConsistentPhrases()
ConsistentPhrases::~ConsistentPhrases()
{
- for (int start = 0; start < m_coll.size(); ++start) {
+ for (size_t start = 0; start < m_coll.size(); ++start) {
std::vector<Coll> &allSourceStart = m_coll[start];
- for (int size = 0; size < allSourceStart.size(); ++size) {
+ for (size_t size = 0; size < allSourceStart.size(); ++size) {
Coll &coll = allSourceStart[size];
Moses::RemoveAllInColl(coll);
}
@@ -48,8 +48,8 @@ void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
targetStart, targetEnd,
params);
- pair<Coll::iterator, bool> inserted = coll.insert(cp);
- assert(inserted.second);
+ assert(coll.find(cp) == coll.end());
+ coll.insert(cp);
}
const ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd) const
@@ -69,10 +69,10 @@ ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceE
std::string ConsistentPhrases::Debug() const
{
std::stringstream out;
- for (int start = 0; start < m_coll.size(); ++start) {
+ for (size_t start = 0; start < m_coll.size(); ++start) {
const std::vector<Coll> &allSourceStart = m_coll[start];
- for (int size = 0; size < allSourceStart.size(); ++size) {
+ for (size_t size = 0; size < allSourceStart.size(); ++size) {
const Coll &coll = allSourceStart[size];
Coll::const_iterator iter;
@@ -89,9 +89,9 @@ std::string ConsistentPhrases::Debug() const
void ConsistentPhrases::AddHieroNonTerms(const Parameter &params)
{
// add [X] labels everywhere
- for (int i = 0; i < m_coll.size(); ++i) {
+ for (size_t i = 0; i < m_coll.size(); ++i) {
vector<Coll> &inner = m_coll[i];
- for (int j = 0; j < inner.size(); ++j) {
+ for (size_t j = 0; j < inner.size(); ++j) {
ConsistentPhrases::Coll &coll = inner[j];
ConsistentPhrases::Coll::iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
diff --git a/phrase-extract/extract-mixed-syntax/Rule.cpp b/phrase-extract/extract-mixed-syntax/Rule.cpp
index 83060caba..5888f2160 100644
--- a/phrase-extract/extract-mixed-syntax/Rule.cpp
+++ b/phrase-extract/extract-mixed-syntax/Rule.cpp
@@ -27,9 +27,9 @@ Rule::Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence)
Rule::Rule(const Rule &copy, const NonTerm &nonTerm)
:m_lhs(copy.m_lhs)
,m_alignedSentence(copy.m_alignedSentence)
+ ,m_nonterms(copy.m_nonterms)
,m_isValid(true)
,m_canRecurse(true)
- ,m_nonterms(copy.m_nonterms)
{
m_nonterms.push_back(&nonTerm);
CreateSource();
@@ -225,8 +225,6 @@ void Rule::NonTermContext(int sourceTarget, int factor, size_t ntInd, const Cons
void Rule::Prevalidate(const Parameter &params)
{
- const ConsistentPhrase &cp = m_lhs.GetConsistentPhrase();
-
// check number of source symbols in rule
if (m_source.GetSize() > params.maxSymbolsSource) {
m_isValid = false;
@@ -432,7 +430,7 @@ void Rule::Prevalidate(const Parameter &params)
// min/max span per scope
if (params.scopeSpan.size()) {
- int scope = GetScope(params);
+ size_t scope = GetScope(params);
if (scope >= params.scopeSpan.size()) {
// no constraint on it. It's ok
} else {
@@ -574,9 +572,6 @@ void Rule::CreateTarget(const Parameter &params)
void Rule::CreateAlignments()
{
- int sourceStart = GetConsistentPhrase().corners[0];
- int targetStart = GetConsistentPhrase().corners[2];
-
for (size_t sourcePos = 0; sourcePos < m_source.GetSize(); ++sourcePos) {
const RuleSymbol *symbol = m_source[sourcePos];
if (!symbol->IsNonTerm()) {
diff --git a/phrase-extract/extract-mixed-syntax/pugiconfig.hpp b/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
index c2196715c..5a63fd488 100644
--- a/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
+++ b/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
@@ -57,7 +57,7 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
diff --git a/phrase-extract/extract-mixed-syntax/pugixml.cpp b/phrase-extract/extract-mixed-syntax/pugixml.cpp
index 8a7e97f14..5076e3cc0 100644
--- a/phrase-extract/extract-mixed-syntax/pugixml.cpp
+++ b/phrase-extract/extract-mixed-syntax/pugixml.cpp
@@ -16,17 +16,17 @@
#include "pugixml.hpp"
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <wchar.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cassert>
+#include <cwchar>
#ifndef PUGIXML_NO_XPATH
-# include <math.h>
+# include <cmath>
# include <float.h>
# ifdef PUGIXML_NO_EXCEPTIONS
-# include <setjmp.h>
+# include <csetjmp>
# endif
#endif
@@ -50,7 +50,7 @@
#endif
#ifdef __INTEL_COMPILER
-# pragma warning(disable: 177) // function was declared but never referenced
+# pragma warning(disable: 177) // function was declared but never referenced
# pragma warning(disable: 279) // controlling expression is constant
# pragma warning(disable: 1478 1786) // function was declared "deprecated"
# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
diff --git a/phrase-extract/extract-mixed-syntax/pugixml.hpp b/phrase-extract/extract-mixed-syntax/pugixml.hpp
index 77b4dcf47..cbc527bef 100644
--- a/phrase-extract/extract-mixed-syntax/pugixml.hpp
+++ b/phrase-extract/extract-mixed-syntax/pugixml.hpp
@@ -23,7 +23,7 @@
#define HEADER_PUGIXML_HPP
// Include stddef.h for size_t and ptrdiff_t
-#include <stddef.h>
+#include <cstddef>
// Include exception header for XPath
#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
@@ -74,1166 +74,1157 @@
namespace pugi
{
- // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
- typedef PUGIXML_CHAR char_t;
+// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
+typedef PUGIXML_CHAR char_t;
#ifndef PUGIXML_NO_STL
- // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
- typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
+typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
#endif
}
// The PugiXML namespace
namespace pugi
{
- // Tree node types
- enum xml_node_type
- {
- node_null, // Empty (null) node handle
- node_document, // A document tree's absolute root
- node_element, // Element tag, i.e. '<node/>'
- node_pcdata, // Plain character data, i.e. 'text'
- node_cdata, // Character data, i.e. '<![CDATA[text]]>'
- node_comment, // Comment tag, i.e. '<!-- text -->'
- node_pi, // Processing instruction, i.e. '<?name?>'
- node_declaration, // Document declaration, i.e. '<?xml version="1.0"?>'
- node_doctype // Document type declaration, i.e. '<!DOCTYPE doc>'
- };
-
- // Parsing options
-
- // Minimal parsing mode (equivalent to turning all other flags off).
- // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
- const unsigned int parse_minimal = 0x0000;
-
- // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
- const unsigned int parse_pi = 0x0001;
-
- // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
- const unsigned int parse_comments = 0x0002;
-
- // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
- const unsigned int parse_cdata = 0x0004;
-
- // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
- // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
- const unsigned int parse_ws_pcdata = 0x0008;
-
- // This flag determines if character and entity references are expanded during parsing. This flag is on by default.
- const unsigned int parse_escapes = 0x0010;
-
- // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
- const unsigned int parse_eol = 0x0020;
-
- // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
- const unsigned int parse_wconv_attribute = 0x0040;
-
- // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
- const unsigned int parse_wnorm_attribute = 0x0080;
-
- // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
- const unsigned int parse_declaration = 0x0100;
-
- // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
- const unsigned int parse_doctype = 0x0200;
-
- // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
- // of whitespace is added to the DOM tree.
- // This flag is off by default; turning it on may result in slower parsing and more memory consumption.
- const unsigned int parse_ws_pcdata_single = 0x0400;
-
- // The default parsing mode.
- // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
- // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
- const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
-
- // The full parsing mode.
- // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
- // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
- const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
-
- // These flags determine the encoding of input data for XML document
- enum xml_encoding
- {
- encoding_auto, // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
- encoding_utf8, // UTF8 encoding
- encoding_utf16_le, // Little-endian UTF16
- encoding_utf16_be, // Big-endian UTF16
- encoding_utf16, // UTF16 with native endianness
- encoding_utf32_le, // Little-endian UTF32
- encoding_utf32_be, // Big-endian UTF32
- encoding_utf32, // UTF32 with native endianness
- encoding_wchar, // The same encoding wchar_t has (either UTF16 or UTF32)
- encoding_latin1
- };
-
- // Formatting flags
-
- // Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
- const unsigned int format_indent = 0x01;
-
- // Write encoding-specific BOM to the output stream. This flag is off by default.
- const unsigned int format_write_bom = 0x02;
-
- // Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
- const unsigned int format_raw = 0x04;
-
- // Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
- const unsigned int format_no_declaration = 0x08;
-
- // Don't escape attribute values and PCDATA contents. This flag is off by default.
- const unsigned int format_no_escapes = 0x10;
-
- // Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
- const unsigned int format_save_file_text = 0x20;
-
- // The default set of formatting flags.
- // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
- const unsigned int format_default = format_indent;
-
- // Forward declarations
- struct xml_attribute_struct;
- struct xml_node_struct;
-
- class xml_node_iterator;
- class xml_attribute_iterator;
- class xml_named_node_iterator;
-
- class xml_tree_walker;
-
- class xml_node;
-
- class xml_text;
-
- #ifndef PUGIXML_NO_XPATH
- class xpath_node;
- class xpath_node_set;
- class xpath_query;
- class xpath_variable_set;
- #endif
-
- // Range-based for loop support
- template <typename It> class xml_object_range
- {
- public:
- typedef It const_iterator;
-
- xml_object_range(It b, It e): _begin(b), _end(e)
- {
- }
-
- It begin() const { return _begin; }
- It end() const { return _end; }
-
- private:
- It _begin, _end;
- };
-
- // Writer interface for node printing (see xml_node::print)
- class PUGIXML_CLASS xml_writer
- {
- public:
- virtual ~xml_writer() {}
-
- // Write memory chunk into stream/file/whatever
- virtual void write(const void* data, size_t size) = 0;
- };
-
- // xml_writer implementation for FILE*
- class PUGIXML_CLASS xml_writer_file: public xml_writer
- {
- public:
- // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
- xml_writer_file(void* file);
-
- virtual void write(const void* data, size_t size);
-
- private:
- void* file;
- };
-
- #ifndef PUGIXML_NO_STL
- // xml_writer implementation for streams
- class PUGIXML_CLASS xml_writer_stream: public xml_writer
- {
- public:
- // Construct writer from an output stream object
- xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
- xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
-
- virtual void write(const void* data, size_t size);
-
- private:
- std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
- std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
- };
- #endif
-
- // A light-weight handle for manipulating attributes in DOM tree
- class PUGIXML_CLASS xml_attribute
- {
- friend class xml_attribute_iterator;
- friend class xml_node;
-
- private:
- xml_attribute_struct* _attr;
-
- typedef void (*unspecified_bool_type)(xml_attribute***);
-
- public:
- // Default constructor. Constructs an empty attribute.
- xml_attribute();
-
- // Constructs attribute from internal pointer
- explicit xml_attribute(xml_attribute_struct* attr);
-
- // Safe bool conversion operator
- operator unspecified_bool_type() const;
-
- // Borland C++ workaround
- bool operator!() const;
-
- // Comparison operators (compares wrapped attribute pointers)
- bool operator==(const xml_attribute& r) const;
- bool operator!=(const xml_attribute& r) const;
- bool operator<(const xml_attribute& r) const;
- bool operator>(const xml_attribute& r) const;
- bool operator<=(const xml_attribute& r) const;
- bool operator>=(const xml_attribute& r) const;
-
- // Check if attribute is empty
- bool empty() const;
-
- // Get attribute name/value, or "" if attribute is empty
- const char_t* name() const;
- const char_t* value() const;
-
- // Get attribute value, or the default value if attribute is empty
- const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
-
- // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
- int as_int(int def = 0) const;
- unsigned int as_uint(unsigned int def = 0) const;
- double as_double(double def = 0) const;
- float as_float(float def = 0) const;
-
- // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
- bool as_bool(bool def = false) const;
-
- // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
- bool set_name(const char_t* rhs);
- bool set_value(const char_t* rhs);
-
- // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
- bool set_value(int rhs);
- bool set_value(unsigned int rhs);
- bool set_value(double rhs);
- bool set_value(bool rhs);
-
- // Set attribute value (equivalent to set_value without error checking)
- xml_attribute& operator=(const char_t* rhs);
- xml_attribute& operator=(int rhs);
- xml_attribute& operator=(unsigned int rhs);
- xml_attribute& operator=(double rhs);
- xml_attribute& operator=(bool rhs);
-
- // Get next/previous attribute in the attribute list of the parent node
- xml_attribute next_attribute() const;
- xml_attribute previous_attribute() const;
-
- // Get hash value (unique for handles to the same object)
- size_t hash_value() const;
-
- // Get internal pointer
- xml_attribute_struct* internal_object() const;
- };
+// Tree node types
+enum xml_node_type {
+ node_null, // Empty (null) node handle
+ node_document, // A document tree's absolute root
+ node_element, // Element tag, i.e. '<node/>'
+ node_pcdata, // Plain character data, i.e. 'text'
+ node_cdata, // Character data, i.e. '<![CDATA[text]]>'
+ node_comment, // Comment tag, i.e. '<!-- text -->'
+ node_pi, // Processing instruction, i.e. '<?name?>'
+ node_declaration, // Document declaration, i.e. '<?xml version="1.0"?>'
+ node_doctype // Document type declaration, i.e. '<!DOCTYPE doc>'
+};
+
+// Parsing options
+
+// Minimal parsing mode (equivalent to turning all other flags off).
+// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+const unsigned int parse_minimal = 0x0000;
+
+// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+const unsigned int parse_pi = 0x0001;
+
+// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+const unsigned int parse_comments = 0x0002;
+
+// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+const unsigned int parse_cdata = 0x0004;
+
+// This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
+// This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+const unsigned int parse_ws_pcdata = 0x0008;
+
+// This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+const unsigned int parse_escapes = 0x0010;
+
+// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+const unsigned int parse_eol = 0x0020;
+
+// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+const unsigned int parse_wconv_attribute = 0x0040;
+
+// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+const unsigned int parse_wnorm_attribute = 0x0080;
+
+// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+const unsigned int parse_declaration = 0x0100;
+
+// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+const unsigned int parse_doctype = 0x0200;
+
+// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+// of whitespace is added to the DOM tree.
+// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+const unsigned int parse_ws_pcdata_single = 0x0400;
+
+// The default parsing mode.
+// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
+
+// The full parsing mode.
+// Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
+
+// These flags determine the encoding of input data for XML document
+enum xml_encoding {
+ encoding_auto, // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+ encoding_utf8, // UTF8 encoding
+ encoding_utf16_le, // Little-endian UTF16
+ encoding_utf16_be, // Big-endian UTF16
+ encoding_utf16, // UTF16 with native endianness
+ encoding_utf32_le, // Little-endian UTF32
+ encoding_utf32_be, // Big-endian UTF32
+ encoding_utf32, // UTF32 with native endianness
+ encoding_wchar, // The same encoding wchar_t has (either UTF16 or UTF32)
+ encoding_latin1
+};
+
+// Formatting flags
+
+// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
+const unsigned int format_indent = 0x01;
+
+// Write encoding-specific BOM to the output stream. This flag is off by default.
+const unsigned int format_write_bom = 0x02;
+
+// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+const unsigned int format_raw = 0x04;
+
+// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+const unsigned int format_no_declaration = 0x08;
+
+// Don't escape attribute values and PCDATA contents. This flag is off by default.
+const unsigned int format_no_escapes = 0x10;
+
+// Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+const unsigned int format_save_file_text = 0x20;
+
+// The default set of formatting flags.
+// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+const unsigned int format_default = format_indent;
+
+// Forward declarations
+struct xml_attribute_struct;
+struct xml_node_struct;
+
+class xml_node_iterator;
+class xml_attribute_iterator;
+class xml_named_node_iterator;
+
+class xml_tree_walker;
+
+class xml_node;
+
+class xml_text;
+
+#ifndef PUGIXML_NO_XPATH
+class xpath_node;
+class xpath_node_set;
+class xpath_query;
+class xpath_variable_set;
+#endif
+
+// Range-based for loop support
+template <typename It> class xml_object_range
+{
+public:
+ typedef It const_iterator;
+
+ xml_object_range(It b, It e): _begin(b), _end(e) {
+ }
+
+ It begin() const {
+ return _begin;
+ }
+ It end() const {
+ return _end;
+ }
+
+private:
+ It _begin, _end;
+};
+
+// Writer interface for node printing (see xml_node::print)
+class PUGIXML_CLASS xml_writer
+{
+public:
+ virtual ~xml_writer() {}
+
+ // Write memory chunk into stream/file/whatever
+ virtual void write(const void* data, size_t size) = 0;
+};
+
+// xml_writer implementation for FILE*
+class PUGIXML_CLASS xml_writer_file: public xml_writer
+{
+public:
+ // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
+ xml_writer_file(void* file);
+
+ virtual void write(const void* data, size_t size);
+
+private:
+ void* file;
+};
+
+#ifndef PUGIXML_NO_STL
+// xml_writer implementation for streams
+class PUGIXML_CLASS xml_writer_stream: public xml_writer
+{
+public:
+ // Construct writer from an output stream object
+ xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+ xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+ virtual void write(const void* data, size_t size);
+
+private:
+ std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
+ std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
+};
+#endif
+
+// A light-weight handle for manipulating attributes in DOM tree
+class PUGIXML_CLASS xml_attribute
+{
+ friend class xml_attribute_iterator;
+ friend class xml_node;
+
+private:
+ xml_attribute_struct* _attr;
+
+ typedef void (*unspecified_bool_type)(xml_attribute***);
+
+public:
+ // Default constructor. Constructs an empty attribute.
+ xml_attribute();
+
+ // Constructs attribute from internal pointer
+ explicit xml_attribute(xml_attribute_struct* attr);
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Comparison operators (compares wrapped attribute pointers)
+ bool operator==(const xml_attribute& r) const;
+ bool operator!=(const xml_attribute& r) const;
+ bool operator<(const xml_attribute& r) const;
+ bool operator>(const xml_attribute& r) const;
+ bool operator<=(const xml_attribute& r) const;
+ bool operator>=(const xml_attribute& r) const;
+
+ // Check if attribute is empty
+ bool empty() const;
+
+ // Get attribute name/value, or "" if attribute is empty
+ const char_t* name() const;
+ const char_t* value() const;
+
+ // Get attribute value, or the default value if attribute is empty
+ const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+ // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+ int as_int(int def = 0) const;
+ unsigned int as_uint(unsigned int def = 0) const;
+ double as_double(double def = 0) const;
+ float as_float(float def = 0) const;
+
+ // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+ bool as_bool(bool def = false) const;
+
+ // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+ bool set_name(const char_t* rhs);
+ bool set_value(const char_t* rhs);
+
+ // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+ bool set_value(int rhs);
+ bool set_value(unsigned int rhs);
+ bool set_value(double rhs);
+ bool set_value(bool rhs);
+
+ // Set attribute value (equivalent to set_value without error checking)
+ xml_attribute& operator=(const char_t* rhs);
+ xml_attribute& operator=(int rhs);
+ xml_attribute& operator=(unsigned int rhs);
+ xml_attribute& operator=(double rhs);
+ xml_attribute& operator=(bool rhs);
+
+ // Get next/previous attribute in the attribute list of the parent node
+ xml_attribute next_attribute() const;
+ xml_attribute previous_attribute() const;
+
+ // Get hash value (unique for handles to the same object)
+ size_t hash_value() const;
+
+ // Get internal pointer
+ xml_attribute_struct* internal_object() const;
+};
#ifdef __BORLANDC__
- // Borland C++ workaround
- bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
- bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+// Borland C++ workaround
+bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
#endif
- // A light-weight handle for manipulating nodes in DOM tree
- class PUGIXML_CLASS xml_node
- {
- friend class xml_attribute_iterator;
- friend class xml_node_iterator;
- friend class xml_named_node_iterator;
-
- protected:
- xml_node_struct* _root;
-
- typedef void (*unspecified_bool_type)(xml_node***);
-
- public:
- // Default constructor. Constructs an empty node.
- xml_node();
-
- // Constructs node from internal pointer
- explicit xml_node(xml_node_struct* p);
-
- // Safe bool conversion operator
- operator unspecified_bool_type() const;
-
- // Borland C++ workaround
- bool operator!() const;
-
- // Comparison operators (compares wrapped node pointers)
- bool operator==(const xml_node& r) const;
- bool operator!=(const xml_node& r) const;
- bool operator<(const xml_node& r) const;
- bool operator>(const xml_node& r) const;
- bool operator<=(const xml_node& r) const;
- bool operator>=(const xml_node& r) const;
-
- // Check if node is empty.
- bool empty() const;
-
- // Get node type
- xml_node_type type() const;
-
- // Get node name/value, or "" if node is empty or it has no name/value
- const char_t* name() const;
- const char_t* value() const;
-
- // Get attribute list
- xml_attribute first_attribute() const;
- xml_attribute last_attribute() const;
-
- // Get children list
- xml_node first_child() const;
- xml_node last_child() const;
-
- // Get next/previous sibling in the children list of the parent node
- xml_node next_sibling() const;
- xml_node previous_sibling() const;
-
- // Get parent node
- xml_node parent() const;
-
- // Get root of DOM tree this node belongs to
- xml_node root() const;
-
- // Get text object for the current node
- xml_text text() const;
-
- // Get child, attribute or next/previous sibling with the specified name
- xml_node child(const char_t* name) const;
- xml_attribute attribute(const char_t* name) const;
- xml_node next_sibling(const char_t* name) const;
- xml_node previous_sibling(const char_t* name) const;
-
- // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
- const char_t* child_value() const;
-
- // Get child value of child with specified name. Equivalent to child(name).child_value().
- const char_t* child_value(const char_t* name) const;
-
- // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
- bool set_name(const char_t* rhs);
- bool set_value(const char_t* rhs);
-
- // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
- xml_attribute append_attribute(const char_t* name);
- xml_attribute prepend_attribute(const char_t* name);
- xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
- xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
-
- // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
- xml_attribute append_copy(const xml_attribute& proto);
- xml_attribute prepend_copy(const xml_attribute& proto);
- xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
- xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
-
- // Add child node with specified type. Returns added node, or empty node on errors.
- xml_node append_child(xml_node_type type = node_element);
- xml_node prepend_child(xml_node_type type = node_element);
- xml_node insert_child_after(xml_node_type type, const xml_node& node);
- xml_node insert_child_before(xml_node_type type, const xml_node& node);
-
- // Add child element with specified name. Returns added node, or empty node on errors.
- xml_node append_child(const char_t* name);
- xml_node prepend_child(const char_t* name);
- xml_node insert_child_after(const char_t* name, const xml_node& node);
- xml_node insert_child_before(const char_t* name, const xml_node& node);
-
- // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
- xml_node append_copy(const xml_node& proto);
- xml_node prepend_copy(const xml_node& proto);
- xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
- xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
-
- // Remove specified attribute
- bool remove_attribute(const xml_attribute& a);
- bool remove_attribute(const char_t* name);
-
- // Remove specified child
- bool remove_child(const xml_node& n);
- bool remove_child(const char_t* name);
-
- // Find attribute using predicate. Returns first attribute for which predicate returned true.
- template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
- {
- if (!_root) return xml_attribute();
-
- for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
- if (pred(attrib))
- return attrib;
-
- return xml_attribute();
- }
-
- // Find child node using predicate. Returns first child for which predicate returned true.
- template <typename Predicate> xml_node find_child(Predicate pred) const
- {
- if (!_root) return xml_node();
-
- for (xml_node node = first_child(); node; node = node.next_sibling())
- if (pred(node))
- return node;
-
- return xml_node();
- }
-
- // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
- template <typename Predicate> xml_node find_node(Predicate pred) const
- {
- if (!_root) return xml_node();
-
- xml_node cur = first_child();
-
- while (cur._root && cur._root != _root)
- {
- if (pred(cur)) return cur;
-
- if (cur.first_child()) cur = cur.first_child();
- else if (cur.next_sibling()) cur = cur.next_sibling();
- else
- {
- while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
-
- if (cur._root != _root) cur = cur.next_sibling();
- }
- }
-
- return xml_node();
- }
-
- // Find child node by attribute name/value
- xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
- xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
-
- #ifndef PUGIXML_NO_STL
- // Get the absolute node path from root as a text string.
- string_t path(char_t delimiter = '/') const;
- #endif
-
- // Search for a node by path consisting of node names and . or .. elements.
- xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
-
- // Recursively traverse subtree with xml_tree_walker
- bool traverse(xml_tree_walker& walker);
-
- #ifndef PUGIXML_NO_XPATH
- // Select single node by evaluating XPath query. Returns first node from the resulting node set.
- xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
- xpath_node select_single_node(const xpath_query& query) const;
-
- // Select node set by evaluating XPath query
- xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
- xpath_node_set select_nodes(const xpath_query& query) const;
- #endif
-
- // Print subtree using a writer object
- void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
-
- #ifndef PUGIXML_NO_STL
- // Print subtree to stream
- void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
- void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
- #endif
-
- // Child nodes iterators
- typedef xml_node_iterator iterator;
-
- iterator begin() const;
- iterator end() const;
-
- // Attribute iterators
- typedef xml_attribute_iterator attribute_iterator;
-
- attribute_iterator attributes_begin() const;
- attribute_iterator attributes_end() const;
-
- // Range-based for support
- xml_object_range<xml_node_iterator> children() const;
- xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
- xml_object_range<xml_attribute_iterator> attributes() const;
-
- // Get node offset in parsed file/string (in char_t units) for debugging purposes
- ptrdiff_t offset_debug() const;
-
- // Get hash value (unique for handles to the same object)
- size_t hash_value() const;
-
- // Get internal pointer
- xml_node_struct* internal_object() const;
- };
+// A light-weight handle for manipulating nodes in DOM tree
+class PUGIXML_CLASS xml_node
+{
+ friend class xml_attribute_iterator;
+ friend class xml_node_iterator;
+ friend class xml_named_node_iterator;
+
+protected:
+ xml_node_struct* _root;
+
+ typedef void (*unspecified_bool_type)(xml_node***);
+
+public:
+ // Default constructor. Constructs an empty node.
+ xml_node();
+
+ // Constructs node from internal pointer
+ explicit xml_node(xml_node_struct* p);
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Comparison operators (compares wrapped node pointers)
+ bool operator==(const xml_node& r) const;
+ bool operator!=(const xml_node& r) const;
+ bool operator<(const xml_node& r) const;
+ bool operator>(const xml_node& r) const;
+ bool operator<=(const xml_node& r) const;
+ bool operator>=(const xml_node& r) const;
+
+ // Check if node is empty.
+ bool empty() const;
+
+ // Get node type
+ xml_node_type type() const;
+
+ // Get node name/value, or "" if node is empty or it has no name/value
+ const char_t* name() const;
+ const char_t* value() const;
+
+ // Get attribute list
+ xml_attribute first_attribute() const;
+ xml_attribute last_attribute() const;
+
+ // Get children list
+ xml_node first_child() const;
+ xml_node last_child() const;
+
+ // Get next/previous sibling in the children list of the parent node
+ xml_node next_sibling() const;
+ xml_node previous_sibling() const;
+
+ // Get parent node
+ xml_node parent() const;
+
+ // Get root of DOM tree this node belongs to
+ xml_node root() const;
+
+ // Get text object for the current node
+ xml_text text() const;
+
+ // Get child, attribute or next/previous sibling with the specified name
+ xml_node child(const char_t* name) const;
+ xml_attribute attribute(const char_t* name) const;
+ xml_node next_sibling(const char_t* name) const;
+ xml_node previous_sibling(const char_t* name) const;
+
+ // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+ const char_t* child_value() const;
+
+ // Get child value of child with specified name. Equivalent to child(name).child_value().
+ const char_t* child_value(const char_t* name) const;
+
+ // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+ bool set_name(const char_t* rhs);
+ bool set_value(const char_t* rhs);
+
+ // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+ xml_attribute append_attribute(const char_t* name);
+ xml_attribute prepend_attribute(const char_t* name);
+ xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+ xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+ // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+ xml_attribute append_copy(const xml_attribute& proto);
+ xml_attribute prepend_copy(const xml_attribute& proto);
+ xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+ xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+ // Add child node with specified type. Returns added node, or empty node on errors.
+ xml_node append_child(xml_node_type type = node_element);
+ xml_node prepend_child(xml_node_type type = node_element);
+ xml_node insert_child_after(xml_node_type type, const xml_node& node);
+ xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+ // Add child element with specified name. Returns added node, or empty node on errors.
+ xml_node append_child(const char_t* name);
+ xml_node prepend_child(const char_t* name);
+ xml_node insert_child_after(const char_t* name, const xml_node& node);
+ xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+ // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+ xml_node append_copy(const xml_node& proto);
+ xml_node prepend_copy(const xml_node& proto);
+ xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+ xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+ // Remove specified attribute
+ bool remove_attribute(const xml_attribute& a);
+ bool remove_attribute(const char_t* name);
+
+ // Remove specified child
+ bool remove_child(const xml_node& n);
+ bool remove_child(const char_t* name);
+
+ // Find attribute using predicate. Returns first attribute for which predicate returned true.
+ template <typename Predicate> xml_attribute find_attribute(Predicate pred) const {
+ if (!_root) return xml_attribute();
+
+ for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+ if (pred(attrib))
+ return attrib;
+
+ return xml_attribute();
+ }
+
+ // Find child node using predicate. Returns first child for which predicate returned true.
+ template <typename Predicate> xml_node find_child(Predicate pred) const {
+ if (!_root) return xml_node();
+
+ for (xml_node node = first_child(); node; node = node.next_sibling())
+ if (pred(node))
+ return node;
+
+ return xml_node();
+ }
+
+ // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
+ template <typename Predicate> xml_node find_node(Predicate pred) const {
+ if (!_root) return xml_node();
+
+ xml_node cur = first_child();
+
+ while (cur._root && cur._root != _root) {
+ if (pred(cur)) return cur;
+
+ if (cur.first_child()) cur = cur.first_child();
+ else if (cur.next_sibling()) cur = cur.next_sibling();
+ else {
+ while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
+
+ if (cur._root != _root) cur = cur.next_sibling();
+ }
+ }
+
+ return xml_node();
+ }
+
+ // Find child node by attribute name/value
+ xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+ xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+#ifndef PUGIXML_NO_STL
+ // Get the absolute node path from root as a text string.
+ string_t path(char_t delimiter = '/') const;
+#endif
+
+ // Search for a node by path consisting of node names and . or .. elements.
+ xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+ // Recursively traverse subtree with xml_tree_walker
+ bool traverse(xml_tree_walker& walker);
+
+#ifndef PUGIXML_NO_XPATH
+ // Select single node by evaluating XPath query. Returns first node from the resulting node set.
+ xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+ xpath_node select_single_node(const xpath_query& query) const;
+
+ // Select node set by evaluating XPath query
+ xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+ xpath_node_set select_nodes(const xpath_query& query) const;
+#endif
+
+ // Print subtree using a writer object
+ void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+#ifndef PUGIXML_NO_STL
+ // Print subtree to stream
+ void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+ void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+#endif
+
+ // Child nodes iterators
+ typedef xml_node_iterator iterator;
+
+ iterator begin() const;
+ iterator end() const;
+
+ // Attribute iterators
+ typedef xml_attribute_iterator attribute_iterator;
+
+ attribute_iterator attributes_begin() const;
+ attribute_iterator attributes_end() const;
+
+ // Range-based for support
+ xml_object_range<xml_node_iterator> children() const;
+ xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+ xml_object_range<xml_attribute_iterator> attributes() const;
+
+ // Get node offset in parsed file/string (in char_t units) for debugging purposes
+ ptrdiff_t offset_debug() const;
+
+ // Get hash value (unique for handles to the same object)
+ size_t hash_value() const;
+
+ // Get internal pointer
+ xml_node_struct* internal_object() const;
+};
#ifdef __BORLANDC__
- // Borland C++ workaround
- bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
- bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+// Borland C++ workaround
+bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
#endif
- // A helper for working with text inside PCDATA nodes
- class PUGIXML_CLASS xml_text
- {
- friend class xml_node;
+// A helper for working with text inside PCDATA nodes
+class PUGIXML_CLASS xml_text
+{
+ friend class xml_node;
- xml_node_struct* _root;
+ xml_node_struct* _root;
- typedef void (*unspecified_bool_type)(xml_text***);
+ typedef void (*unspecified_bool_type)(xml_text***);
- explicit xml_text(xml_node_struct* root);
+ explicit xml_text(xml_node_struct* root);
- xml_node_struct* _data_new();
- xml_node_struct* _data() const;
+ xml_node_struct* _data_new();
+ xml_node_struct* _data() const;
- public:
- // Default constructor. Constructs an empty object.
- xml_text();
+public:
+ // Default constructor. Constructs an empty object.
+ xml_text();
- // Safe bool conversion operator
- operator unspecified_bool_type() const;
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
- // Borland C++ workaround
- bool operator!() const;
+ // Borland C++ workaround
+ bool operator!() const;
- // Check if text object is empty
- bool empty() const;
+ // Check if text object is empty
+ bool empty() const;
- // Get text, or "" if object is empty
- const char_t* get() const;
+ // Get text, or "" if object is empty
+ const char_t* get() const;
- // Get text, or the default value if object is empty
- const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+ // Get text, or the default value if object is empty
+ const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
- // Get text as a number, or the default value if conversion did not succeed or object is empty
- int as_int(int def = 0) const;
- unsigned int as_uint(unsigned int def = 0) const;
- double as_double(double def = 0) const;
- float as_float(float def = 0) const;
+ // Get text as a number, or the default value if conversion did not succeed or object is empty
+ int as_int(int def = 0) const;
+ unsigned int as_uint(unsigned int def = 0) const;
+ double as_double(double def = 0) const;
+ float as_float(float def = 0) const;
- // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
- bool as_bool(bool def = false) const;
+ // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+ bool as_bool(bool def = false) const;
- // Set text (returns false if object is empty or there is not enough memory)
- bool set(const char_t* rhs);
+ // Set text (returns false if object is empty or there is not enough memory)
+ bool set(const char_t* rhs);
- // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
- bool set(int rhs);
- bool set(unsigned int rhs);
- bool set(double rhs);
- bool set(bool rhs);
+ // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+ bool set(int rhs);
+ bool set(unsigned int rhs);
+ bool set(double rhs);
+ bool set(bool rhs);
- // Set text (equivalent to set without error checking)
- xml_text& operator=(const char_t* rhs);
- xml_text& operator=(int rhs);
- xml_text& operator=(unsigned int rhs);
- xml_text& operator=(double rhs);
- xml_text& operator=(bool rhs);
+ // Set text (equivalent to set without error checking)
+ xml_text& operator=(const char_t* rhs);
+ xml_text& operator=(int rhs);
+ xml_text& operator=(unsigned int rhs);
+ xml_text& operator=(double rhs);
+ xml_text& operator=(bool rhs);
- // Get the data node (node_pcdata or node_cdata) for this object
- xml_node data() const;
- };
+ // Get the data node (node_pcdata or node_cdata) for this object
+ xml_node data() const;
+};
#ifdef __BORLANDC__
- // Borland C++ workaround
- bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
- bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+// Borland C++ workaround
+bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
#endif
- // Child node iterator (a bidirectional iterator over a collection of xml_node)
- class PUGIXML_CLASS xml_node_iterator
- {
- friend class xml_node;
+// Child node iterator (a bidirectional iterator over a collection of xml_node)
+class PUGIXML_CLASS xml_node_iterator
+{
+ friend class xml_node;
- private:
- mutable xml_node _wrap;
- xml_node _parent;
+private:
+ mutable xml_node _wrap;
+ xml_node _parent;
- xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
+ xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
- public:
- // Iterator traits
- typedef ptrdiff_t difference_type;
- typedef xml_node value_type;
- typedef xml_node* pointer;
- typedef xml_node& reference;
+public:
+ // Iterator traits
+ typedef ptrdiff_t difference_type;
+ typedef xml_node value_type;
+ typedef xml_node* pointer;
+ typedef xml_node& reference;
- #ifndef PUGIXML_NO_STL
- typedef std::bidirectional_iterator_tag iterator_category;
- #endif
+#ifndef PUGIXML_NO_STL
+ typedef std::bidirectional_iterator_tag iterator_category;
+#endif
- // Default constructor
- xml_node_iterator();
+ // Default constructor
+ xml_node_iterator();
- // Construct an iterator which points to the specified node
- xml_node_iterator(const xml_node& node);
+ // Construct an iterator which points to the specified node
+ xml_node_iterator(const xml_node& node);
- // Iterator operators
- bool operator==(const xml_node_iterator& rhs) const;
- bool operator!=(const xml_node_iterator& rhs) const;
-
- xml_node& operator*() const;
- xml_node* operator->() const;
-
- const xml_node_iterator& operator++();
- xml_node_iterator operator++(int);
+ // Iterator operators
+ bool operator==(const xml_node_iterator& rhs) const;
+ bool operator!=(const xml_node_iterator& rhs) const;
- const xml_node_iterator& operator--();
- xml_node_iterator operator--(int);
- };
+ xml_node& operator*() const;
+ xml_node* operator->() const;
- // Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
- class PUGIXML_CLASS xml_attribute_iterator
- {
- friend class xml_node;
-
- private:
- mutable xml_attribute _wrap;
- xml_node _parent;
-
- xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
-
- public:
- // Iterator traits
- typedef ptrdiff_t difference_type;
- typedef xml_attribute value_type;
- typedef xml_attribute* pointer;
- typedef xml_attribute& reference;
-
- #ifndef PUGIXML_NO_STL
- typedef std::bidirectional_iterator_tag iterator_category;
- #endif
-
- // Default constructor
- xml_attribute_iterator();
-
- // Construct an iterator which points to the specified attribute
- xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
-
- // Iterator operators
- bool operator==(const xml_attribute_iterator& rhs) const;
- bool operator!=(const xml_attribute_iterator& rhs) const;
-
- xml_attribute& operator*() const;
- xml_attribute* operator->() const;
-
- const xml_attribute_iterator& operator++();
- xml_attribute_iterator operator++(int);
-
- const xml_attribute_iterator& operator--();
- xml_attribute_iterator operator--(int);
- };
-
- // Named node range helper
- class xml_named_node_iterator
- {
- public:
- // Iterator traits
- typedef ptrdiff_t difference_type;
- typedef xml_node value_type;
- typedef xml_node* pointer;
- typedef xml_node& reference;
-
- #ifndef PUGIXML_NO_STL
- typedef std::forward_iterator_tag iterator_category;
- #endif
-
- // Default constructor
- xml_named_node_iterator();
-
- // Construct an iterator which points to the specified node
- xml_named_node_iterator(const xml_node& node, const char_t* name);
-
- // Iterator operators
- bool operator==(const xml_named_node_iterator& rhs) const;
- bool operator!=(const xml_named_node_iterator& rhs) const;
-
- xml_node& operator*() const;
- xml_node* operator->() const;
-
- const xml_named_node_iterator& operator++();
- xml_named_node_iterator operator++(int);
-
- private:
- mutable xml_node _node;
- const char_t* _name;
- };
-
- // Abstract tree walker class (see xml_node::traverse)
- class PUGIXML_CLASS xml_tree_walker
- {
- friend class xml_node;
-
- private:
- int _depth;
-
- protected:
- // Get current traversal depth
- int depth() const;
-
- public:
- xml_tree_walker();
- virtual ~xml_tree_walker();
-
- // Callback that is called when traversal begins
- virtual bool begin(xml_node& node);
-
- // Callback that is called for each node traversed
- virtual bool for_each(xml_node& node) = 0;
-
- // Callback that is called when traversal ends
- virtual bool end(xml_node& node);
- };
-
- // Parsing status, returned as part of xml_parse_result object
- enum xml_parse_status
- {
- status_ok = 0, // No error
-
- status_file_not_found, // File was not found during load_file()
- status_io_error, // Error reading from file/stream
- status_out_of_memory, // Could not allocate memory
- status_internal_error, // Internal error occurred
-
- status_unrecognized_tag, // Parser could not determine tag type
-
- status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction
- status_bad_comment, // Parsing error occurred while parsing comment
- status_bad_cdata, // Parsing error occurred while parsing CDATA section
- status_bad_doctype, // Parsing error occurred while parsing document type declaration
- status_bad_pcdata, // Parsing error occurred while parsing PCDATA section
- status_bad_start_element, // Parsing error occurred while parsing start element tag
- status_bad_attribute, // Parsing error occurred while parsing element attribute
- status_bad_end_element, // Parsing error occurred while parsing end element tag
- status_end_element_mismatch // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
- };
-
- // Parsing result
- struct PUGIXML_CLASS xml_parse_result
- {
- // Parsing status (see xml_parse_status)
- xml_parse_status status;
-
- // Last parsed offset (in char_t units from start of input data)
- ptrdiff_t offset;
-
- // Source document encoding
- xml_encoding encoding;
-
- // Default constructor, initializes object to failed state
- xml_parse_result();
-
- // Cast to bool operator
- operator bool() const;
-
- // Get error description
- const char* description() const;
- };
-
- // Document class (DOM tree root)
- class PUGIXML_CLASS xml_document: public xml_node
- {
- private:
- char_t* _buffer;
-
- char _memory[192];
-
- // Non-copyable semantics
- xml_document(const xml_document&);
- const xml_document& operator=(const xml_document&);
-
- void create();
- void destroy();
-
- xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own);
-
- public:
- // Default constructor, makes empty document
- xml_document();
-
- // Destructor, invalidates all node/attribute handles to this document
- ~xml_document();
-
- // Removes all nodes, leaving the empty document
- void reset();
-
- // Removes all nodes, then copies the entire contents of the specified document
- void reset(const xml_document& proto);
-
- #ifndef PUGIXML_NO_STL
- // Load document from stream.
- xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
- xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
- #endif
-
- // Load document from zero-terminated string. No encoding conversions are applied.
- xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
-
- // Load document from file
- xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
- xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+ const xml_node_iterator& operator++();
+ xml_node_iterator operator++(int);
- // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
- xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+ const xml_node_iterator& operator--();
+ xml_node_iterator operator--(int);
+};
- // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
- // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
- xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+// Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
+class PUGIXML_CLASS xml_attribute_iterator
+{
+ friend class xml_node;
- // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
- // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
- xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+private:
+ mutable xml_attribute _wrap;
+ xml_node _parent;
- // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
- void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+ xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
- #ifndef PUGIXML_NO_STL
- // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
- void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
- void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
- #endif
+public:
+ // Iterator traits
+ typedef ptrdiff_t difference_type;
+ typedef xml_attribute value_type;
+ typedef xml_attribute* pointer;
+ typedef xml_attribute& reference;
- // Save XML to file
- bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
- bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+#ifndef PUGIXML_NO_STL
+ typedef std::bidirectional_iterator_tag iterator_category;
+#endif
- // Get document element
- xml_node document_element() const;
- };
+ // Default constructor
+ xml_attribute_iterator();
+
+ // Construct an iterator which points to the specified attribute
+ xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+ // Iterator operators
+ bool operator==(const xml_attribute_iterator& rhs) const;
+ bool operator!=(const xml_attribute_iterator& rhs) const;
+
+ xml_attribute& operator*() const;
+ xml_attribute* operator->() const;
+
+ const xml_attribute_iterator& operator++();
+ xml_attribute_iterator operator++(int);
+
+ const xml_attribute_iterator& operator--();
+ xml_attribute_iterator operator--(int);
+};
+
+// Named node range helper
+class xml_named_node_iterator
+{
+public:
+ // Iterator traits
+ typedef ptrdiff_t difference_type;
+ typedef xml_node value_type;
+ typedef xml_node* pointer;
+ typedef xml_node& reference;
+
+#ifndef PUGIXML_NO_STL
+ typedef std::forward_iterator_tag iterator_category;
+#endif
+
+ // Default constructor
+ xml_named_node_iterator();
+
+ // Construct an iterator which points to the specified node
+ xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+ // Iterator operators
+ bool operator==(const xml_named_node_iterator& rhs) const;
+ bool operator!=(const xml_named_node_iterator& rhs) const;
+
+ xml_node& operator*() const;
+ xml_node* operator->() const;
+
+ const xml_named_node_iterator& operator++();
+ xml_named_node_iterator operator++(int);
+
+private:
+ mutable xml_node _node;
+ const char_t* _name;
+};
+
+// Abstract tree walker class (see xml_node::traverse)
+class PUGIXML_CLASS xml_tree_walker
+{
+ friend class xml_node;
+
+private:
+ int _depth;
+
+protected:
+ // Get current traversal depth
+ int depth() const;
+
+public:
+ xml_tree_walker();
+ virtual ~xml_tree_walker();
+
+ // Callback that is called when traversal begins
+ virtual bool begin(xml_node& node);
+
+ // Callback that is called for each node traversed
+ virtual bool for_each(xml_node& node) = 0;
+
+ // Callback that is called when traversal ends
+ virtual bool end(xml_node& node);
+};
+
+// Parsing status, returned as part of xml_parse_result object
+enum xml_parse_status {
+ status_ok = 0, // No error
+
+ status_file_not_found, // File was not found during load_file()
+ status_io_error, // Error reading from file/stream
+ status_out_of_memory, // Could not allocate memory
+ status_internal_error, // Internal error occurred
+
+ status_unrecognized_tag, // Parser could not determine tag type
+
+ status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction
+ status_bad_comment, // Parsing error occurred while parsing comment
+ status_bad_cdata, // Parsing error occurred while parsing CDATA section
+ status_bad_doctype, // Parsing error occurred while parsing document type declaration
+ status_bad_pcdata, // Parsing error occurred while parsing PCDATA section
+ status_bad_start_element, // Parsing error occurred while parsing start element tag
+ status_bad_attribute, // Parsing error occurred while parsing element attribute
+ status_bad_end_element, // Parsing error occurred while parsing end element tag
+ status_end_element_mismatch // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+};
+
+// Parsing result
+struct PUGIXML_CLASS xml_parse_result {
+ // Parsing status (see xml_parse_status)
+ xml_parse_status status;
+
+ // Last parsed offset (in char_t units from start of input data)
+ ptrdiff_t offset;
+
+ // Source document encoding
+ xml_encoding encoding;
+
+ // Default constructor, initializes object to failed state
+ xml_parse_result();
+
+ // Cast to bool operator
+ operator bool() const;
+
+ // Get error description
+ const char* description() const;
+};
+
+// Document class (DOM tree root)
+class PUGIXML_CLASS xml_document: public xml_node
+{
+private:
+ char_t* _buffer;
+
+ char _memory[192];
+
+ // Non-copyable semantics
+ xml_document(const xml_document&);
+ const xml_document& operator=(const xml_document&);
+
+ void create();
+ void destroy();
+
+ xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own);
+
+public:
+ // Default constructor, makes empty document
+ xml_document();
+
+ // Destructor, invalidates all node/attribute handles to this document
+ ~xml_document();
+
+ // Removes all nodes, leaving the empty document
+ void reset();
+
+ // Removes all nodes, then copies the entire contents of the specified document
+ void reset(const xml_document& proto);
+
+#ifndef PUGIXML_NO_STL
+ // Load document from stream.
+ xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+ xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+#endif
+
+ // Load document from zero-terminated string. No encoding conversions are applied.
+ xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+ // Load document from file
+ xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+ xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+ xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+ // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+ xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+ // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+ xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+ void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+#ifndef PUGIXML_NO_STL
+ // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+ void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+ void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+#endif
+
+ // Save XML to file
+ bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+ bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+ // Get document element
+ xml_node document_element() const;
+};
#ifndef PUGIXML_NO_XPATH
- // XPath query return type
- enum xpath_value_type
- {
- xpath_type_none, // Unknown type (query failed to compile)
- xpath_type_node_set, // Node set (xpath_node_set)
- xpath_type_number, // Number
- xpath_type_string, // String
- xpath_type_boolean // Boolean
- };
-
- // XPath parsing result
- struct PUGIXML_CLASS xpath_parse_result
- {
- // Error message (0 if no error)
- const char* error;
-
- // Last parsed offset (in char_t units from string start)
- ptrdiff_t offset;
-
- // Default constructor, initializes object to failed state
- xpath_parse_result();
-
- // Cast to bool operator
- operator bool() const;
-
- // Get error description
- const char* description() const;
- };
-
- // A single XPath variable
- class PUGIXML_CLASS xpath_variable
- {
- friend class xpath_variable_set;
-
- protected:
- xpath_value_type _type;
- xpath_variable* _next;
-
- xpath_variable();
-
- // Non-copyable semantics
- xpath_variable(const xpath_variable&);
- xpath_variable& operator=(const xpath_variable&);
-
- public:
- // Get variable name
- const char_t* name() const;
-
- // Get variable type
- xpath_value_type type() const;
-
- // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
- bool get_boolean() const;
- double get_number() const;
- const char_t* get_string() const;
- const xpath_node_set& get_node_set() const;
-
- // Set variable value; no type conversion is performed, false is returned on type mismatch error
- bool set(bool value);
- bool set(double value);
- bool set(const char_t* value);
- bool set(const xpath_node_set& value);
- };
-
- // A set of XPath variables
- class PUGIXML_CLASS xpath_variable_set
- {
- private:
- xpath_variable* _data[64];
-
- // Non-copyable semantics
- xpath_variable_set(const xpath_variable_set&);
- xpath_variable_set& operator=(const xpath_variable_set&);
-
- xpath_variable* find(const char_t* name) const;
-
- public:
- // Default constructor/destructor
- xpath_variable_set();
- ~xpath_variable_set();
-
- // Add a new variable or get the existing one, if the types match
- xpath_variable* add(const char_t* name, xpath_value_type type);
-
- // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
- bool set(const char_t* name, bool value);
- bool set(const char_t* name, double value);
- bool set(const char_t* name, const char_t* value);
- bool set(const char_t* name, const xpath_node_set& value);
-
- // Get existing variable by name
- xpath_variable* get(const char_t* name);
- const xpath_variable* get(const char_t* name) const;
- };
-
- // A compiled XPath query object
- class PUGIXML_CLASS xpath_query
- {
- private:
- void* _impl;
- xpath_parse_result _result;
-
- typedef void (*unspecified_bool_type)(xpath_query***);
-
- // Non-copyable semantics
- xpath_query(const xpath_query&);
- xpath_query& operator=(const xpath_query&);
-
- public:
- // Construct a compiled object from XPath expression.
- // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
- explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
-
- // Destructor
- ~xpath_query();
-
- // Get query expression return type
- xpath_value_type return_type() const;
-
- // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
- // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
- bool evaluate_boolean(const xpath_node& n) const;
-
- // Evaluate expression as double value in the specified context; performs type conversion if necessary.
- // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
- double evaluate_number(const xpath_node& n) const;
-
- #ifndef PUGIXML_NO_STL
- // Evaluate expression as string value in the specified context; performs type conversion if necessary.
- // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
- string_t evaluate_string(const xpath_node& n) const;
- #endif
-
- // Evaluate expression as string value in the specified context; performs type conversion if necessary.
- // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
- // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
- // If PUGIXML_NO_EXCEPTIONS is defined, returns empty set instead.
- size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
-
- // Evaluate expression as node set in the specified context.
- // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
- // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
- xpath_node_set evaluate_node_set(const xpath_node& n) const;
-
- // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
- const xpath_parse_result& result() const;
-
- // Safe bool conversion operator
- operator unspecified_bool_type() const;
-
- // Borland C++ workaround
- bool operator!() const;
- };
-
- #ifndef PUGIXML_NO_EXCEPTIONS
- // XPath exception class
- class PUGIXML_CLASS xpath_exception: public std::exception
- {
- private:
- xpath_parse_result _result;
-
- public:
- // Construct exception from parse result
- explicit xpath_exception(const xpath_parse_result& result);
-
- // Get error message
- virtual const char* what() const throw();
-
- // Get parse result
- const xpath_parse_result& result() const;
- };
- #endif
-
- // XPath node class (either xml_node or xml_attribute)
- class PUGIXML_CLASS xpath_node
- {
- private:
- xml_node _node;
- xml_attribute _attribute;
-
- typedef void (*unspecified_bool_type)(xpath_node***);
-
- public:
- // Default constructor; constructs empty XPath node
- xpath_node();
-
- // Construct XPath node from XML node/attribute
- xpath_node(const xml_node& node);
- xpath_node(const xml_attribute& attribute, const xml_node& parent);
-
- // Get node/attribute, if any
- xml_node node() const;
- xml_attribute attribute() const;
-
- // Get parent of contained node/attribute
- xml_node parent() const;
-
- // Safe bool conversion operator
- operator unspecified_bool_type() const;
-
- // Borland C++ workaround
- bool operator!() const;
-
- // Comparison operators
- bool operator==(const xpath_node& n) const;
- bool operator!=(const xpath_node& n) const;
- };
+// XPath query return type
+enum xpath_value_type {
+ xpath_type_none, // Unknown type (query failed to compile)
+ xpath_type_node_set, // Node set (xpath_node_set)
+ xpath_type_number, // Number
+ xpath_type_string, // String
+ xpath_type_boolean // Boolean
+};
+
+// XPath parsing result
+struct PUGIXML_CLASS xpath_parse_result {
+ // Error message (0 if no error)
+ const char* error;
+
+ // Last parsed offset (in char_t units from string start)
+ ptrdiff_t offset;
+
+ // Default constructor, initializes object to failed state
+ xpath_parse_result();
+
+ // Cast to bool operator
+ operator bool() const;
+
+ // Get error description
+ const char* description() const;
+};
+
+// A single XPath variable
+class PUGIXML_CLASS xpath_variable
+{
+ friend class xpath_variable_set;
+
+protected:
+ xpath_value_type _type;
+ xpath_variable* _next;
+
+ xpath_variable();
+
+ // Non-copyable semantics
+ xpath_variable(const xpath_variable&);
+ xpath_variable& operator=(const xpath_variable&);
+
+public:
+ // Get variable name
+ const char_t* name() const;
+
+ // Get variable type
+ xpath_value_type type() const;
+
+ // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+ bool get_boolean() const;
+ double get_number() const;
+ const char_t* get_string() const;
+ const xpath_node_set& get_node_set() const;
+
+ // Set variable value; no type conversion is performed, false is returned on type mismatch error
+ bool set(bool value);
+ bool set(double value);
+ bool set(const char_t* value);
+ bool set(const xpath_node_set& value);
+};
+
+// A set of XPath variables
+class PUGIXML_CLASS xpath_variable_set
+{
+private:
+ xpath_variable* _data[64];
+
+ // Non-copyable semantics
+ xpath_variable_set(const xpath_variable_set&);
+ xpath_variable_set& operator=(const xpath_variable_set&);
+
+ xpath_variable* find(const char_t* name) const;
+
+public:
+ // Default constructor/destructor
+ xpath_variable_set();
+ ~xpath_variable_set();
+
+ // Add a new variable or get the existing one, if the types match
+ xpath_variable* add(const char_t* name, xpath_value_type type);
+
+ // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+ bool set(const char_t* name, bool value);
+ bool set(const char_t* name, double value);
+ bool set(const char_t* name, const char_t* value);
+ bool set(const char_t* name, const xpath_node_set& value);
+
+ // Get existing variable by name
+ xpath_variable* get(const char_t* name);
+ const xpath_variable* get(const char_t* name) const;
+};
+
+// A compiled XPath query object
+class PUGIXML_CLASS xpath_query
+{
+private:
+ void* _impl;
+ xpath_parse_result _result;
+
+ typedef void (*unspecified_bool_type)(xpath_query***);
+
+ // Non-copyable semantics
+ xpath_query(const xpath_query&);
+ xpath_query& operator=(const xpath_query&);
+
+public:
+ // Construct a compiled object from XPath expression.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+ explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+ // Destructor
+ ~xpath_query();
+
+ // Get query expression return type
+ xpath_value_type return_type() const;
+
+ // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ bool evaluate_boolean(const xpath_node& n) const;
+
+ // Evaluate expression as double value in the specified context; performs type conversion if necessary.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ double evaluate_number(const xpath_node& n) const;
+
+#ifndef PUGIXML_NO_STL
+ // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ string_t evaluate_string(const xpath_node& n) const;
+#endif
+
+ // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+ // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ // If PUGIXML_NO_EXCEPTIONS is defined, produces an empty string instead.
+ size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+ // Evaluate expression as node set in the specified context.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+ // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+ xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+ // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+ const xpath_parse_result& result() const;
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+};
+
+#ifndef PUGIXML_NO_EXCEPTIONS
+// XPath exception class
+class PUGIXML_CLASS xpath_exception: public std::exception
+{
+private:
+ xpath_parse_result _result;
+
+public:
+ // Construct exception from parse result
+ explicit xpath_exception(const xpath_parse_result& result);
+
+ // Get error message
+ virtual const char* what() const throw();
+
+ // Get parse result
+ const xpath_parse_result& result() const;
+};
+#endif
+
+// XPath node class (either xml_node or xml_attribute)
+class PUGIXML_CLASS xpath_node
+{
+private:
+ xml_node _node;
+ xml_attribute _attribute;
+
+ typedef void (*unspecified_bool_type)(xpath_node***);
+
+public:
+ // Default constructor; constructs empty XPath node
+ xpath_node();
+
+ // Construct XPath node from XML node/attribute
+ xpath_node(const xml_node& node);
+ xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+ // Get node/attribute, if any
+ xml_node node() const;
+ xml_attribute attribute() const;
+
+ // Get parent of contained node/attribute
+ xml_node parent() const;
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Comparison operators
+ bool operator==(const xpath_node& n) const;
+ bool operator!=(const xpath_node& n) const;
+};
#ifdef __BORLANDC__
- // Borland C++ workaround
- bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
- bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+// Borland C++ workaround
+bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
#endif
- // A fixed-size collection of XPath nodes
- class PUGIXML_CLASS xpath_node_set
- {
- public:
- // Collection type
- enum type_t
- {
- type_unsorted, // Not ordered
- type_sorted, // Sorted by document order (ascending)
- type_sorted_reverse // Sorted by document order (descending)
- };
-
- // Constant iterator type
- typedef const xpath_node* const_iterator;
-
- // Default constructor. Constructs empty set.
- xpath_node_set();
-
- // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
- xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
-
- // Destructor
- ~xpath_node_set();
-
- // Copy constructor/assignment operator
- xpath_node_set(const xpath_node_set& ns);
- xpath_node_set& operator=(const xpath_node_set& ns);
-
- // Get collection type
- type_t type() const;
-
- // Get collection size
- size_t size() const;
-
- // Indexing operator
- const xpath_node& operator[](size_t index) const;
-
- // Collection iterators
- const_iterator begin() const;
- const_iterator end() const;
-
- // Sort the collection in ascending/descending order by document order
- void sort(bool reverse = false);
-
- // Get first node in the collection by document order
- xpath_node first() const;
-
- // Check if collection is empty
- bool empty() const;
-
- private:
- type_t _type;
-
- xpath_node _storage;
-
- xpath_node* _begin;
- xpath_node* _end;
-
- void _assign(const_iterator begin, const_iterator end);
- };
+// A fixed-size collection of XPath nodes
+class PUGIXML_CLASS xpath_node_set
+{
+public:
+ // Collection type
+ enum type_t {
+ type_unsorted, // Not ordered
+ type_sorted, // Sorted by document order (ascending)
+ type_sorted_reverse // Sorted by document order (descending)
+ };
+
+ // Constant iterator type
+ typedef const xpath_node* const_iterator;
+
+ // Default constructor. Constructs empty set.
+ xpath_node_set();
+
+ // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+ xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+ // Destructor
+ ~xpath_node_set();
+
+ // Copy constructor/assignment operator
+ xpath_node_set(const xpath_node_set& ns);
+ xpath_node_set& operator=(const xpath_node_set& ns);
+
+ // Get collection type
+ type_t type() const;
+
+ // Get collection size
+ size_t size() const;
+
+ // Indexing operator
+ const xpath_node& operator[](size_t index) const;
+
+ // Collection iterators
+ const_iterator begin() const;
+ const_iterator end() const;
+
+ // Sort the collection in ascending/descending order by document order
+ void sort(bool reverse = false);
+
+ // Get first node in the collection by document order
+ xpath_node first() const;
+
+ // Check if collection is empty
+ bool empty() const;
+
+private:
+ type_t _type;
+
+ xpath_node _storage;
+
+ xpath_node* _begin;
+ xpath_node* _end;
+
+ void _assign(const_iterator begin, const_iterator end);
+};
#endif
#ifndef PUGIXML_NO_STL
- // Convert wide string to UTF8
- std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
- std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
-
- // Convert UTF8 to wide string
- std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
- std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+// Convert wide string to UTF8
+std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+
+// Convert UTF8 to wide string
+std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
#endif
- // Memory allocation function interface; returns pointer to allocated memory or NULL on failure
- typedef void* (*allocation_function)(size_t size);
-
- // Memory deallocation function interface
- typedef void (*deallocation_function)(void* ptr);
-
- // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
- void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
-
- // Get current memory management functions
- allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
- deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
+typedef void* (*allocation_function)(size_t size);
+
+// Memory deallocation function interface
+typedef void (*deallocation_function)(void* ptr);
+
+// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+
+// Get current memory management functions
+allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
}
#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
namespace std
{
- // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
- std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
- std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
- std::forward_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+std::forward_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
}
#endif
#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
namespace std
{
- // Workarounds for (non-standard) iterator category detection
- std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
- std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
- std::forward_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+// Workarounds for (non-standard) iterator category detection
+std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+std::forward_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
}
#endif
@@ -1253,7 +1244,7 @@ namespace std
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 50baa4e0d..e6fff965d 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -41,7 +41,7 @@
#include "HoleCollection.h"
#include "RuleExist.h"
#include "SentenceAlignmentWithSyntax.h"
-#include "SyntaxTree.h"
+#include "SyntaxNode.h"
#include "tables-core.h"
#include "XmlTree.h"
#include "InputFileStream.h"
@@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
void writeUnknownWordLabel(const string &);
+double getPcfgScore(const SyntaxNode &);
+
int main(int argc, char* argv[])
{
@@ -505,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
int labelI = labelIndex[ 2+holeCount+holeTotal ];
string label = m_options.sourceSyntax ?
- m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X";
hole.SetLabel(label, 0);
currPos = hole.GetEnd(0);
@@ -548,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
int labelI = labelIndex[ 2+holeCount ];
string targetLabel;
if (m_options.targetSyntax) {
- targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+ targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
} else {
@@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
}
if (m_options.pcfgScore) {
- double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
- logPCFGScore -= score;
+ logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);
}
currPos = hole.GetEnd(1);
@@ -674,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
// phrase labels
string targetLabel;
if (m_options.targetSyntax) {
- targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
} else {
@@ -682,14 +683,14 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
}
string sourceLabel = m_options.sourceSyntax ?
- m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X";
// create non-terms on the source side
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+ double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]);
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
@@ -946,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
- sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else {
sourceLabel = m_options.sourceSyntax ?
- m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+ m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
if (m_options.targetSyntax) {
- targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
} else {
@@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
rule.target += "[" + targetLabel + "]";
if (m_options.pcfgScore) {
- double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+ double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]);
rule.pcfgScore = std::exp(logPCFGScore);
}
@@ -1165,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
if (labels.size() > 0) {
wordCount[ word ]++;
- wordLabel[ word ] = labels[0]->GetLabel();
+ wordLabel[ word ] = labels[0]->label;
}
}
}
@@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName)
outFile.close();
}
+
+double getPcfgScore(const SyntaxNode &node)
+{
+ double score = 0.0f;
+ SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg");
+ if (p != node.attributes.end()) {
+ score = std::atof(p->second.c_str());
+ }
+ return score;
+}
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
index c42c13de6..24c2803a7 100644
--- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
@@ -82,7 +82,7 @@ int FilterRuleTable::Main(int argc, char *argv[])
StringCfgFilter filter(testStrings);
filter.Filter(std::cin, std::cout);
} else if (testSentenceFormat == kTree) {
- std::vector<boost::shared_ptr<StringTree> > testTrees;
+ std::vector<boost::shared_ptr<SyntaxTree> > testTrees;
ReadTestSet(testStream, testTrees);
if (sourceSideRuleFormat == kCfg) {
// TODO Implement TreeCfgFilter
@@ -124,7 +124,7 @@ void FilterRuleTable::ReadTestSet(
}
void FilterRuleTable::ReadTestSet(
- std::istream &input, std::vector<boost::shared_ptr<StringTree> > &sentences)
+ std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
XmlTreeParser parser;
int lineNum = 0;
@@ -136,7 +136,8 @@ void FilterRuleTable::ReadTestSet(
<< std::endl;
continue;
}
- sentences.push_back(boost::shared_ptr<StringTree>(parser.Parse(line)));
+ sentences.push_back(
+ boost::shared_ptr<SyntaxTree>(parser.Parse(line).release()));
}
}
@@ -166,7 +167,7 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[],
// Construct the 'top' of the usage message: the bit that comes before the
// options list.
std::ostringstream usageTop;
- usageTop << "Usage: " << GetName()
+ usageTop << "Usage: " << name()
<< " [OPTION]... MODEL TEST\n\n"
<< "Filter for SCFG/STSG rule tables.\n\n"
<< "Options";
@@ -202,11 +203,8 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[],
// Process the command-line.
po::variables_map vm;
- const int optionStyle = cls::allow_long
- | cls::long_allow_adjacent
- | cls::long_allow_next;
try {
- po::store(po::command_line_parser(argc, argv).style(optionStyle).
+ po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
options(cmdLineOptions).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
@@ -228,17 +226,6 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[],
}
}
-void FilterRuleTable::Error(const std::string &msg) const
-{
- std::cerr << GetName() << ": error: " << msg << std::endl;
- std::exit(1);
-}
-
-void FilterRuleTable::Warn(const std::string &msg) const
-{
- std::cerr << GetName() << ": warning: " << msg << std::endl;
-}
-
} // namespace FilterRuleTable
} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h
index 3a9489428..7b51bb8fa 100644
--- a/phrase-extract/filter-rule-table/FilterRuleTable.h
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.h
@@ -5,7 +5,9 @@
#include <boost/shared_ptr.hpp>
-#include "syntax-common/string_tree.h"
+#include "SyntaxTree.h"
+
+#include "syntax-common/tool.h"
#include "StringForest.h"
@@ -18,25 +20,19 @@ namespace FilterRuleTable
struct Options;
-class FilterRuleTable
+class FilterRuleTable : public Tool
{
public:
- FilterRuleTable() : m_name("filter-rule-table") {}
-
- const std::string &GetName() const {
- return m_name;
- }
+ FilterRuleTable() : Tool("filter-rule-table") {}
- int Main(int argc, char *argv[]);
+ virtual int Main(int argc, char *argv[]);
private:
- void Error(const std::string &) const;
-
// Filter rule table (on std::cin) for test set (string version).
void Filter(const std::vector<std::vector<std::string> > &);
// Filter rule table (on std::cin) for test set (parse tree version).
- void Filter(const std::vector<boost::shared_ptr<StringTree> > &);
+ void Filter(const std::vector<boost::shared_ptr<SyntaxTree> > &);
void ProcessOptions(int, char *[], Options &) const;
@@ -46,15 +42,11 @@ private:
// Read test set (tree version)
void ReadTestSet(std::istream &,
- std::vector<boost::shared_ptr<StringTree> > &);
+ std::vector<boost::shared_ptr<SyntaxTree> > &);
// Read test set (forest version)
void ReadTestSet(std::istream &,
std::vector<boost::shared_ptr<StringForest> > &);
-
- void Warn(const std::string &) const;
-
- std::string m_name;
};
} // namespace FilterRuleTable
diff --git a/phrase-extract/filter-rule-table/ForestTsgFilter.h b/phrase-extract/filter-rule-table/ForestTsgFilter.h
index ff48b2e22..c9fe41f57 100644
--- a/phrase-extract/filter-rule-table/ForestTsgFilter.h
+++ b/phrase-extract/filter-rule-table/ForestTsgFilter.h
@@ -10,7 +10,6 @@
#include <boost/unordered_set.hpp>
#include "syntax-common/numbered_set.h"
-#include "syntax-common/string_tree.h"
#include "syntax-common/tree.h"
#include "syntax-common/tree_fragment_tokenizer.h"
diff --git a/phrase-extract/filter-rule-table/TreeCfgFilter.cpp b/phrase-extract/filter-rule-table/TreeCfgFilter.cpp
index 153c706f3..dc938ac19 100644
--- a/phrase-extract/filter-rule-table/TreeCfgFilter.cpp
+++ b/phrase-extract/filter-rule-table/TreeCfgFilter.cpp
@@ -12,7 +12,7 @@ namespace FilterRuleTable
{
TreeCfgFilter::TreeCfgFilter(
- const std::vector<boost::shared_ptr<StringTree> > &sentences)
+ const std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
}
diff --git a/phrase-extract/filter-rule-table/TreeCfgFilter.h b/phrase-extract/filter-rule-table/TreeCfgFilter.h
index 5812a6dcc..3434ff200 100644
--- a/phrase-extract/filter-rule-table/TreeCfgFilter.h
+++ b/phrase-extract/filter-rule-table/TreeCfgFilter.h
@@ -8,8 +8,9 @@
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
+#include "SyntaxTree.h"
+
#include "syntax-common/numbered_set.h"
-#include "syntax-common/string_tree.h"
#include "syntax-common/tree.h"
#include "syntax-common/tree_fragment_tokenizer.h"
@@ -25,10 +26,11 @@ namespace FilterRuleTable
// Filters a rule table, discarding rules that cannot be applied to a given
// test set. The rule table must have a TSG source-side and the test sentences
// must be parse trees.
-class TreeCfgFilter : public CfgFilter {
- public:
+class TreeCfgFilter : public CfgFilter
+{
+public:
// Initialize the filter for a given set of test sentences.
- TreeCfgFilter(const std::vector<boost::shared_ptr<StringTree> > &);
+ TreeCfgFilter(const std::vector<boost::shared_ptr<SyntaxTree> > &);
void Filter(std::istream &in, std::ostream &out);
};
diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
index 32a59fd6c..b9c58228d 100644
--- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
+++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
@@ -8,13 +8,13 @@ namespace FilterRuleTable
{
TreeTsgFilter::TreeTsgFilter(
- const std::vector<boost::shared_ptr<StringTree> > &sentences)
+ const std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
- // Convert each StringTree to an IdTree.
+ // Convert each SyntaxTree to an IdTree.
m_sentences.reserve(sentences.size());
- for (std::vector<boost::shared_ptr<StringTree> >::const_iterator p =
+ for (std::vector<boost::shared_ptr<SyntaxTree> >::const_iterator p =
sentences.begin(); p != sentences.end(); ++p) {
- m_sentences.push_back(boost::shared_ptr<IdTree>(StringTreeToIdTree(**p)));
+ m_sentences.push_back(boost::shared_ptr<IdTree>(SyntaxTreeToIdTree(**p)));
}
m_labelToTree.resize(m_testVocab.Size());
@@ -25,15 +25,15 @@ TreeTsgFilter::TreeTsgFilter(
}
}
-TreeTsgFilter::IdTree *TreeTsgFilter::StringTreeToIdTree(const StringTree &s)
+TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
{
- IdTree *t = new IdTree(m_testVocab.Insert(s.value()));
- const std::vector<StringTree*> &sChildren = s.children();
+ IdTree *t = new IdTree(m_testVocab.Insert(s.value().label));
+ const std::vector<SyntaxTree*> &sChildren = s.children();
std::vector<IdTree*> &tChildren = t->children();
tChildren.reserve(sChildren.size());
- for (std::vector<StringTree*>::const_iterator p = sChildren.begin();
+ for (std::vector<SyntaxTree*>::const_iterator p = sChildren.begin();
p != sChildren.end(); ++p) {
- IdTree *child = StringTreeToIdTree(**p);
+ IdTree *child = SyntaxTreeToIdTree(**p);
child->parent() = t;
tChildren.push_back(child);
}
diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.h b/phrase-extract/filter-rule-table/TreeTsgFilter.h
index 17378b552..fa11350b6 100644
--- a/phrase-extract/filter-rule-table/TreeTsgFilter.h
+++ b/phrase-extract/filter-rule-table/TreeTsgFilter.h
@@ -8,8 +8,9 @@
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
+#include "SyntaxTree.h"
+
#include "syntax-common/numbered_set.h"
-#include "syntax-common/string_tree.h"
#include "syntax-common/tree.h"
#include "syntax-common/tree_fragment_tokenizer.h"
@@ -29,7 +30,7 @@ class TreeTsgFilter : public TsgFilter
{
public:
// Initialize the filter for a given set of test sentences.
- TreeTsgFilter(const std::vector<boost::shared_ptr<StringTree> > &);
+ TreeTsgFilter(const std::vector<boost::shared_ptr<SyntaxTree> > &);
private:
// Add an entry to m_labelToTree for every subtree of the given tree.
@@ -41,9 +42,9 @@ private:
// Try to match a fragment against a specific subtree of a test tree.
bool MatchFragment(const IdTree &, const IdTree &);
- // Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into
+ // Convert a SyntaxTree to an IdTree (wrt m_testVocab). Inserts symbols into
// m_testVocab.
- IdTree *StringTreeToIdTree(const StringTree &);
+ IdTree *SyntaxTreeToIdTree(const SyntaxTree &);
std::vector<boost::shared_ptr<IdTree> > m_sentences;
std::vector<std::vector<const IdTree *> > m_labelToTree;
diff --git a/phrase-extract/pcfg-common/Jamfile b/phrase-extract/pcfg-common/Jamfile
deleted file mode 100644
index 5669b443e..000000000
--- a/phrase-extract/pcfg-common/Jamfile
+++ /dev/null
@@ -1 +0,0 @@
-lib pcfg_common : [ glob *.cc ] ..//syntax-common ..//deps : <include>.. ;
diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h
deleted file mode 100644
index c5c04cba4..000000000
--- a/phrase-extract/pcfg-common/pcfg.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_PCFG_H_
-#define PCFG_PCFG_H_
-
-#include <istream>
-#include <map>
-#include <ostream>
-#include <vector>
-
-#include "typedef.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-class Pcfg {
- public:
- typedef std::vector<std::size_t> Key;
- typedef std::map<Key, double> Map;
- typedef Map::iterator iterator;
- typedef Map::const_iterator const_iterator;
-
- Pcfg() {}
-
- iterator begin() { return rules_.begin(); }
- const_iterator begin() const { return rules_.begin(); }
-
- iterator end() { return rules_.end(); }
- const_iterator end() const { return rules_.end(); }
-
- void Add(const Key &, double);
- bool Lookup(const Key &, double &) const;
- void Read(std::istream &, Vocabulary &);
- void Write(const Vocabulary &, std::ostream &) const;
-
- private:
- Map rules_;
-};
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h
deleted file mode 100644
index ce28eb8dd..000000000
--- a/phrase-extract/pcfg-common/pcfg_tree.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_PCFG_TREE_H_
-#define PCFG_PCFG_TREE_H_
-
-#include <string>
-
-#include "syntax_tree.h"
-#include "xml_tree_writer.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-template<typename DerivedType>
-class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
- public:
- typedef std::string LabelType;
- typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
-
- PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
-
- double score() const { return score_; }
- void set_score(double s) { score_ = s; }
-
- private:
- double score_;
-};
-
-class PcfgTree : public PcfgTreeBase<PcfgTree> {
- public:
- typedef PcfgTreeBase<PcfgTree> BaseType;
- PcfgTree(const BaseType::LabelType &label) : BaseType(label) {}
-};
-
-// Specialise XmlOutputHandler for PcfgTree.
-template<>
-class XmlOutputHandler<PcfgTree> {
- public:
- typedef std::map<std::string, std::string> AttributeMap;
-
- void GetLabel(const PcfgTree &tree, std::string &label) const {
- label = tree.label();
- }
-
- void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
- attribute_map.clear();
- double score = tree.score();
- if (score != 0.0) {
- std::ostringstream out;
- out << tree.score();
- attribute_map["pcfg"] = out.str();
- }
- }
-};
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h
deleted file mode 100644
index c0c6eaef9..000000000
--- a/phrase-extract/pcfg-common/syntax_tree.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_SYNTAX_TREE_H_
-#define PCFG_SYNTAX_TREE_H_
-
-#include <cassert>
-#include <vector>
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-// Base class for SyntaxTree, AgreementTree, and friends.
-template<typename T, typename DerivedType>
-class SyntaxTreeBase {
- public:
- // Constructors
- SyntaxTreeBase(const T &label)
- : label_(label)
- , children_()
- , parent_(0) {}
-
- SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
- : label_(label)
- , children_(children)
- , parent_(0) {}
-
- // Destructor
- virtual ~SyntaxTreeBase();
-
- const T &label() const { return label_; }
- const DerivedType *parent() const { return parent_; }
- DerivedType *parent() { return parent_; }
- const std::vector<DerivedType *> &children() const { return children_; }
- std::vector<DerivedType *> &children() { return children_; }
-
- void set_label(const T &label) { label_ = label; }
- void set_parent(DerivedType *parent) { parent_ = parent; }
- void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
-
- bool IsLeaf() const { return children_.empty(); }
-
- bool IsPreterminal() const {
- return children_.size() == 1 && children_[0]->IsLeaf();
- }
-
- void AddChild(DerivedType *child) { children_.push_back(child); }
-
- private:
- T label_;
- std::vector<DerivedType *> children_;
- DerivedType *parent_;
-};
-
-template<typename T>
-class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
- public:
- typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
- SyntaxTree(const T &label) : BaseType(label) {}
- SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
- : BaseType(label, children) {}
-};
-
-template<typename T, typename DerivedType>
-SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
- for (std::size_t i = 0; i < children_.size(); ++i) {
- delete children_[i];
- }
-}
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/pcfg-common/tool.cc
deleted file mode 100644
index f54e07a12..000000000
--- a/phrase-extract/pcfg-common/tool.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include "tool.h"
-
-#include <sstream>
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-std::istream &Tool::OpenInputOrDie(const std::string &filename) {
- // TODO Check that function is only called once?
- if (filename.empty() || filename == "-") {
- input_ptr_ = &(std::cin);
- } else {
- input_file_stream_.open(filename.c_str());
- if (!input_file_stream_) {
- std::ostringstream msg;
- msg << "failed to open input file: " << filename;
- Error(msg.str());
- }
- input_ptr_ = &input_file_stream_;
- }
- return *input_ptr_;
-}
-
-std::ostream &Tool::OpenOutputOrDie(const std::string &filename) {
- // TODO Check that function is only called once?
- if (filename.empty() || filename == "-") {
- output_ptr_ = &(std::cout);
- } else {
- output_file_stream_.open(filename.c_str());
- if (!output_file_stream_) {
- std::ostringstream msg;
- msg << "failed to open output file: " << filename;
- Error(msg.str());
- }
- output_ptr_ = &output_file_stream_;
- }
- return *output_ptr_;
-}
-
-void Tool::OpenNamedInputOrDie(const std::string &filename,
- std::ifstream &stream) {
- stream.open(filename.c_str());
- if (!stream) {
- std::ostringstream msg;
- msg << "failed to open input file: " << filename;
- Error(msg.str());
- }
-}
-
-void Tool::OpenNamedOutputOrDie(const std::string &filename,
- std::ofstream &stream) {
- stream.open(filename.c_str());
- if (!stream) {
- std::ostringstream msg;
- msg << "failed to open output file: " << filename;
- Error(msg.str());
- }
-}
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/pcfg-common/tool.h
deleted file mode 100644
index 2c903a11e..000000000
--- a/phrase-extract/pcfg-common/tool.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_TOOL_H_
-#define PCFG_TOOL_H_
-
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <string>
-
-#include <boost/program_options/cmdline.hpp>
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-class Tool {
- public:
- virtual ~Tool() {}
-
- const std::string &name() const { return name_; }
-
- virtual int Main(int argc, char *argv[]) = 0;
-
- protected:
- Tool(const std::string &name) : name_(name) {}
-
- // Returns the boost::program_options style that should be used by all tools.
- static int CommonOptionStyle() {
- namespace cls = boost::program_options::command_line_style;
- return cls::default_style & (~cls::allow_guessing);
- }
-
- void Warn(const std::string &msg) const {
- std::cerr << name_ << ": warning: " << msg << std::endl;
- }
-
- void Error(const std::string &msg) const {
- std::cerr << name_ << ": error: " << msg << std::endl;
- std::exit(1);
- }
-
- // Initialises the tool's main input stream and returns a reference that is
- // valid for the remainder of the tool's lifetime. If filename is empty or
- // "-" then input is standard input; otherwise it is the named file. Calls
- // Error() if the file cannot be opened for reading.
- std::istream &OpenInputOrDie(const std::string &filename);
-
- // Initialises the tool's main output stream and returns a reference that is
- // valid for the remainder of the tool's lifetime. If filename is empty or
- // "-" then output is standard output; otherwise it is the named file. Calls
- // Error() if the file cannot be opened for writing.
- std::ostream &OpenOutputOrDie(const std::string &filename);
-
- // Opens the named input file using the supplied ifstream. Calls Error() if
- // the file cannot be opened for reading.
- void OpenNamedInputOrDie(const std::string &, std::ifstream &);
-
- // Opens the named output file using the supplied ofstream. Calls Error() if
- // the file cannot be opened for writing.
- void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
-
- private:
- std::string name_;
- std::istream *input_ptr_;
- std::ifstream input_file_stream_;
- std::ostream *output_ptr_;
- std::ofstream output_file_stream_;
-};
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h
deleted file mode 100644
index e738163df..000000000
--- a/phrase-extract/pcfg-common/typedef.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_TYPEDEF_H_
-#define PCFG_TYPEDEF_H_
-
-#include <string>
-
-#include "syntax-common/numbered_set.h"
-#include "syntax_tree.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-typedef NumberedSet<std::string> Vocabulary;
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc
deleted file mode 100644
index 3d9291994..000000000
--- a/phrase-extract/pcfg-common/xml_tree_parser.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include "xml_tree_parser.h"
-
-#include <cassert>
-#include <vector>
-
-#include "tables-core.h"
-#include "XmlException.h"
-#include "XmlTree.h"
-
-#include "syntax-common/exception.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-XmlTreeParser::XmlTreeParser() {
-}
-
-std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
- m_line = line;
- m_tree.Clear();
- try {
- if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
- throw Exception("");
- }
- } catch (const XmlException &e) {
- throw Exception(e.getMsg());
- }
- m_tree.ConnectNodes();
- SyntaxNode *root = m_tree.GetTop();
- if (!root) {
- // There is no XML tree.
- return std::auto_ptr<PcfgTree>();
- }
- m_words = tokenize(m_line.c_str());
- return ConvertTree(*root, m_words);
-}
-
-// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
-std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
- const SyntaxNode &tree,
- const std::vector<std::string> &words) {
- std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
- const std::vector<SyntaxNode*> &children = tree.GetChildren();
- if (children.empty()) {
- if (tree.GetStart() != tree.GetEnd()) {
- std::ostringstream msg;
- msg << "leaf node covers multiple words (" << tree.GetStart()
- << "-" << tree.GetEnd() << "): this is currently unsupported";
- throw Exception(msg.str());
- }
- std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
- leaf->set_parent(root.get());
- root->AddChild(leaf.release());
- } else {
- for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
- p != children.end(); ++p) {
- assert(*p);
- std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
- child->set_parent(root.get());
- root->AddChild(child.release());
- }
- }
- return root;
-}
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h
deleted file mode 100644
index 675a112d8..000000000
--- a/phrase-extract/pcfg-common/xml_tree_parser.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_XML_TREE_PARSER_H_
-#define PCFG_XML_TREE_PARSER_H_
-
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "pcfg_tree.h"
-#include "SyntaxTree.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-// Parses a string in Moses' XML parse tree format and returns a PcfgTree
-// object.
-class XmlTreeParser {
- public:
- XmlTreeParser();
- std::auto_ptr<PcfgTree> Parse(const std::string &);
- private:
- std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
- const std::vector<std::string> &);
-
- std::set<std::string> m_labelSet;
- std::map<std::string, int> m_topLabelSet;
- std::string m_line;
- MosesTraining::SyntaxTree m_tree;
- std::vector<std::string> m_words;
-};
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h
deleted file mode 100644
index 8582e544f..000000000
--- a/phrase-extract/pcfg-common/xml_tree_writer.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_XML_TREE_WRITER_H_
-#define PCFG_XML_TREE_WRITER_H_
-
-#include <cassert>
-#include <map>
-#include <memory>
-#include <ostream>
-#include <vector>
-#include <string>
-
-#include "XmlTree.h"
-
-#include "syntax_tree.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-template<typename InputTree>
-class XmlOutputHandler {
- public:
- typedef std::map<std::string, std::string> AttributeMap;
-
- void GetLabel(const InputTree &, std::string &) const;
- void GetAttributes(const InputTree &, AttributeMap &) const;
-};
-
-template<typename InputTree>
-class XmlTreeWriter : public XmlOutputHandler<InputTree> {
- public:
- typedef XmlOutputHandler<InputTree> Base;
- void Write(const InputTree &, std::ostream &) const;
- private:
- std::string Escape(const std::string &) const;
-};
-
-template<typename InputTree>
-void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
- std::ostream &out) const {
- assert(!tree.IsLeaf());
-
- // Opening tag
-
- std::string label;
- Base::GetLabel(tree, label);
- out << "<tree label=\"" << Escape(label) << "\"";
-
- typename Base::AttributeMap attribute_map;
- Base::GetAttributes(tree, attribute_map);
-
- for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
- p != attribute_map.end(); ++p) {
- out << " " << p->first << "=\"" << p->second << "\"";
- }
-
- out << ">";
-
- // Children
-
- const std::vector<InputTree *> &children = tree.children();
- for (typename std::vector<InputTree *>::const_iterator p = children.begin();
- p != children.end(); ++p) {
- InputTree &child = **p;
- if (child.IsLeaf()) {
- Base::GetLabel(child, label);
- out << " " << Escape(label);
- } else {
- out << " ";
- Write(**p, out);
- }
- }
-
- // Closing tag
- out << " </tree>";
-
- if (tree.parent() == 0) {
- out << std::endl;
- }
-}
-
-// Escapes XML special characters.
-template<typename InputTree>
-std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
- std::string t;
- std::size_t len = s.size();
- t.reserve(len);
- for (std::size_t i = 0; i < len; ++i) {
- if (s[i] == '<') {
- t += "&lt;";
- } else if (s[i] == '>') {
- t += "&gt;";
- } else if (s[i] == '[') {
- t += "&#91;";
- } else if (s[i] == ']') {
- t += "&#93;";
- } else if (s[i] == '|') {
- t += "&#124;";
- } else if (s[i] == '&') {
- t += "&amp;";
- } else if (s[i] == '\'') {
- t += "&apos;";
- } else if (s[i] == '"') {
- t += "&quot;";
- } else {
- t += s[i];
- }
- }
- return t;
-}
-
-} // namespace PCFG
-} // namespace Syntax
-} // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile
index 61f056599..2f4ae1e7d 100644
--- a/phrase-extract/pcfg-extract/Jamfile
+++ b/phrase-extract/pcfg-extract/Jamfile
@@ -1 +1 @@
-exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : <include>.. ;
+exe pcfg-extract : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : <include>.. ;
diff --git a/phrase-extract/pcfg-extract/main.cc b/phrase-extract/pcfg-extract/main.cc
index 84051f2e2..010c04b00 100644
--- a/phrase-extract/pcfg-extract/main.cc
+++ b/phrase-extract/pcfg-extract/main.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,7 +19,8 @@
#include "pcfg_extract.h"
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
MosesTraining::Syntax::PCFG::PcfgExtract tool;
return tool.Main(argc, argv);
}
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc
index a5e06aa82..0e89e26be 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.cc
+++ b/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,20 +19,6 @@
#include "pcfg_extract.h"
-#include "options.h"
-#include "rule_collection.h"
-#include "rule_extractor.h"
-
-#include "syntax-common/exception.h"
-
-#include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
-#include "pcfg-common/syntax_tree.h"
-#include "pcfg-common/typedef.h"
-#include "pcfg-common/xml_tree_parser.h"
-
-#include <boost/program_options.hpp>
-
#include <cassert>
#include <cstdlib>
#include <fstream>
@@ -43,11 +29,28 @@
#include <string>
#include <vector>
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
+#include <boost/program_options.hpp>
+
+#include "syntax-common/exception.h"
+#include "syntax-common/pcfg.h"
+#include "syntax-common/vocabulary.h"
+#include "syntax-common/xml_tree_parser.h"
+
+#include "SyntaxTree.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PCFG
+{
-int PcfgExtract::Main(int argc, char *argv[]) {
+int PcfgExtract::Main(int argc, char *argv[])
+{
// Process command-line options.
Options options;
ProcessOptions(argc, argv, options);
@@ -59,7 +62,7 @@ int PcfgExtract::Main(int argc, char *argv[]) {
XmlTreeParser parser;
std::string line;
std::size_t line_num = 0;
- std::auto_ptr<PcfgTree> tree;
+ std::auto_ptr<MosesTraining::SyntaxTree> tree;
while (std::getline(std::cin, line)) {
++line_num;
try {
@@ -87,7 +90,8 @@ int PcfgExtract::Main(int argc, char *argv[]) {
}
void PcfgExtract::ProcessOptions(int argc, char *argv[],
- Options &options) const {
+ Options &options) const
+{
namespace po = boost::program_options;
std::ostringstream usage_top;
@@ -96,7 +100,7 @@ void PcfgExtract::ProcessOptions(int argc, char *argv[],
// Declare the command line options that are visible to the user.
po::options_description visible(usage_top.str());
visible.add_options()
- ("help", "print help message and exit")
+ ("help", "print help message and exit")
;
// Declare the command line options that are hidden from the user
@@ -114,7 +118,7 @@ void PcfgExtract::ProcessOptions(int argc, char *argv[],
// Process the command-line.
po::variables_map vm;
try {
- po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()).
+ po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
options(cmd_line_options).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h
index 5882e45da..3b084acbe 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.h
+++ b/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -21,7 +21,7 @@
#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
#define PCFG_EXTRACT_PCFG_EXTRACT_H_
-#include "pcfg-common/tool.h"
+#include "syntax-common/tool.h"
namespace MosesTraining
{
diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc
index 21e84d2fa..a814f82d6 100644
--- a/phrase-extract/pcfg-extract/rule_collection.cc
+++ b/phrase-extract/pcfg-extract/rule_collection.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,19 +19,24 @@
#include "rule_collection.h"
-#include "pcfg-common/pcfg.h"
+#include "syntax-common/pcfg.h"
#include <cmath>
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PCFG
+{
-void RuleCollection::Add(std::size_t lhs, const std::vector<std::size_t> &rhs) {
+void RuleCollection::Add(std::size_t lhs, const std::vector<std::size_t> &rhs)
+{
++collection_[lhs][rhs];
}
-void RuleCollection::CreatePcfg(Pcfg &pcfg) {
+void RuleCollection::CreatePcfg(Pcfg &pcfg)
+{
std::vector<std::size_t> key;
for (const_iterator p = begin(); p != end(); ++p) {
std::size_t lhs = p->first;
diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h
index 3d9a9f98b..3bbc32721 100644
--- a/phrase-extract/pcfg-extract/rule_collection.h
+++ b/phrase-extract/pcfg-extract/rule_collection.h
@@ -25,7 +25,7 @@
#include <boost/unordered_map.hpp>
-#include "pcfg-common/pcfg.h"
+#include "syntax-common/pcfg.h"
namespace MosesTraining
{
diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc
index bb4698fae..f20f2d978 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.cc
+++ b/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,30 +19,33 @@
#include "rule_extractor.h"
-#include "pcfg-common/pcfg_tree.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PCFG
+{
RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
- : non_term_vocab_(non_term_vocab) {
+ : non_term_vocab_(non_term_vocab)
+{
}
-void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
- if (tree.IsPreterminal() || tree.IsLeaf()) {
+void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
+{
+ if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) {
return;
}
- std::size_t lhs = non_term_vocab_.Insert(tree.label());
+ std::size_t lhs = non_term_vocab_.Insert(tree.value().label);
std::vector<std::size_t> rhs;
- const std::vector<PcfgTree *> &children = tree.children();
+ const std::vector<SyntaxTree *> &children = tree.children();
rhs.reserve(children.size());
- for (std::vector<PcfgTree *>::const_iterator p(children.begin());
+ for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
p != children.end(); ++p) {
- const PcfgTree &child = **p;
- rhs.push_back(non_term_vocab_.Insert(child.label()));
+ const SyntaxTree &child = **p;
+ rhs.push_back(non_term_vocab_.Insert(child.value().label));
Extract(child, rc);
}
rc.Add(lhs, rhs);
diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h
index f35460909..91014747c 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.h
+++ b/phrase-extract/pcfg-extract/rule_extractor.h
@@ -21,7 +21,9 @@
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
-#include "pcfg-common/typedef.h"
+#include "SyntaxTree.h"
+
+#include "syntax-common/vocabulary.h"
#include "rule_collection.h"
@@ -32,14 +34,12 @@ namespace Syntax
namespace PCFG
{
-class PcfgTree;
-
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
class RuleExtractor
{
public:
RuleExtractor(Vocabulary &);
- void Extract(const PcfgTree &, RuleCollection &) const;
+ void Extract(const SyntaxTree &, RuleCollection &) const;
private:
Vocabulary &non_term_vocab_;
};
diff --git a/phrase-extract/pcfg-score/Jamfile b/phrase-extract/pcfg-score/Jamfile
index 45d46492a..ca321d04c 100644
--- a/phrase-extract/pcfg-score/Jamfile
+++ b/phrase-extract/pcfg-score/Jamfile
@@ -1 +1 @@
-exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : <include>.. ;
+exe pcfg-score : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : <include>.. ;
diff --git a/phrase-extract/pcfg-score/main.cc b/phrase-extract/pcfg-score/main.cc
index 5ce19f797..f4b6b1b64 100644
--- a/phrase-extract/pcfg-score/main.cc
+++ b/phrase-extract/pcfg-score/main.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,7 +19,8 @@
#include "pcfg_score.h"
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
MosesTraining::Syntax::PCFG::PcfgScore tool;
return tool.Main(argc, argv);
}
diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc
index a561c18ed..bdbb761f9 100644
--- a/phrase-extract/pcfg-score/pcfg_score.cc
+++ b/phrase-extract/pcfg-score/pcfg_score.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -33,26 +33,30 @@
#include <boost/program_options.hpp>
-#include "syntax-common/exception.h"
+#include "SyntaxTree.h"
-#include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
-#include "pcfg-common/syntax_tree.h"
-#include "pcfg-common/typedef.h"
-#include "pcfg-common/xml_tree_parser.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-int PcfgScore::Main(int argc, char *argv[]) {
+#include "syntax-common/exception.h"
+#include "syntax-common/pcfg.h"
+#include "syntax-common/vocabulary.h"
+#include "syntax-common/xml_tree_parser.h"
+#include "syntax-common/xml_tree_writer.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PCFG
+{
+
+int PcfgScore::Main(int argc, char *argv[])
+{
// Process command-line options.
Options options;
ProcessOptions(argc, argv, options);
// Open PCFG stream.
std::ifstream pcfg_stream;
- OpenNamedInputOrDie(options.pcfg_file, pcfg_stream);
+ OpenInputFileOrDie(options.pcfg_file, pcfg_stream);
// Read PCFG.
Pcfg pcfg;
@@ -62,14 +66,14 @@ int PcfgScore::Main(int argc, char *argv[]) {
// Score corpus according to PCFG.
TreeScorer scorer(pcfg, non_term_vocab);
XmlTreeParser parser;
- XmlTreeWriter<PcfgTree> writer;
+ XmlTreeWriter writer(std::cout);
std::string line;
std::size_t line_num = 0;
- std::auto_ptr<PcfgTree> tree;
+ std::auto_ptr<SyntaxTree> tree;
while (std::getline(std::cin, line)) {
++line_num;
try {
- tree = parser.Parse(line);
+ tree = parser.Parse(line, true);
} catch (Exception &e) {
std::ostringstream msg;
msg << "line " << line_num << ": " << e.msg();
@@ -89,13 +93,14 @@ int PcfgScore::Main(int argc, char *argv[]) {
std::cout << line << std::endl;
continue;
}
- writer.Write(*tree, std::cout);
+ writer.Write(*tree);
}
return 0;
}
-void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
+void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const
+{
namespace po = boost::program_options;
std::ostringstream usage_top;
@@ -105,14 +110,14 @@ void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
// Declare the command line options that are visible to the user.
po::options_description visible(usage_top.str());
visible.add_options()
- ("help", "print help message and exit")
+ ("help", "print help message and exit")
;
// Declare the command line options that are hidden from the user
// (these are used as positional options).
po::options_description hidden("Hidden options");
hidden.add_options()
- ("pcfg-file", po::value(&options.pcfg_file), "pcfg file")
+ ("pcfg-file", po::value(&options.pcfg_file), "pcfg file")
;
// Compose the full set of command-line options.
@@ -126,7 +131,7 @@ void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
// Process the command-line.
po::variables_map vm;
try {
- po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()).
+ po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
options(cmd_line_options).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h
index b0b4a77cd..b691b107f 100644
--- a/phrase-extract/pcfg-score/pcfg_score.h
+++ b/phrase-extract/pcfg-score/pcfg_score.h
@@ -21,7 +21,7 @@
#ifndef PCFG_SCORE_PCFG_SCORE_H_
#define PCFG_SCORE_PCFG_SCORE_H_
-#include "pcfg-common/tool.h"
+#include "syntax-common/tool.h"
namespace MosesTraining
{
diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc
index 53b6aaccf..3c6b6b0c8 100644
--- a/phrase-extract/pcfg-score/tree_scorer.cc
+++ b/phrase-extract/pcfg-score/tree_scorer.cc
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -20,39 +20,56 @@
#include "tree_scorer.h"
#include <cassert>
+#include <sstream>
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PCFG
+{
TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
- : pcfg_(pcfg)
- , non_term_vocab_(non_term_vocab) {
+ : pcfg_(pcfg)
+ , non_term_vocab_(non_term_vocab)
+{
}
-bool TreeScorer::Score(PcfgTree &root) const {
- if (root.IsPreterminal() || root.IsLeaf()) {
+bool TreeScorer::Score(SyntaxTree &root)
+{
+ scores_.clear();
+ ZeroScores(root);
+ if (!CalcScores(root)) {
+ return false;
+ }
+ SetAttributes(root);
+ return true;
+}
+
+bool TreeScorer::CalcScores(SyntaxTree &root)
+{
+ if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
return true;
}
- const std::vector<PcfgTree *> &children = root.children();
+ const std::vector<SyntaxTree *> &children = root.children();
double log_prob = 0.0;
std::vector<std::size_t> key;
key.reserve(children.size()+1);
- key.push_back(non_term_vocab_.Lookup(root.label()));
+ key.push_back(non_term_vocab_.Lookup(root.value().label));
- for (std::vector<PcfgTree *>::const_iterator p(children.begin());
+ for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
p != children.end(); ++p) {
- PcfgTree *child = *p;
+ SyntaxTree *child = *p;
assert(!child->IsLeaf());
- key.push_back(non_term_vocab_.Lookup(child->label()));
- if (!Score(*child)) {
+ key.push_back(non_term_vocab_.Lookup(child->value().label));
+ if (!CalcScores(*child)) {
return false;
}
- if (!child->IsPreterminal()) {
- log_prob += child->score();
+ if (!child->children()[0]->IsLeaf()) {
+ log_prob += scores_[child];
}
}
double rule_score;
@@ -61,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const {
return false;
}
log_prob += rule_score;
- root.set_score(log_prob);
+ scores_[&root] = log_prob;
return true;
}
+void TreeScorer::SetAttributes(SyntaxTree &root)
+{
+ // Terminals don't need attributes.
+ if (root.IsLeaf()) {
+ return;
+ }
+ // Preterminals don't need attributes (they have the implicit score 0.0).
+ if (root.children()[0]->IsLeaf()) {
+ return;
+ }
+ double score = scores_[&root];
+ if (score != 0.0) {
+ std::ostringstream out;
+ out << score;
+ root.value().attributes["pcfg"] = out.str();
+ }
+ for (std::vector<SyntaxTree *>::const_iterator p(root.children().begin());
+ p != root.children().end(); ++p) {
+ SetAttributes(**p);
+ }
+}
+
+void TreeScorer::ZeroScores(SyntaxTree &root)
+{
+ scores_[&root] = 0.0f;
+ const std::vector<SyntaxTree *> &children = root.children();
+ for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
+ p != children.end(); ++p) {
+ ZeroScores(**p);
+ }
+}
+
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h
index 8b1afcc3a..b95d13ddb 100644
--- a/phrase-extract/pcfg-score/tree_scorer.h
+++ b/phrase-extract/pcfg-score/tree_scorer.h
@@ -21,9 +21,10 @@
#ifndef PCFG_SCORE_TREE_SCORER_H_
#define PCFG_SCORE_TREE_SCORER_H_
-#include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
-#include "pcfg-common/typedef.h"
+#include "SyntaxTree.h"
+
+#include "syntax-common/vocabulary.h"
+#include "syntax-common/pcfg.h"
namespace MosesTraining
{
@@ -39,11 +40,16 @@ public:
// Score tree according to PCFG. Returns false if unsuccessful (due to
// missing rule).
- bool Score(PcfgTree &) const;
+ bool Score(SyntaxTree &);
private:
const Pcfg &pcfg_;
const Vocabulary &non_term_vocab_;
+ std::map<SyntaxTree *, double> scores_;
+
+ bool CalcScores(SyntaxTree &);
+ void SetAttributes(SyntaxTree &);
+ void ZeroScores(SyntaxTree &);
};
} // namespace PCFG
diff --git a/phrase-extract/postprocess-egret-forests/Forest.h b/phrase-extract/postprocess-egret-forests/Forest.h
new file mode 100644
index 000000000..7f00ecb88
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/Forest.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "vector"
+
+#include <boost/shared_ptr.hpp>
+
+#include "Symbol.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+class Forest
+{
+public:
+ struct Vertex;
+
+ struct Hyperedge {
+ double weight;
+ Vertex *head;
+ std::vector<Vertex *> tail;
+ };
+
+ struct Vertex {
+ Symbol symbol;
+ int start;
+ int end;
+ std::vector<boost::shared_ptr<Hyperedge> > incoming;
+ };
+
+ Forest() {}
+
+ std::vector<boost::shared_ptr<Vertex> > vertices;
+
+private:
+ // Copying is not allowed.
+ Forest(const Forest &);
+ Forest &operator=(const Forest &);
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/ForestParser.cpp b/phrase-extract/postprocess-egret-forests/ForestParser.cpp
new file mode 100644
index 000000000..21e479ca6
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/ForestParser.cpp
@@ -0,0 +1,151 @@
+#include "ForestParser.h"
+
+#include <istream>
+#include <string>
+
+#include <boost/make_shared.hpp>
+
+#include "util/tokenize_piece.hh"
+
+#include "syntax-common/exception.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+ForestParser::ForestParser()
+ : m_input(0)
+{
+}
+
+ForestParser::ForestParser(std::istream &input)
+ : m_input(&input)
+{
+ ++(*this);
+}
+
+ForestParser &ForestParser::operator++()
+{
+ if (!m_input) {
+ return *this;
+ }
+ m_vertexSet.clear();
+ m_entry.forest.vertices.clear();
+ if (!std::getline(*m_input, m_tmpLine)) {
+ m_input = 0;
+ return *this;
+ }
+ // The first line contains the sentence number.
+ ParseSentenceNumLine(m_tmpLine, m_entry.sentNum);
+ // The second line contains the sentence string.
+ std::getline(*m_input, m_entry.sentence);
+ // Subsequent lines contain hyperedges -- or a blank line if there was a
+ // parse failure -- terminated by a blank line.
+ std::getline(*m_input, m_tmpLine);
+ if (m_tmpLine == "") { // Parse failure
+ std::getline(*m_input, m_tmpLine);
+ assert(m_tmpLine == "");
+ return *this;
+ }
+ while (m_tmpLine != "") {
+ ParseHyperedgeLine(m_tmpLine, m_entry.forest);
+ std::getline(*m_input, m_tmpLine);
+ }
+ return *this;
+}
+
+boost::shared_ptr<Forest::Vertex> ForestParser::AddVertex(const VertexSP &v)
+{
+ std::pair<VertexSet::iterator, bool> ret = m_vertexSet.insert(v);
+ if (ret.second) {
+ m_entry.forest.vertices.push_back(*ret.first);
+ }
+ return *ret.first;
+}
+
+void ForestParser::ParseSentenceNumLine(const std::string &line,
+ std::size_t &sentNum)
+{
+ const util::AnyCharacter delimiter(" \t");
+ util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
+ if (*p != "sentence") {
+ // FIXME
+ throw Exception("");
+ }
+ ++p;
+ std::string tmp;
+ p->CopyToString(&tmp);
+ sentNum = std::atoi(tmp.c_str());
+}
+
+void ForestParser::ParseHyperedgeLine(const std::string &line, Forest &forest)
+{
+ const util::AnyCharacter delimiter(" \t");
+ util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
+ VertexSP v = AddVertex(ParseVertex(*p));
+ HyperedgeSP e = boost::make_shared<Forest::Hyperedge>();
+ e->head = v.get();
+ ++p;
+ if (*p != "=>") {
+ // FIXME
+ throw Exception("");
+ }
+ for (++p; *p != "|||"; ++p) {
+ v = ParseVertex(*p);
+ if (v->start == -1) {
+ // Egret does not give start/end for terminals.
+ v->start = v->end = e->head->start;
+ }
+ e->tail.push_back(AddVertex(v).get());
+ }
+ ++p;
+ std::string tmp;
+ p->CopyToString(&tmp);
+ e->weight = std::atof(tmp.c_str());
+ e->head->incoming.push_back(e);
+}
+
+boost::shared_ptr<Forest::Vertex> ForestParser::ParseVertex(
+ const StringPiece &s)
+{
+ VertexSP v = boost::make_shared<Forest::Vertex>();
+ std::size_t pos = s.rfind('[');
+ if (pos == std::string::npos) {
+ s.CopyToString(&v->symbol.value);
+ v->symbol.isNonTerminal = false;
+ v->start = v->end = -1;
+ return v;
+ }
+ if (pos > 2 && s[pos-2] == '^' && s[pos-1] == 'g') {
+ s.substr(0, pos-2).CopyToString(&v->symbol.value);
+ } else {
+ s.substr(0, pos).CopyToString(&v->symbol.value);
+ }
+ v->symbol.isNonTerminal = true;
+ std::size_t begin = pos + 1;
+ pos = s.find(',', begin+1);
+ std::string tmp;
+ s.substr(begin, pos-begin).CopyToString(&tmp);
+ v->start = std::atoi(tmp.c_str());
+ s.substr(pos+1, s.size()-pos-2).CopyToString(&tmp);
+ v->end = std::atoi(tmp.c_str());
+ return v;
+}
+
+bool operator==(const ForestParser &lhs, const ForestParser &rhs)
+{
+ // TODO Is this right? Compare values of istreams if non-zero?
+ return lhs.m_input == rhs.m_input;
+}
+
+bool operator!=(const ForestParser &lhs, const ForestParser &rhs)
+{
+ return !(lhs == rhs);
+}
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/ForestParser.h b/phrase-extract/postprocess-egret-forests/ForestParser.h
new file mode 100644
index 000000000..7f0b6f297
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/ForestParser.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <istream>
+#include <string>
+#include <vector>
+#include <utility>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_set.hpp>
+
+#include "util/string_piece.hh"
+
+#include "Forest.h"
+#include "Symbol.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Iterator-style reader over forest entries taken from a std::istream.
+// It is advanced with operator++ and dereferenced with operator* / operator->
+// to reach the current Entry.  Comparison is by the held stream pointer;
+// ProcessForest() uses a default-constructed parser as the end marker.
+class ForestParser
+{
+public:
+ // One record produced by the parser: a sentence number, the sentence
+ // string and the forest belonging to it.
+ struct Entry {
+ std::size_t sentNum;
+ std::string sentence;
+ Forest forest;
+ };
+
+ // NOTE(review): definitions are not visible here; the default-constructed
+ // object is presumably the "end" sentinel -- confirm in ForestParser.cpp.
+ ForestParser();
+ ForestParser(std::istream &);
+
+ // Mutable access to the current entry.
+ Entry &operator*() {
+ return m_entry;
+ }
+ Entry *operator->() {
+ return &m_entry;
+ }
+
+ // Advances to the next entry.
+ ForestParser &operator++();
+
+ friend bool operator==(const ForestParser &, const ForestParser &);
+ friend bool operator!=(const ForestParser &, const ForestParser &);
+
+private:
+ typedef boost::shared_ptr<Forest::Vertex> VertexSP;
+ typedef boost::shared_ptr<Forest::Hyperedge> HyperedgeSP;
+
+ // Hashes a vertex by value (symbol, start, end), not by pointer.
+ struct VertexSetHash {
+ std::size_t operator()(const VertexSP &v) const {
+ std::size_t seed = 0;
+ boost::hash_combine(seed, v->symbol);
+ boost::hash_combine(seed, v->start);
+ boost::hash_combine(seed, v->end);
+ return seed;
+ }
+ };
+
+ // Vertices are equal when symbol, start and end all match; this uses the
+ // same three fields as VertexSetHash.
+ struct VertexSetPred {
+ bool operator()(const VertexSP &v, const VertexSP &w) const {
+ return v->symbol == w->symbol && v->start == w->start && v->end == w->end;
+ }
+ };
+
+ // Set of vertices keyed by (symbol, start, end).
+ typedef boost::unordered_set<VertexSP, VertexSetHash,
+ VertexSetPred> VertexSet;
+
+ // Copying is not allowed
+ ForestParser(const ForestParser &);
+ ForestParser &operator=(const ForestParser &);
+
+ // NOTE(review): presumably returns the canonical instance of the vertex
+ // from m_vertexSet -- the definition is outside this view.
+ VertexSP AddVertex(const VertexSP &);
+ void ParseHyperedgeLine(const std::string &, Forest &);
+ void ParseSentenceNumLine(const std::string &, std::size_t &);
+ // Parses one token of a hyperedge line into a new vertex.
+ VertexSP ParseVertex(const StringPiece &);
+
+ Entry m_entry; // entry exposed through operator* / operator->
+ std::istream *m_input; // compared by operator== / operator!=
+ std::string m_tmpLine;
+ VertexSet m_vertexSet;
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/ForestWriter.cpp b/phrase-extract/postprocess-egret-forests/ForestWriter.cpp
new file mode 100644
index 000000000..54a2cbed9
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/ForestWriter.cpp
@@ -0,0 +1,105 @@
+#include "ForestWriter.h"
+
+#include <cassert>
+#include <vector>
+
+#include "TopologicalSorter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Writes one forest entry: a "sentence N :" header line, the (possibly
+// escaped) sentence, one line per hyperedge, and a terminating blank line.
+// An empty forest (parse failure) is written as the two header lines
+// followed by two blank lines.
+void ForestWriter::Write(const std::string &sentence, const Forest &forest,
+                         std::size_t sentNum)
+{
+  typedef std::vector<const Forest::Vertex *> VertexList;
+  typedef std::vector<boost::shared_ptr<Forest::Hyperedge> > EdgeList;
+
+  m_out << "sentence " << sentNum << " :" << std::endl;
+  m_out << PossiblyEscape(sentence) << std::endl;
+
+  if (!forest.vertices.empty()) {
+    // Emit the incoming hyperedges of every vertex, visiting the vertices
+    // in the order produced by the topological sorter.
+    VertexList ordered;
+    TopologicalSorter sorter;
+    sorter.Sort(forest, ordered);
+    for (std::size_t i = 0; i < ordered.size(); ++i) {
+      const EdgeList &edges = ordered[i]->incoming;
+      for (EdgeList::const_iterator e = edges.begin(); e != edges.end(); ++e) {
+        WriteHyperedgeLine(**e);
+      }
+    }
+    // Terminating blank line.
+    m_out << std::endl;
+  } else {
+    // Parse failure: nothing to list.
+    m_out << std::endl << std::endl;
+  }
+}
+
+// Writes a single hyperedge as "HEAD => TAIL1 TAIL2 ... ||| WEIGHT" followed
+// by a newline.
+void ForestWriter::WriteHyperedgeLine(const Forest::Hyperedge &e)
+{
+  WriteVertex(*e.head);
+  m_out << " =>";
+  const std::vector<Forest::Vertex *> &children = e.tail;
+  for (std::size_t i = 0; i < children.size(); ++i) {
+    m_out << " ";
+    WriteVertex(*children[i]);
+  }
+  m_out << " ||| " << e.weight << std::endl;
+}
+
+// Writes a vertex symbol (possibly escaped).  The "[start,end]" span is
+// appended only for vertices that have at least one incoming hyperedge.
+void ForestWriter::WriteVertex(const Forest::Vertex &v)
+{
+  m_out << PossiblyEscape(v.symbol.value);
+  if (v.incoming.empty()) {
+    return;
+  }
+  m_out << "[" << v.start << "," << v.end << "]";
+}
+
+// Returns Escape(s) when the escape option is set, otherwise a copy of s.
+std::string ForestWriter::PossiblyEscape(const std::string &s) const
+{
+  return m_options.escape ? Escape(s) : s;
+}
+
+// Returns a copy of s in which each of the characters < > [ ] | & ' " is
+// replaced by its entity reference; all other characters are copied as-is.
+std::string ForestWriter::Escape(const std::string &s) const
+{
+  std::string escaped;
+  escaped.reserve(s.size());
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    switch (*c) {
+    case '<':
+      escaped += "&lt;";
+      break;
+    case '>':
+      escaped += "&gt;";
+      break;
+    case '[':
+      escaped += "&#91;";
+      break;
+    case ']':
+      escaped += "&#93;";
+      break;
+    case '|':
+      escaped += "&#124;";
+      break;
+    case '&':
+      escaped += "&amp;";
+      break;
+    case '\'':
+      escaped += "&apos;";
+      break;
+    case '"':
+      escaped += "&quot;";
+      break;
+    default:
+      escaped += *c;
+      break;
+    }
+  }
+  return escaped;
+}
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/ForestWriter.h b/phrase-extract/postprocess-egret-forests/ForestWriter.h
new file mode 100644
index 000000000..10c1fe05c
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/ForestWriter.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "Forest.h"
+#include "Options.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Serialises forests to an output stream, optionally escaping special
+// characters according to Options::escape.
+class ForestWriter
+{
+public:
+ // Both arguments are stored as references (see m_options / m_out), so
+ // they must outlive the writer.
+ ForestWriter(const Options &options, std::ostream &out)
+ : m_options(options), m_out(out) {}
+
+ // Arguments are: sentence, forest, sentence number.
+ void Write(const std::string &, const Forest &, std::size_t);
+
+private:
+ // Replaces special characters by entity references.
+ std::string Escape(const std::string &) const;
+ // Applies Escape() only when m_options.escape is set.
+ std::string PossiblyEscape(const std::string &) const;
+ void WriteHyperedgeLine(const Forest::Hyperedge &);
+ void WriteVertex(const Forest::Vertex &);
+
+ const Options &m_options;
+ std::ostream &m_out;
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/Jamfile b/phrase-extract/postprocess-egret-forests/Jamfile
new file mode 100644
index 000000000..7f9e0e715
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/Jamfile
@@ -0,0 +1 @@
+exe postprocess-egret-forests : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;
diff --git a/phrase-extract/postprocess-egret-forests/Main.cpp b/phrase-extract/postprocess-egret-forests/Main.cpp
new file mode 100644
index 000000000..fead94652
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/Main.cpp
@@ -0,0 +1,9 @@
+#include "PostprocessEgretForests.h"
+
+#include "syntax-common/exception.h"
+
+// Program entry point: hands the command line to the tool object and returns
+// whatever its Main() returns.
+int main(int argc, char *argv[])
+{
+  namespace tool_ns = MosesTraining::Syntax::PostprocessEgretForests;
+  tool_ns::PostprocessEgretForests tool;
+  const int status = tool.Main(argc, argv);
+  return status;
+}
diff --git a/phrase-extract/postprocess-egret-forests/Options.h b/phrase-extract/postprocess-egret-forests/Options.h
new file mode 100644
index 000000000..653f2b75d
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/Options.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <string>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Settings for the postprocess-egret-forests tool.
+struct Options {
+public:
+ // Escaping is off by default; splitPointsFile starts out empty.
+ Options() : escape(false) {}
+
+ // When true, ForestWriter escapes special characters in its output.
+ bool escape;
+ // Path of the split point file; empty means no split points are marked.
+ std::string splitPointsFile;
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp
new file mode 100644
index 000000000..4911d4913
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp
@@ -0,0 +1,151 @@
+#include "PostprocessEgretForests.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include "syntax-common/exception.h"
+
+#include "Forest.h"
+#include "ForestParser.h"
+#include "ForestWriter.h"
+#include "Options.h"
+#include "SplitPoint.h"
+#include "SplitPointFileParser.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Tool entry point: parses the options, optionally opens the split point
+// file, then processes forests from standard input to standard output.
+// A MosesTraining::Syntax::Exception is reported through Error().
+int PostprocessEgretForests::Main(int argc, char *argv[])
+{
+  try {
+    Options options;
+    ProcessOptions(argc, argv, options);
+
+    // The split point parser exists only when a file name was given.
+    boost::scoped_ptr<SplitPointFileParser> splitPointParser;
+    std::ifstream splitPointFileStream;
+    const bool haveSplitPoints = !options.splitPointsFile.empty();
+    if (haveSplitPoints) {
+      OpenInputFileOrDie(options.splitPointsFile, splitPointFileStream);
+      splitPointParser.reset(new SplitPointFileParser(splitPointFileStream));
+    }
+
+    SplitPointFileParser *const parser = splitPointParser.get();
+    ProcessForest(std::cin, std::cout, parser, options);
+  } catch (const MosesTraining::Syntax::Exception &e) {
+    Error(e.msg());
+  }
+  return 0;
+}
+
+// Reads every forest entry from `in` and writes it to `out`.  When a split
+// point parser is supplied, one split point record is consumed per entry and
+// applied to both the forest and the sentence (entries with an empty forest
+// consume a record but are left unmarked).
+//
+// Throws Exception if the split point file runs out before the forests do,
+// or if marking fails (the message is prefixed with the running sentence
+// count).
+void PostprocessEgretForests::ProcessForest(
+    std::istream &in, std::ostream &out, SplitPointFileParser *splitPointParser,
+    const Options &options)
+{
+  std::size_t sentNum = 0;
+  ForestWriter writer(options, out);
+  ForestParser forestEnd;
+  const SplitPointFileParser splitPointEnd;
+
+  ForestParser p(in);
+  while (p != forestEnd) {
+    ++sentNum;
+    if (splitPointParser) {
+      SplitPointFileParser &splits = *splitPointParser;
+      if (splits == splitPointEnd) {
+        throw Exception("prematurely reached end of split point file");
+      }
+      const bool parsed = !p->forest.vertices.empty();
+      if (parsed) {
+        try {
+          MarkSplitPoints(splits->splitPoints, p->forest);
+          MarkSplitPoints(splits->splitPoints, p->sentence);
+        } catch (const Exception &e) {
+          std::ostringstream msg;
+          msg << "failed to mark split point for sentence " << sentNum << ": "
+              << e.msg();
+          throw Exception(msg.str());
+        }
+      }
+      ++splits;
+    }
+    writer.Write(p->sentence, p->forest, p->sentNum);
+    ++p;
+  }
+}
+
+// Parses the command line into `options`.
+//
+// Recognised options:
+//   help             print the usage message to stdout and exit with status 0
+//   Escape           sets options.escape to true
+//   MarkSplitPoints  stores its argument in options.splitPointsFile
+//
+// A parse failure is reported through Error() together with the usage text.
+void PostprocessEgretForests::ProcessOptions(int argc, char *argv[],
+                                             Options &options) const
+{
+  namespace po = boost::program_options;
+
+  // Construct the 'top' of the usage message: the bit that comes before the
+  // options list.
+  std::ostringstream usageTop;
+  usageTop << "Usage: " << name()
+           << " [OPTION]...\n\n"
+           << "TODO\n\n"
+           << "Options";
+
+  // Construct the 'bottom' of the usage message.
+  std::ostringstream usageBottom;
+  usageBottom << "TODO";
+
+  // Declare the command line options that are visible to the user.
+  // "help" has to be declared here: an undeclared option is rejected by the
+  // parser as unknown, so the vm.count("help") check below could never fire.
+  po::options_description visible(usageTop.str());
+  visible.add_options()
+  ("help",
+   "print this help message and exit")
+  ("Escape",
+   "escape Moses special characters")
+  ("MarkSplitPoints",
+   po::value(&options.splitPointsFile),
+   "read split points from named file and mark (using @) in output")
+  ;
+
+  // Declare the command line options that are hidden from the user
+  // (these are used as positional options).
+  po::options_description hidden("Hidden options");
+  hidden.add_options()
+  // None
+  ;
+
+  // Compose the full set of command-line options.
+  po::options_description cmdLineOptions;
+  cmdLineOptions.add(visible).add(hidden);
+
+  // Register the positional options.
+  po::positional_options_description p;
+  // Currently none
+
+  // Process the command-line.
+  po::variables_map vm;
+  try {
+    po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
+              options(cmdLineOptions).positional(p).run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible << usageBottom.str();
+    Error(msg.str());
+  }
+
+  if (vm.count("help")) {
+    std::cout << visible << usageBottom.str() << std::endl;
+    std::exit(0);
+  }
+
+  // Process Boolean options.
+  if (vm.count("Escape")) {
+    options.escape = true;
+  }
+}
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h
new file mode 100644
index 000000000..51970084e
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+
+#include "syntax-common/tool.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+struct Options;
+class SplitPointFileParser;
+
+// The postprocess-egret-forests command-line tool, built on the Tool base
+// class.
+class PostprocessEgretForests : public Tool
+{
+public:
+ PostprocessEgretForests() : Tool("postprocess-egret-forests") {}
+
+ // Runs the tool with the given command line.
+ virtual int Main(int argc, char *argv[]);
+
+private:
+ // NOTE(review): no definition of OneBestTree is visible alongside the
+ // other members in PostprocessEgretForests.cpp -- confirm it is needed.
+ void OneBestTree(std::istream &, std::ostream &, SplitPointFileParser *,
+ const Options &);
+
+ // Reads forests from the input stream and writes them to the output
+ // stream, marking split points when a parser is supplied.
+ void ProcessForest(std::istream &, std::ostream &, SplitPointFileParser *,
+ const Options &);
+
+ // Fills Options from the command line.
+ void ProcessOptions(int, char *[], Options &) const;
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/SplitPoint.cpp b/phrase-extract/postprocess-egret-forests/SplitPoint.cpp
new file mode 100644
index 000000000..491ec4649
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/SplitPoint.cpp
@@ -0,0 +1,111 @@
+#include "SplitPoint.h"
+
+#include <map>
+#include <set>
+#include <sstream>
+
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "syntax-common/exception.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Marks split points in a whitespace-tokenised sentence.
+//
+// For each split point, connector.size() characters at offset charPos of
+// token number tokenPos are replaced by "@" + connector + "@".  The sentence
+// is then rebuilt with single spaces between tokens.  An empty split point
+// list leaves the sentence untouched.
+//
+// Throws Exception when a token or character position lies outside the
+// sentence.  Previously an out-of-range token position indexed past the end
+// of the token vector, and an out-of-range character position raised
+// std::out_of_range, which the caller's Exception handler does not catch.
+void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints,
+                     std::string &sentence)
+{
+  if (splitPoints.empty()) {
+    return;
+  }
+
+  // FIXME Assumes all split points have same connector
+  // Group character positions by token; the last connector seen wins.
+  std::string connector;
+  std::map<int, std::set<int> > points;
+  for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
+       p != splitPoints.end(); ++p) {
+    points[p->tokenPos].insert(p->charPos);
+    connector = p->connector;
+  }
+
+  // Split the sentence into a sequence of tokens.
+  std::vector<std::string> terminals;
+  const util::AnyCharacter delim(" \t");
+  for (util::TokenIter<util::AnyCharacter, true> p(sentence, delim); p; ++p) {
+    terminals.resize(terminals.size()+1);
+    p->CopyToString(&terminals.back());
+  }
+
+  // Mark the split points.
+  const std::string marked = std::string("@") + connector + std::string("@");
+  for (std::map<int, std::set<int> >::const_iterator p = points.begin();
+       p != points.end(); ++p) {
+    if (p->first < 0 ||
+        static_cast<std::size_t>(p->first) >= terminals.size()) {
+      std::ostringstream msg;
+      msg << "token position " << p->first << " is out of range";
+      throw Exception(msg.str());
+    }
+    std::string &word = terminals[p->first];
+    // Each replacement lengthens the word by two characters, so positions
+    // further along the same word are shifted by that amount.
+    int offset = 0;
+    for (std::set<int>::const_iterator q = p->second.begin();
+         q != p->second.end(); ++q) {
+      const int pos = *q + offset;
+      if (pos < 0 || static_cast<std::size_t>(pos) > word.size()) {
+        std::ostringstream msg;
+        msg << "character position " << *q << " is out of range for token "
+            << p->first;
+        throw Exception(msg.str());
+      }
+      word.replace(pos, connector.size(), marked);
+      offset += 2;
+    }
+  }
+
+  // Rebuild the sentence from the (modified) tokens.
+  sentence.clear();
+  for (std::size_t i = 0; i < terminals.size(); ++i) {
+    if (i > 0) {
+      sentence += " ";
+    }
+    sentence += terminals[i];
+  }
+}
+
+// Marks split points in the terminal vertices of a forest.
+//
+// Terminals are the vertices without incoming hyperedges; they are indexed
+// by their start position.  For each split point, connector.size()
+// characters at offset charPos of the terminal at tokenPos are replaced by
+// "@" + connector + "@".  An empty split point list leaves the forest
+// untouched.
+//
+// Throws Exception when a terminal has a negative start position, when no
+// terminal exists at a requested token position, or when a character
+// position lies outside the terminal's symbol.  Previously these cases
+// indexed out of bounds, dereferenced a null pointer, or raised
+// std::out_of_range, which the caller's Exception handler does not catch.
+void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints, Forest &forest)
+{
+  if (splitPoints.empty()) {
+    return;
+  }
+
+  // FIXME Assumes all split points have same connector
+  // Group character positions by token; the last connector seen wins.
+  std::string connector;
+  std::map<int, std::set<int> > points;
+  for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
+       p != splitPoints.end(); ++p) {
+    points[p->tokenPos].insert(p->charPos);
+    connector = p->connector;
+  }
+
+  // Get the terminal vertices in sentence order.
+  std::vector<Forest::Vertex *> terminals;
+  for (std::vector<boost::shared_ptr<Forest::Vertex> >::const_iterator
+       p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
+    if (!(*p)->incoming.empty()) {
+      continue;
+    }
+    const int pos = (*p)->start;
+    if (pos < 0) {
+      // A negative value would wrap in the unsigned comparison below and
+      // shrink the vector to zero elements.
+      std::ostringstream msg;
+      msg << "terminal vertex has invalid start position " << pos;
+      throw Exception(msg.str());
+    }
+    if (static_cast<std::size_t>(pos) >= terminals.size()) {
+      terminals.resize(pos+1);  // new slots are null until filled
+    }
+    terminals[pos] = p->get();
+  }
+
+  // Mark the split points.
+  const std::string marked = std::string("@") + connector + std::string("@");
+  for (std::map<int, std::set<int> >::const_iterator p = points.begin();
+       p != points.end(); ++p) {
+    if (p->first < 0 ||
+        static_cast<std::size_t>(p->first) >= terminals.size() ||
+        !terminals[p->first]) {
+      std::ostringstream msg;
+      msg << "no terminal at token position " << p->first;
+      throw Exception(msg.str());
+    }
+    std::string &word = terminals[p->first]->symbol.value;
+    // Each replacement lengthens the word by two characters, so positions
+    // further along the same word are shifted by that amount.
+    int offset = 0;
+    for (std::set<int>::const_iterator q = p->second.begin();
+         q != p->second.end(); ++q) {
+      const int pos = *q + offset;
+      if (pos < 0 || static_cast<std::size_t>(pos) > word.size()) {
+        std::ostringstream msg;
+        msg << "character position " << *q << " is out of range for token "
+            << p->first;
+        throw Exception(msg.str());
+      }
+      word.replace(pos, connector.size(), marked);
+      offset += 2;
+    }
+  }
+
+}
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/SplitPoint.h b/phrase-extract/postprocess-egret-forests/SplitPoint.h
new file mode 100644
index 000000000..d744b127f
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/SplitPoint.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include "Forest.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// One position at which a token is to be marked as splittable.
+struct SplitPoint {
+ // Index of the token the split point belongs to.
+ int tokenPos;
+ // Offset within that token at which the connector is replaced.
+ int charPos;
+ // Connector text; MarkSplitPoints wraps it in '@' characters.
+ std::string connector;
+};
+
+// Marks the given split points in the symbols of the forest's terminal
+// vertices.
+void MarkSplitPoints(const std::vector<SplitPoint> &, Forest &);
+
+// Marks the given split points in the tokens of a sentence string.
+void MarkSplitPoints(const std::vector<SplitPoint> &, std::string &);
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/SplitPointFileParser.cpp b/phrase-extract/postprocess-egret-forests/SplitPointFileParser.cpp
new file mode 100644
index 000000000..4bf3c4792
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/SplitPointFileParser.cpp
@@ -0,0 +1,86 @@
+#include "SplitPointFileParser.h"
+
+#include <istream>
+#include <string>
+
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "syntax-common/exception.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Creates a parser without an input stream; operator++ is a no-op on it and
+// it compares equal to a parser that has reached the end of its input.
+SplitPointFileParser::SplitPointFileParser() : m_input(0) {}
+
+// Attaches the parser to `input` and immediately reads the first entry.
+SplitPointFileParser::SplitPointFileParser(std::istream &input)
+  : m_input(&input)
+{
+  operator++();
+}
+
+// Advances to the next line of the input.  The current entry is cleared
+// first; when no further line can be read the stream pointer is reset to
+// null.  Does nothing on a parser that has no stream.
+SplitPointFileParser &SplitPointFileParser::operator++()
+{
+  if (m_input) {
+    m_entry.splitPoints.clear();
+    if (std::getline(*m_input, m_tmpLine)) {
+      ParseLine(m_tmpLine, m_entry.splitPoints);
+    } else {
+      m_input = 0;
+    }
+  }
+  return *this;
+}
+
+// Parses one line of the split point file and appends the result to
+// `splitPoints`.
+//
+// The line is a whitespace-separated list of tokens, each of the form
+// tokenPos,charPos,connector.
+//
+// Throws Exception when a token does not contain the two separating commas
+// (previously this went undetected and produced meaningless values or a
+// misleading connector error), or when the connector is longer than one
+// character.
+void SplitPointFileParser::ParseLine(const std::string &line,
+                                     std::vector<SplitPoint> &splitPoints)
+{
+  std::string tmp;
+  const util::AnyCharacter delimiter(" \t");
+  for (util::TokenIter<util::AnyCharacter, true> p(line, delimiter); p; ++p) {
+    // Locate both separators before touching the output vector.
+    const std::size_t firstComma = p->find(',');
+    const std::size_t secondComma =
+      firstComma == std::string::npos ? firstComma
+                                      : p->find(',', firstComma+1);
+    if (secondComma == std::string::npos) {
+      p->CopyToString(&tmp);
+      throw Exception("malformed split point '" + tmp +
+                      "': expected tokenPos,charPos,connector");
+    }
+
+    splitPoints.resize(splitPoints.size()+1);
+    SplitPoint &splitPoint = splitPoints.back();
+
+    p->substr(0, firstComma).CopyToString(&tmp);
+    splitPoint.tokenPos = std::atoi(tmp.c_str());
+
+    p->substr(firstComma+1, secondComma-firstComma-1).CopyToString(&tmp);
+    splitPoint.charPos = std::atoi(tmp.c_str());
+
+    // Everything after the second comma is the connector.
+    p->substr(secondComma+1).CopyToString(&splitPoint.connector);
+    if (splitPoint.connector.size() > 1) {
+      throw Exception("multi-character connectors not currently supported");
+    }
+  }
+}
+
+// Two parsers compare equal when they hold the same input stream pointer;
+// a parser that has run out of input holds null, like a default-constructed
+// one.
+bool operator==(const SplitPointFileParser &lhs,
+                const SplitPointFileParser &rhs)
+{
+  // TODO Is this right? Compare values of istreams if non-zero?
+  const std::istream *const left = lhs.m_input;
+  const std::istream *const right = rhs.m_input;
+  return left == right;
+}
+
+// Negation of operator== for SplitPointFileParser.
+bool operator!=(const SplitPointFileParser &lhs,
+                const SplitPointFileParser &rhs)
+{
+  if (lhs == rhs) {
+    return false;
+  }
+  return true;
+}
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/SplitPointFileParser.h b/phrase-extract/postprocess-egret-forests/SplitPointFileParser.h
new file mode 100644
index 000000000..35fdb3ad2
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/SplitPointFileParser.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <istream>
+#include <string>
+#include <vector>
+
+#include "SplitPoint.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Iterator-style reader over a split point file: each input line yields one
+// Entry holding the split points listed on that line.
+class SplitPointFileParser
+{
+public:
+ // The split points parsed from a single line.
+ struct Entry {
+ std::vector<SplitPoint> splitPoints;
+ };
+
+ // The default constructor creates a parser with no input stream, which is
+ // what a parser becomes once its input is exhausted.
+ SplitPointFileParser();
+ // Attaches to the stream and reads the first entry.
+ SplitPointFileParser(std::istream &);
+
+ // Read-only access to the current entry.
+ const Entry &operator*() const {
+ return m_entry;
+ }
+ const Entry *operator->() const {
+ return &m_entry;
+ }
+
+ // Advances to the next line of the input.
+ SplitPointFileParser &operator++();
+
+ // Comparison is by the held stream pointer.
+ friend bool operator==(const SplitPointFileParser &,
+ const SplitPointFileParser &);
+
+ friend bool operator!=(const SplitPointFileParser &,
+ const SplitPointFileParser &);
+
+private:
+ // Parses one line and appends its split points to the vector.
+ void ParseLine(const std::string &, std::vector<SplitPoint> &);
+
+ Entry m_entry; // entry exposed through operator* / operator->
+ std::istream *m_input; // null when there is no (more) input
+ std::string m_tmpLine; // buffer for the line being read
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/Symbol.h b/phrase-extract/postprocess-egret-forests/Symbol.h
new file mode 100644
index 000000000..1b8929f49
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/Symbol.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <string>
+
+#include <boost/functional/hash.hpp>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// A grammar symbol: its text plus a flag telling terminals and
+// non-terminals apart.
+struct Symbol {
+ // The symbol's text.
+ std::string value;
+ // True for non-terminals, false for terminals.
+ bool isNonTerminal;
+};
+
+// Hash of a Symbol, combining both of its fields.  The name and signature
+// let boost::hash / boost::hash_combine pick it up for Symbol values.
+inline std::size_t hash_value(const Symbol &s)
+{
+  std::size_t combined = 0;
+  boost::hash_combine(combined, s.value);
+  boost::hash_combine(combined, s.isNonTerminal);
+  return combined;
+}
+
+// Symbols are equal when both the text and the terminal/non-terminal flag
+// match.
+inline bool operator==(const Symbol &s, const Symbol &t)
+{
+  if (s.isNonTerminal != t.isNonTerminal) {
+    return false;
+  }
+  return s.value == t.value;
+}
+
+// Function object that forwards to hash_value(const Symbol &).
+struct SymbolHasher {
+  std::size_t operator()(const Symbol &s) const {
+    const std::size_t hash = hash_value(s);
+    return hash;
+  }
+};
+
+// Function object that reports whether two Symbols have the same text and
+// the same terminal/non-terminal flag.
+struct SymbolEqualityPred {
+  bool operator()(const Symbol &s, const Symbol &t) const {
+    if (s.value != t.value) {
+      return false;
+    }
+    return s.isNonTerminal == t.isNonTerminal;
+  }
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/TopologicalSorter.cpp b/phrase-extract/postprocess-egret-forests/TopologicalSorter.cpp
new file mode 100644
index 000000000..029169af5
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/TopologicalSorter.cpp
@@ -0,0 +1,56 @@
+#include "TopologicalSorter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Fills `permutation` with all vertices of the forest, ordered so that every
+// vertex comes after the tail vertices of its incoming hyperedges.  Any
+// previous contents of `permutation` are discarded.
+void TopologicalSorter::Sort(const Forest &forest,
+                             std::vector<const Forest::Vertex *> &permutation)
+{
+  permutation.clear();
+  BuildPredSets(forest);
+  m_visited.clear();
+
+  // Start a depth-first visit from each vertex not yet reached.
+  for (std::size_t i = 0; i < forest.vertices.size(); ++i) {
+    const Forest::Vertex *vertex = forest.vertices[i].get();
+    if (m_visited.count(vertex) != 0) {
+      continue;
+    }
+    Visit(*vertex, permutation);
+  }
+}
+
+// Rebuilds m_predSets: for every vertex, the set of vertices that occur in
+// the tail of any of its incoming hyperedges.
+void TopologicalSorter::BuildPredSets(const Forest &forest)
+{
+  typedef std::vector<boost::shared_ptr<Forest::Hyperedge> > EdgeList;
+  typedef std::vector<Forest::Vertex *> TailList;
+
+  m_predSets.clear();
+  for (std::size_t i = 0; i < forest.vertices.size(); ++i) {
+    const Forest::Vertex *head = forest.vertices[i].get();
+    const EdgeList &edges = head->incoming;
+    for (std::size_t j = 0; j < edges.size(); ++j) {
+      const TailList &tail = edges[j]->tail;
+      for (std::size_t k = 0; k < tail.size(); ++k) {
+        m_predSets[head].insert(tail[k]);
+      }
+    }
+  }
+}
+
+// Depth-first step of the sort: marks v as visited, recursively visits its
+// unvisited predecessors, then appends v to `permutation`.
+void TopologicalSorter::Visit(const Forest::Vertex &v,
+                              std::vector<const Forest::Vertex *> &permutation)
+{
+  m_visited.insert(&v);
+
+  const VertexSet &preds = m_predSets[&v];
+  VertexSet::const_iterator it = preds.begin();
+  while (it != preds.end()) {
+    if (m_visited.count(*it) == 0) {
+      Visit(**it, permutation);
+    }
+    ++it;
+  }
+
+  // All predecessors have been emitted; v can follow.
+  permutation.push_back(&v);
+}
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/postprocess-egret-forests/TopologicalSorter.h b/phrase-extract/postprocess-egret-forests/TopologicalSorter.h
new file mode 100644
index 000000000..7ed667369
--- /dev/null
+++ b/phrase-extract/postprocess-egret-forests/TopologicalSorter.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+
+#include "Forest.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace PostprocessEgretForests
+{
+
+// Orders the vertices of a Forest so that each vertex follows the tail
+// vertices of its incoming hyperedges.
+class TopologicalSorter
+{
+public:
+ // Writes the sorted vertices of the forest into the vector (which is
+ // cleared first).
+ void Sort(const Forest &, std::vector<const Forest::Vertex *> &);
+
+private:
+ typedef boost::unordered_set<const Forest::Vertex *> VertexSet;
+
+ // Fills m_predSets from the forest's hyperedges.
+ void BuildPredSets(const Forest &);
+ // Recursive depth-first visit used by Sort().
+ void Visit(const Forest::Vertex &, std::vector<const Forest::Vertex *> &);
+
+ // Vertices already reached during the current Sort() call.
+ boost::unordered_set<const Forest::Vertex *> m_visited;
+ // Maps a vertex to the tail vertices of its incoming hyperedges.
+ boost::unordered_map<const Forest::Vertex *, VertexSet> m_predSets;
+};
+
+} // namespace PostprocessEgretForests
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index a6d50cef5..f7a2a271b 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -21,6 +21,7 @@
#include "relax-parse.h"
#include "tables-core.h"
+#include "util/tokenize.hh"
using namespace std;
using namespace MosesTraining;
@@ -42,14 +43,14 @@ int main(int argc, char* argv[])
// process into syntax tree representation
set< string > labelCollection; // set of labels, not used
map< string, int > topLabelCollection; // count of top labels, not used
- SyntaxTree tree;
+ SyntaxNodeCollection tree;
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
- vector< string > inWords = tokenize( inBufferString.c_str() );
+ const vector< string > inWords = util::tokenize( inBufferString );
// output tree
// cerr << "BEFORE:" << endl << tree;
- ParentNodes parents = tree.Parse();
+ ParentNodes parents = determineSplitPoints(tree);
// execute selected grammar relaxation schemes
if (leftBinarizeFlag)
@@ -104,7 +105,7 @@ void init(int argc, char* argv[])
}
}
-void store( SyntaxTree &tree, vector< string > &words )
+void store( SyntaxNodeCollection &tree, const vector< string > &words )
{
// output words
for( size_t i=0; i<words.size(); i++ ) {
@@ -117,22 +118,22 @@ void store( SyntaxTree &tree, vector< string > &words )
// output tree nodes
vector< SyntaxNode* > nodes = tree.GetAllNodes();
for( size_t i=0; i<nodes.size(); i++ ) {
- cout << " <tree span=\"" << nodes[i]->GetStart()
- << "-" << nodes[i]->GetEnd()
- << "\" label=\"" << nodes[i]->GetLabel()
+ cout << " <tree span=\"" << nodes[i]->start
+ << "-" << nodes[i]->end
+ << "\" label=\"" << nodes[i]->label
<< "\"/>";
}
cout << endl;
}
-void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
+void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
{
for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
const SplitPoints &point = *p;
if (point.size() > 3) {
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], point[point.size()-1]-1);
- string topLabel = topNodes[0]->GetLabel();
+ string topLabel = topNodes[0]->label;
for(size_t i=2; i<point.size()-1; i++) {
// cerr << "LeftBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
@@ -142,7 +143,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
}
}
-void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
+void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
{
for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
const SplitPoints &point = *p;
@@ -150,7 +151,7 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
int endPoint = point[point.size()-1]-1;
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], endPoint);
- string topLabel = topNodes[0]->GetLabel();
+ string topLabel = topNodes[0]->label;
for(size_t i=1; i<point.size()-2; i++) {
// cerr << "RightBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
@@ -160,11 +161,11 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
}
}
-void SAMT( SyntaxTree &tree, ParentNodes &parents )
+void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
{
int numWords = tree.GetNumWords();
- SyntaxTree newTree; // to store new nodes
+ SyntaxNodeCollection newTree; // to store new nodes
// look through parents to combine children
for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
@@ -177,29 +178,29 @@ void SAMT( SyntaxTree &tree, ParentNodes &parents )
// cerr << endl;
for(size_t i = 0; i+2 < point.size(); i++) {
- // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;
+ // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl;
newTree.AddNode( point[i],point[i+2]-1,
- tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel()
+ tree.GetNodes(point[i ],point[i+1]-1)[0]->label
+ "+" +
- tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
+ tree.GetNodes(point[i+1],point[i+2]-1)[0]->label);
}
}
if (point.size() >= 4) {
int ps = point.size();
- string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();
+ string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label;
- // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
+ // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl;
newTree.AddNode( point[1],point[ps-1]-1,
topLabel
+ "\\" +
- tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );
+ tree.GetNodes(point[0],point[1]-1)[0]->label );
- // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
+ // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl;
newTree.AddNode( point[0],point[ps-2]-1,
topLabel
+ "/" +
- tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
+ tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label );
}
}
@@ -218,12 +219,12 @@ void SAMT( SyntaxTree &tree, ParentNodes &parents )
for(int mid=start+1; mid<=end && !done; mid++) {
if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
- // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl;
+ // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl;
newTree.AddNode( start, end,
- tree.GetNodes(start,mid-1)[0]->GetLabel()
+ tree.GetNodes(start,mid-1)[0]->label
+ "++" +
- tree.GetNodes(mid, end )[0]->GetLabel() );
+ tree.GetNodes(mid, end )[0]->label );
done = true;
}
}
@@ -233,9 +234,9 @@ void SAMT( SyntaxTree &tree, ParentNodes &parents )
for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
newTree.AddNode( start, end,
- tree.GetNodes(start,postEnd)[0]->GetLabel()
+ tree.GetNodes(start,postEnd)[0]->label
+ "//" +
- tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
+ tree.GetNodes(end+1,postEnd)[0]->label );
done = true;
}
}
@@ -244,11 +245,11 @@ void SAMT( SyntaxTree &tree, ParentNodes &parents )
// if matching a constituent A left-minus constituent B: use A\\B
for(int preStart=start-1; preStart>=0; preStart--) {
if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
- // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
+ // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->label << endl;
newTree.AddNode( start, end,
- tree.GetNodes(preStart,end )[0]->GetLabel()
+ tree.GetNodes(preStart,end )[0]->label
+ "\\\\" +
- tree.GetNodes(preStart,start-1)[0]->GetLabel() );
+ tree.GetNodes(preStart,start-1)[0]->label );
done = true;
}
}
@@ -267,6 +268,48 @@ void SAMT( SyntaxTree &tree, ParentNodes &parents )
// adding all new nodes
vector< SyntaxNode* > nodes = newTree.GetAllNodes();
for( size_t i=0; i<nodes.size(); i++ ) {
- tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
+ tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
}
}
+
+// Computes, for every span of two or more words that has a node in nodeColl,
+// the positions at which that (parent) span is split into child spans.
+//
+// Each element of the result describes one parent span: its start position,
+// followed by the position just past each child found by the greedy search
+// below. The search always takes the longest child that starts where the
+// previous child ended; the first child is not allowed to be the parent span
+// itself. If no further child is found the search for that parent stops, so
+// the recorded points need not reach the end of the span.
+ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
+{
+  ParentNodes parents;
+
+  // The loop counters and the positions handed to HasNode() are ints, so the
+  // word count is converted once instead of mixing signed and unsigned values
+  // in every comparison.
+  const int numWords = static_cast<int>(nodeColl.GetNumWords());
+
+  // looping through all spans of size >= 2
+  for( int length=2; length<=numWords; length++ ) {
+    for( int startPos = 0; startPos <= numWords-length; startPos++ ) {
+      if (!nodeColl.HasNode( startPos, startPos+length-1 )) {
+        continue; // no node over this span, so it is not a parent
+      }
+
+      // processing one (parent) span
+      SplitPoints splitPoints;
+      splitPoints.push_back( startPos );
+
+      bool firstChild = true;     // first child must be shorter than the parent
+      int covered = 0;            // words of the parent span covered so far
+      bool foundSomething = true; // break loop if nothing found
+      while( covered < length && foundSomething ) {
+        // find largest covering subspan (child)
+        // starting at last covered position
+        foundSomething = false;
+        for( int midPos = length - (firstChild ? 1 : 0); midPos>covered; midPos-- ) {
+          if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) {
+            covered = midPos;
+            splitPoints.push_back( startPos+covered );
+            firstChild = false;
+            foundSomething = true;
+            // covered now equals midPos, so no shorter candidate remains;
+            // continue with the next child from the outer while loop.
+            break;
+          }
+        }
+      }
+      parents.push_back( splitPoints );
+    }
+  }
+  return parents;
+}
diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h
index ec604405e..7c412646a 100644
--- a/phrase-extract/relax-parse.h
+++ b/phrase-extract/relax-parse.h
@@ -28,7 +28,7 @@
#include <algorithm>
#include <cstring>
-#include "SyntaxTree.h"
+#include "SyntaxNodeCollection.h"
#include "XmlTree.h"
#define LINE_MAX_LENGTH 1000000
@@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
bool rightBinarizeFlag = false;
char SAMTLevel = 0;
+typedef std::vector< int > SplitPoints;
+typedef std::vector< SplitPoints > ParentNodes;
+
// functions
void init(int argc, char* argv[]);
-void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
-void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
-void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
-void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
+ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
+void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
+void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 4849bfce3..cf28f90b9 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -18,10 +18,6 @@
***********************************************************************/
#include <sstream>
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include <stdlib.h>
#include <assert.h>
#include <cstring>
#include <map>
@@ -38,7 +34,8 @@
#include "InputFileStream.h"
#include "OutputFileStream.h"
-using namespace std;
+#include "moses/Util.h"
+
using namespace boost::algorithm;
using namespace MosesTraining;
@@ -67,7 +64,9 @@ bool unalignedFlag = false;
bool unalignedFWFlag = false;
bool crossedNonTerm = false;
bool spanLength = false;
+bool ruleLength = false;
bool nonTermContext = false;
+bool nonTermContextTarget = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
@@ -96,7 +95,6 @@ Vocabulary vcbS;
} // namespace
-std::vector<std::string> tokenize( const char [] );
void processLine( std::string line,
int lineID, bool includeSentenceIdFlag, int &sentenceId,
@@ -109,19 +107,20 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
const std::string &fileNameLeftHandSideSourceLabelCounts,
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
-void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
+void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
-set<std::string> functionWordList;
+std::set<std::string> functionWordList;
void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
-void loadFunctionWords( const string &fileNameFunctionWords );
+void loadFunctionWords( const std::string &fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
-void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
-void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
+void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
+void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
+size_t NumNonTerminal(const PHRASE *phraseSource);
int main(int argc, char* argv[])
@@ -131,7 +130,15 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
- std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
+ std::cerr <<
+ "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] "
+ "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] "
+ "[--NoWordAlignment] [--UnalignedPenalty] "
+ "[--UnalignedFunctionWordPenalty function-word-file] "
+ "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] "
+ "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] "
+ "[--TargetPreferenceLabels] [--UnpairedExtractFormat] "
+ "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}
@@ -148,7 +155,8 @@ int main(int argc, char* argv[])
std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
std::string fileNamePhraseOrientationPriors;
- std::vector<std::string> featureArgs; // all unknown args passed to feature manager
+ // All unknown args are passed to feature manager.
+ std::vector<std::string> featureArgs;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -228,7 +236,7 @@ int main(int argc, char* argv[])
negLogProb = -1;
std::cerr << "using negative log-probabilities" << std::endl;
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
- minCountHierarchical = atof(argv[++i]);
+ minCountHierarchical = Moses::Scan<float>( argv[++i] );
std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
@@ -245,9 +253,15 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--SpanLength") == 0) {
spanLength = true;
std::cerr << "span length feature" << std::endl;
+ } else if (strcmp(argv[i],"--RuleLength") == 0) {
+ ruleLength = true;
+ std::cerr << "rule length feature" << std::endl;
} else if (strcmp(argv[i],"--NonTermContext") == 0) {
nonTermContext = true;
std::cerr << "non-term context" << std::endl;
+ } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
+ nonTermContextTarget = true;
+ std::cerr << "non-term context (target)" << std::endl;
} else {
featureArgs.push_back(argv[i]);
++i;
@@ -291,10 +305,9 @@ int main(int argc, char* argv[])
std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
exit(1);
}
- istream &extractFileP = extractFile;
// output file: phrase translation table
- ostream *phraseTableFile;
+ std::ostream *phraseTableFile;
if (fileNamePhraseTable == "-") {
phraseTableFile = &std::cout;
@@ -310,8 +323,7 @@ int main(int argc, char* argv[])
}
// loop through all extracted phrase translations
- string line, lastLine;
- lastLine[0] = '\0';
+ std::string line, lastLine;
ExtractionPhrasePair *phrasePair = NULL;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
@@ -323,8 +335,7 @@ int main(int argc, char* argv[])
float tmpCount=0.0f, tmpPcfgSum=0.0f;
int i=0;
- // TODO why read only the 1st line?
- if ( getline(extractFileP, line) ) {
+ if ( getline(extractFile, line) ) {
++i;
tmpPhraseSource = new PHRASE();
tmpPhraseTarget = new PHRASE();
@@ -346,7 +357,7 @@ int main(int argc, char* argv[])
lastLine = line;
}
- while ( getline(extractFileP, line) ) {
+ while ( getline(extractFile, line) ) {
if ( ++i % 100000 == 0 ) {
std::cerr << "." << std::flush;
@@ -503,7 +514,8 @@ void processLine( std::string line,
phraseTarget->clear();
targetToSourceAlignment->clear();
- std::vector<std::string> token = tokenize( line.c_str() );
+ std::vector<std::string> token;
+ Moses::Tokenize( token, line );
int item = 1;
for ( size_t j=0; j<token.size(); ++j ) {
if (token[j] == "|||") {
@@ -534,7 +546,7 @@ void processLine( std::string line,
} else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
sscanf(token[j].c_str(), "%f", &count);
} else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
- float pcfgScore = std::atof(token[j].c_str());
+ float pcfgScore = Moses::Scan<float>( token[j] );
pcfgSum = pcfgScore * count;
}
}
@@ -548,17 +560,17 @@ void processLine( std::string line,
count = 1.0;
}
if (item < 3 || item > (includeSentenceIdFlag?7:6)) {
- std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
+ std::cerr << "ERROR: faulty line " << lineID << ": " << line << std::endl;
}
}
-void writeCountOfCounts( const string &fileNameCountOfCounts )
+void writeCountOfCounts( const std::string &fileNameCountOfCounts )
{
// open file
Moses::OutputFileStream countOfCountsFile;
- bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
+ bool success = countOfCountsFile.Open(fileNameCountOfCounts);
if (!success) {
std::cerr << "ERROR: could not open count-of-counts file "
<< fileNameCountOfCounts << std::endl;
@@ -583,7 +595,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
{
// open file
Moses::OutputFileStream leftHandSideSourceLabelCounts;
- bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts.c_str());
+ bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts);
if (!success) {
std::cerr << "ERROR: could not open left-hand side label counts file "
<< fileNameLeftHandSideSourceLabelCounts << std::endl;
@@ -600,7 +612,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
// open file
Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
- success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts.c_str());
+ success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts);
if (!success) {
std::cerr << "ERROR: could not open left-hand side label joint counts file "
<< fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
@@ -624,7 +636,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
{
// open file
Moses::OutputFileStream out;
- bool success = out.Open(fileName.c_str());
+ bool success = out.Open(fileName);
if (!success) {
std::cerr << "ERROR: could not open file "
<< fileName << " for writing" << std::endl;
@@ -640,7 +652,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
}
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
if (phrasePairsWithSameSource.size() == 0) {
@@ -668,7 +680,7 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
float totalCount, int distinctCount,
- ostream &phraseTableFile,
+ std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager,
const MaybeLog& maybeLogProb )
{
@@ -677,13 +689,14 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
float count = phrasePair.GetCount();
- map< string, float > domainCount;
+ std::map< std::string, float > domainCount;
// collect count of count statistics
if (goodTuringFlag || kneserNeyFlag) {
totalDistinct++;
int countInt = count + 0.99999;
- if (countInt <= COC_MAX)
+ if ((countInt <= COC_MAX) &&
+ (countInt > 0))
countOfCounts[ countInt ]++;
}
@@ -796,13 +809,13 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
// extra features
ScoreFeatureContext context(phrasePair, maybeLogProb);
std::vector<float> extraDense;
- map<string,float> extraSparse;
+ std::map<std::string,float> extraSparse;
featureManager.addFeatures(context, extraDense, extraSparse);
for (size_t i = 0; i < extraDense.size(); ++i) {
phraseTableFile << " " << extraDense[i];
}
- for (map<string,float>::const_iterator i = extraSparse.begin();
+ for (std::map<std::string,float>::const_iterator i = extraSparse.begin();
i != extraSparse.end(); ++i) {
phraseTableFile << " " << i->first << " " << i->second;
}
@@ -882,23 +895,47 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
if (spanLength && !inverseFlag) {
- string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
+ std::string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
if (!propValue.empty()) {
phraseTableFile << " {{SpanLength " << propValue << "}}";
}
}
- if (nonTermContext && !inverseFlag) {
- string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
+ if (ruleLength && !inverseFlag) {
+ std::string propValue = phrasePair.CollectAllPropertyValues("RuleLength");
if (!propValue.empty()) {
- phraseTableFile << " {{NonTermContext " << propValue << "}}";
+ phraseTableFile << " {{RuleLength " << propValue << "}}";
+ }
+ }
+
+ if (nonTermContext && !inverseFlag) {
+ std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
+ if (!propValue.empty() && propValue.size() < 50000) {
+ size_t nNTs = NumNonTerminal(phraseSource);
+ phraseTableFile << " {{NonTermContext " << nNTs << " " << propValue << "}}";
+ }
+ }
+
+ if (nonTermContextTarget && !inverseFlag) {
+ std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContextTarget");
+ if (!propValue.empty() && propValue.size() < 50000) {
+ size_t nNTs = NumNonTerminal(phraseSource);
+ phraseTableFile << " {{NonTermContextTarget " << nNTs << " " << propValue << "}}";
}
}
phraseTableFile << std::endl;
}
-
+// Counts the non-terminal symbols of a source phrase.
+//
+// The last entry of phraseSource is never inspected (presumably it is the
+// rule's left-hand-side label, mirroring the "except root" loop in
+// printTargetPhrase -- confirm). Every other word id is mapped back to its
+// string through the source vocabulary vcbS and tested with isNonTerminal().
+//
+// phraseSource must not be null. An empty phrase yields 0.
+size_t NumNonTerminal(const PHRASE *phraseSource)
+{
+  size_t nNTs = 0;
+  // "j+1 < size()" rather than "j < size()-1": size() is unsigned, so for an
+  // empty phrase size()-1 would wrap around to a huge value and the loop
+  // would index past the end of the phrase.
+  for(size_t j=0; j+1<phraseSource->size(); ++j) {
+    if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
+      ++nNTs;
+  }
+  return nNTs;
+}
void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
std::vector<float> &orientationClassPriorsL2R,
@@ -907,8 +944,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
- ifstream inFile;
- inFile.open(fileNamePhraseOrientationPriors.c_str());
+ Moses::InputFileStream inFile(fileNamePhraseOrientationPriors);
if (inFile.fail()) {
std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
@@ -919,7 +955,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
float l2rSum = 0;
float r2lSum = 0;
while (getline(inFile, line)) {
- istringstream tokenizer(line);
+ std::istringstream tokenizer(line);
std::string key;
tokenizer >> key;
@@ -983,7 +1019,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
}
std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
- inFile.close();
+ inFile.Close();
}
@@ -1038,7 +1074,7 @@ double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
- const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
+ const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty()) {
unaligned *= 2.718;
}
@@ -1053,7 +1089,7 @@ double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *a
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
- const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
+ const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
unaligned *= 2.718;
}
@@ -1061,26 +1097,25 @@ double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *a
return unaligned;
}
-void loadFunctionWords( const string &fileName )
+void loadFunctionWords( const std::string &fileName )
{
std::cerr << "Loading function word list from " << fileName;
- ifstream inFile;
- inFile.open(fileName.c_str());
+ Moses::InputFileStream inFile(fileName);
if (inFile.fail()) {
std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
}
- istream *inFileP = &inFile;
- string line;
- while(getline(*inFileP, line)) {
- std::vector<string> token = tokenize( line.c_str() );
+ std::string line;
+ while(getline(inFile, line)) {
+ std::vector<std::string> token;
+ Moses::Tokenize( token, line );
if (token.size() > 0)
functionWordList.insert( token[0] );
}
std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
- inFile.close();
+ inFile.Close();
}
@@ -1091,14 +1126,14 @@ double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phra
int null = vcbS.getWordID("NULL");
// all target words have to be explained
for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
- const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
+ const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
} else {
// go through all the aligned words to compute average
double thisWordScore = 0;
- for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
+ for (std::set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
}
lexScore *= thisWordScore / (double)srcIndices.size();
@@ -1108,24 +1143,23 @@ double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phra
}
-void LexicalTable::load( const string &fileName )
+void LexicalTable::load( const std::string &fileName )
{
std::cerr << "Loading lexical translation table from " << fileName;
- ifstream inFile;
- inFile.open(fileName.c_str());
+ Moses::InputFileStream inFile(fileName);
if (inFile.fail()) {
std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
}
- istream *inFileP = &inFile;
- string line;
+ std::string line;
int i=0;
- while(getline(*inFileP, line)) {
+ while(getline(inFile, line)) {
i++;
- if (i%100000 == 0) std::cerr << "." << flush;
+ if (i%100000 == 0) std::cerr << "." << std::flush;
- std::vector<string> token = tokenize( line.c_str() );
+ std::vector<std::string> token;
+ Moses::Tokenize( token, line );
if (token.size() != 3) {
std::cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:" << std::endl
@@ -1133,7 +1167,7 @@ void LexicalTable::load( const string &fileName )
continue;
}
- double prob = atof( token[2].c_str() );
+ double prob = Moses::Scan<double>( token[2] );
WORD_ID wordT = vcbT.storeIfNew( token[0] );
WORD_ID wordS = vcbS.storeIfNew( token[1] );
ltable[ wordS ][ wordT ] = prob;
@@ -1143,7 +1177,7 @@ void LexicalTable::load( const string &fileName )
void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
- const ALIGNMENT *targetToSourceAlignment, ostream &out)
+ const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
{
// get corresponding target non-terminal and output pair
ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
@@ -1175,7 +1209,7 @@ void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
- const ALIGNMENT *targetToSourceAlignment, ostream &out)
+ const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
{
// output target symbols, except root, in rule table format
for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
diff --git a/phrase-extract/score-stsg/ScoreStsg.cpp b/phrase-extract/score-stsg/ScoreStsg.cpp
index 09395e21e..f6df0d0da 100644
--- a/phrase-extract/score-stsg/ScoreStsg.cpp
+++ b/phrase-extract/score-stsg/ScoreStsg.cpp
@@ -35,7 +35,7 @@ namespace ScoreStsg
const int ScoreStsg::kCountOfCountsMax = 10;
ScoreStsg::ScoreStsg()
- : m_name("score-stsg")
+ : Tool("score-stsg")
, m_lexTable(m_srcVocab, m_tgtVocab)
, m_countOfCounts(kCountOfCountsMax, 0)
, m_totalDistinct(0)
@@ -300,17 +300,6 @@ double ScoreStsg::ComputeLexProb(const std::vector<RuleSymbol> &sourceFrontier,
return lexScore;
}
-void ScoreStsg::OpenOutputFileOrDie(const std::string &filename,
- Moses::OutputFileStream &stream)
-{
- bool ret = stream.Open(filename);
- if (!ret) {
- std::ostringstream msg;
- msg << "failed to open output file: " << filename;
- Error(msg.str());
- }
-}
-
void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
{
namespace po = boost::program_options;
@@ -319,7 +308,7 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
// Construct the 'top' of the usage message: the bit that comes before the
// options list.
std::ostringstream usageTop;
- usageTop << "Usage: " << GetName()
+ usageTop << "Usage: " << name()
<< " [OPTION]... EXTRACT LEX TABLE\n\n"
<< "STSG rule scorer\n\n"
<< "Options";
@@ -386,11 +375,8 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
// Process the command-line.
po::variables_map vm;
- const int optionStyle = cls::allow_long
- | cls::long_allow_adjacent
- | cls::long_allow_next;
try {
- po::store(po::command_line_parser(argc, argv).style(optionStyle).
+ po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
options(cmdLineOptions).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
@@ -440,12 +426,6 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
}
}
-void ScoreStsg::Error(const std::string &msg) const
-{
- std::cerr << GetName() << ": " << msg << std::endl;
- std::exit(1);
-}
-
} // namespace ScoreStsg
} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/ScoreStsg.h b/phrase-extract/score-stsg/ScoreStsg.h
index 628c0080e..1757e181b 100644
--- a/phrase-extract/score-stsg/ScoreStsg.h
+++ b/phrase-extract/score-stsg/ScoreStsg.h
@@ -9,6 +9,8 @@
#include "ExtractionPhrasePair.h"
#include "OutputFileStream.h"
+#include "syntax-common/tool.h"
+
#include "LexicalTable.h"
#include "Options.h"
#include "RuleSymbol.h"
@@ -25,16 +27,12 @@ namespace ScoreStsg
class RuleGroup;
class RuleTableWriter;
-class ScoreStsg
+class ScoreStsg : public Tool
{
public:
ScoreStsg();
- const std::string &GetName() const {
- return m_name;
- }
-
- int Main(int argc, char *argv[]);
+ virtual int Main(int argc, char *argv[]);
private:
static const int kCountOfCountsMax;
@@ -43,10 +41,6 @@ private:
const std::vector<RuleSymbol> &,
const ALIGNMENT &);
- void Error(const std::string &) const;
-
- void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &);
-
void ParseAlignmentString(const std::string &, int,
ALIGNMENT &);
@@ -59,7 +53,6 @@ private:
void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &);
- std::string m_name;
Options m_options;
Vocabulary m_srcVocab;
Vocabulary m_tgtVocab;
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index 8ac136be9..7b5e6633e 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -17,6 +17,8 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#pragma once
+
#include <string>
#include <map>
diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp
index 9d814ed76..840f18602 100644
--- a/phrase-extract/statistics-main.cpp
+++ b/phrase-extract/statistics-main.cpp
@@ -7,13 +7,14 @@
#include <fstream>
#include <vector>
#include <string>
-#include <stdlib.h>
-#include <assert.h>
-#include <time.h>
+#include <cstdlib>
+#include <cassert>
+#include <ctime>
#include "AlignmentPhrase.h"
#include "tables-core.h"
#include "InputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
using namespace MosesTraining;
@@ -63,7 +64,6 @@ int main(int argc, char* argv[])
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
<< "if (inverse)\n"
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
- time_t starttime = time(NULL);
if (argc != 4 && argc != 5) {
cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
@@ -102,7 +102,6 @@ int main(int argc, char* argv[])
int lastForeign = -1;
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
- int fileCount = 0;
string line;
while(getline(extractFileP, line)) {
@@ -239,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
bool PhraseAlignment::create(const char line[], int lineID )
{
- vector< string > token = tokenize( line );
+ const vector< string > token = util::tokenize( line );
int item = 1;
PHRASE phraseF, phraseE;
for (size_t j=0; j<token.size(); j++) {
@@ -323,7 +322,7 @@ void LexicalTable::load( const string &filePath )
i++;
if (i%100000 == 0) cerr << "." << flush;
- vector<string> token = tokenize( line.c_str() );
+ const vector<string> token = util::tokenize( line );
if (token.size() != 3) {
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
token.size() << " " << token[0] << " " << line << endl;
diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/syntax-common/pcfg.cc
index cae6d4763..3efe04218 100644
--- a/phrase-extract/pcfg-common/pcfg.cc
+++ b/phrase-extract/syntax-common/pcfg.cc
@@ -1,22 +1,3 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
#include "pcfg.h"
#include <cassert>
@@ -28,7 +9,6 @@
namespace MosesTraining {
namespace Syntax {
-namespace PCFG {
void Pcfg::Add(const Key &key, double score) {
rules_[key] = score;
@@ -103,6 +83,5 @@ void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
}
}
-} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/pcfg.h b/phrase-extract/syntax-common/pcfg.h
new file mode 100644
index 000000000..0a731cc7a
--- /dev/null
+++ b/phrase-extract/syntax-common/pcfg.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <istream>
+#include <map>
+#include <ostream>
+#include <vector>
+
+#include "vocabulary.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+class Pcfg {  // Table of rule scores: an ordered map from a rule Key to a double.
+ public:
+ typedef std::vector<std::size_t> Key;  // rule identifier; presumably vocabulary ids of the rule's symbols -- confirm in pcfg.cc
+ typedef std::map<Key, double> Map;  // underlying storage type
+ typedef Map::iterator iterator;  // iterates (Key, score) pairs in std::map key order
+ typedef Map::const_iterator const_iterator;  // read-only counterpart of iterator
+
+ Pcfg() {}  // creates an empty table
+
+ iterator begin() { return rules_.begin(); }  // first stored rule
+ const_iterator begin() const { return rules_.begin(); }  // first stored rule (read-only)
+
+ iterator end() { return rules_.end(); }  // one past the last stored rule
+ const_iterator end() const { return rules_.end(); }  // one past the last stored rule (read-only)
+
+ void Add(const Key &, double);  // stores the score under the key, replacing any score already stored for it
+ bool Lookup(const Key &, double &) const;  // presumably returns whether the key exists and writes its score to the out-parameter -- confirm in pcfg.cc
+ void Read(std::istream &, Vocabulary &);  // presumably loads rules from the stream, adding symbols to the vocabulary -- confirm in pcfg.cc
+ void Write(const Vocabulary &, std::ostream &) const;  // presumably writes all rules to the stream using the vocabulary -- confirm in pcfg.cc
+
+ private:
+ Map rules_;  // all rules currently held, keyed by Key
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tool.cc b/phrase-extract/syntax-common/tool.cc
new file mode 100644
index 000000000..e145b78be
--- /dev/null
+++ b/phrase-extract/syntax-common/tool.cc
@@ -0,0 +1,57 @@
+#include "tool.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+
+#include <boost/program_options/cmdline.hpp>
+
+namespace MosesTraining {
+namespace Syntax {
+
+int Tool::MosesOptionStyle() {
+ namespace cls = boost::program_options::command_line_style;
+ return cls::allow_long | cls::long_allow_adjacent | cls::long_allow_next;
+}
+
+void Tool::Warn(const std::string &msg) const {
+ std::cerr << name_ << ": warning: " << msg << std::endl;
+}
+
+void Tool::Error(const std::string &msg) const {
+ std::cerr << name_ << ": error: " << msg << std::endl;
+ std::exit(1);
+}
+
+void Tool::OpenInputFileOrDie(const std::string &filename,
+ std::ifstream &stream) {
+ stream.open(filename.c_str());
+ if (!stream) {
+ std::ostringstream msg;
+ msg << "failed to open input file: " << filename;
+ Error(msg.str());
+ }
+}
+
+void Tool::OpenOutputFileOrDie(const std::string &filename,
+ std::ofstream &stream) {
+ stream.open(filename.c_str());
+ if (!stream) {
+ std::ostringstream msg;
+ msg << "failed to open output file: " << filename;
+ Error(msg.str());
+ }
+}
+
+void Tool::OpenOutputFileOrDie(const std::string &filename,
+ Moses::OutputFileStream &stream) {
+ bool ret = stream.Open(filename);
+ if (!ret) {
+ std::ostringstream msg;
+ msg << "failed to open output file: " << filename;
+ Error(msg.str());
+ }
+}
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tool.h b/phrase-extract/syntax-common/tool.h
new file mode 100644
index 000000000..e1df8025f
--- /dev/null
+++ b/phrase-extract/syntax-common/tool.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <fstream>
+#include <string>
+
+#include "OutputFileStream.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+/*! Base class for command-line based tools.
+ */
+class Tool {
+ public:
+ virtual ~Tool() {}
+
+ //! Get the name of the tool.
+ const std::string &name() const { return name_; }
+
+ //! Virtual main function to be provided by subclass.
+ virtual int Main(int argc, char *argv[]) = 0;
+
+ protected:
+ Tool(const std::string &name) : name_(name) {}
+
+ //! Returns a boost::program_options style that is consistent with other
+ //! Moses tools (extract-rules, score, etc.).
+ static int MosesOptionStyle();
+
+ //! Write a formatted warning message to standard error.
+ void Warn(const std::string &) const;
+
+ //! Write a formatted error message to standard error and call exit(1).
+ void Error(const std::string &msg) const;
+
+ //! Opens the named input file using the supplied ifstream. Calls Error() if
+ //! the file cannot be opened for reading.
+ void OpenInputFileOrDie(const std::string &, std::ifstream &);
+
+ //! Opens the named output file using the supplied ofstream. Calls Error() if
+ //! the file cannot be opened for writing.
+ void OpenOutputFileOrDie(const std::string &, std::ofstream &);
+
+ //! Opens the named output file using the supplied OutputFileStream. Calls
+ //! Error() if the file cannot be opened for writing.
+ void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &);
+
+ private:
+ std::string name_;
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h
index 2ba55df1a..811bae2d2 100644
--- a/phrase-extract/syntax-common/tree-inl.h
+++ b/phrase-extract/syntax-common/tree-inl.h
@@ -35,23 +35,24 @@ std::size_t Tree<T>::Depth() const {
}
template<typename T>
-class Tree<T>::PreOrderIterator {
+template<typename V>
+class Tree<T>::PreOrderIter {
public:
- PreOrderIterator();
- PreOrderIterator(Tree<T> &);
+ PreOrderIter();
+ PreOrderIter(V &);
- Tree<T> &operator*() { return *node_; }
- Tree<T> *operator->() { return node_; }
+ V &operator*() { return *node_; }
+ V *operator->() { return node_; }
- PreOrderIterator &operator++();
- PreOrderIterator operator++(int);
+ PreOrderIter &operator++();
+ PreOrderIter operator++(int);
- bool operator==(const Tree<T>::PreOrderIterator &);
- bool operator!=(const Tree<T>::PreOrderIterator &);
+ bool operator==(const PreOrderIter &);
+ bool operator!=(const PreOrderIter &);
private:
// Pointer to the current node.
- Tree<T> *node_;
+ V *node_;
// Stack of indices defining the position of node_ within the child vectors
// of its ancestors.
@@ -59,17 +60,20 @@ class Tree<T>::PreOrderIterator {
};
template<typename T>
-Tree<T>::PreOrderIterator::PreOrderIterator()
+template<typename V>
+Tree<T>::PreOrderIter<V>::PreOrderIter()
: node_(0) {
}
template<typename T>
-Tree<T>::PreOrderIterator::PreOrderIterator(Tree<T> &t)
+template<typename V>
+Tree<T>::PreOrderIter<V>::PreOrderIter(V &t)
: node_(&t) {
}
template<typename T>
-typename Tree<T>::PreOrderIterator &Tree<T>::PreOrderIterator::operator++() {
+template<typename V>
+Tree<T>::PreOrderIter<V> &Tree<T>::PreOrderIter<V>::operator++() {
// If the current node has children then visit the left-most child next.
if (!node_->children().empty()) {
index_stack_.push(0);
@@ -79,7 +83,7 @@ typename Tree<T>::PreOrderIterator &Tree<T>::PreOrderIterator::operator++() {
// Otherwise, try node's ancestors until either a node is found with a
// sibling to the right or we reach the root (in which case the traversal
// is complete).
- Tree<T> *ancestor = node_->parent_;
+ V *ancestor = node_->parent_;
while (ancestor) {
std::size_t index = index_stack_.top();
index_stack_.pop();
@@ -95,19 +99,109 @@ typename Tree<T>::PreOrderIterator &Tree<T>::PreOrderIterator::operator++() {
}
template<typename T>
-typename Tree<T>::PreOrderIterator Tree<T>::PreOrderIterator::operator++(int) {
- PreOrderIterator tmp(*this);
+template<typename V>
+Tree<T>::PreOrderIter<V> Tree<T>::PreOrderIter<V>::operator++(int) {
+ PreOrderIter tmp(*this);
++*this;
return tmp;
}
template<typename T>
-bool Tree<T>::PreOrderIterator::operator==(const PreOrderIterator &rhs) {
+template<typename V>
+bool Tree<T>::PreOrderIter<V>::operator==(const PreOrderIter &rhs) {
return node_ == rhs.node_;
}
template<typename T>
-bool Tree<T>::PreOrderIterator::operator!=(const PreOrderIterator &rhs) {
+template<typename V>
+bool Tree<T>::PreOrderIter<V>::operator!=(const PreOrderIter &rhs) {
+ return node_ != rhs.node_;
+}
+
+template<typename T>
+template<typename V>
+class Tree<T>::LeafIter {
+ public:
+ LeafIter();
+ LeafIter(V &);
+
+ V &operator*() { return *node_; }
+ V *operator->() { return node_; }
+
+ LeafIter &operator++();
+ LeafIter operator++(int);
+
+ bool operator==(const LeafIter &);
+ bool operator!=(const LeafIter &);
+
+ private:
+ // Pointer to the current node.
+ V *node_;
+
+ // Stack of indices defining the position of node_ within the child vectors
+ // of its ancestors.
+ std::stack<std::size_t> index_stack_;
+};
+
+template<typename T>
+template<typename V>
+Tree<T>::LeafIter<V>::LeafIter()
+ : node_(0) {
+}
+
+template<typename T>
+template<typename V>
+Tree<T>::LeafIter<V>::LeafIter(V &t)
+ : node_(&t) {
+ // Navigate to the first leaf.
+ while (!node_->IsLeaf()) {
+ index_stack_.push(0);
+ node_ = node_->children()[0];
+ }
+}
+
+template<typename T>
+template<typename V>
+Tree<T>::LeafIter<V> &Tree<T>::LeafIter<V>::operator++() {
+ // Try node's ancestors until either a node is found with a sibling to the
+ // right or we reach the root (in which case the traversal is complete).
+ V *ancestor = node_->parent_;
+ while (ancestor) {
+ std::size_t index = index_stack_.top();
+ index_stack_.pop();
+ if (index+1 < ancestor->children_.size()) {
+ index_stack_.push(index+1);
+ node_ = ancestor->children()[index+1];
+ // Navigate to the first leaf.
+ while (!node_->IsLeaf()) {
+ index_stack_.push(0);
+ node_ = node_->children()[0];
+ }
+ return *this;
+ }
+ ancestor = ancestor->parent_;
+ }
+ node_ = 0;
+ return *this;
+}
+
+template<typename T>
+template<typename V>
+Tree<T>::LeafIter<V> Tree<T>::LeafIter<V>::operator++(int) {
+ LeafIter tmp(*this);
+ ++*this;
+ return tmp;
+}
+
+template<typename T>
+template<typename V>
+bool Tree<T>::LeafIter<V>::operator==(const LeafIter &rhs) {
+ return node_ == rhs.node_;
+}
+
+template<typename T>
+template<typename V>
+bool Tree<T>::LeafIter<V>::operator!=(const LeafIter &rhs) {
return node_ != rhs.node_;
}
diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h
index 52adaa699..8cec07a54 100644
--- a/phrase-extract/syntax-common/tree.h
+++ b/phrase-extract/syntax-common/tree.h
@@ -61,23 +61,31 @@ class Tree {
//
// All iterators are forward iterators. Example use:
//
- // Tree<int> &root = GetMeATree();
- // for (Tree<int>::PreOrderIterator p(root);
- // p != Tree<int>::PreOrderIterator(); ++p) {
- // std::cout << p->value() << " ";
+ // const Tree<int> &root = GetMeATree();
+ // for (Tree<int>::ConstPreOrderIterator p(root);
+ // p != Tree<int>::ConstPreOrderIterator(); ++p) {
+ // std::cout << p->value() << "\n";
// }
+ private:
+ // Use templates to avoid code duplication between const and non-const
+ // iterators. V is the value type: either Tree<T> or const Tree<T>.
+ template<typename V> class PreOrderIter;
+ // template<typename V> class PostOrderIter; TODO
+ template<typename V> class LeafIter;
+
+ public:
// Pre-order iterators.
- class PreOrderIterator;
- // class ConstPreOrderIterator; TODO
+ typedef PreOrderIter<Tree<T> > PreOrderIterator;
+ typedef PreOrderIter<const Tree<T> > ConstPreOrderIterator;
// Post-order iterators.
- // class PostOrderIterator; TODO
- // class ConstPostOrderIterator; TODO
+ // typedef PostOrderIter<Tree<T> > PostOrderIterator; TODO
+ // typedef PostOrderIter<const Tree<T> > ConstPostOrderIterator; TODO
// Leaf iterators (left-to-right).
- // class LeafIterator; TODO
- // class ConstLeafIterator; TODO
+ typedef LeafIter<Tree<T> > LeafIterator;
+ typedef LeafIter<const Tree<T> > ConstLeafIterator;
private:
T value_;
diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc
index 0a54ad3f1..8e689f000 100644
--- a/phrase-extract/syntax-common/tree_test.cc
+++ b/phrase-extract/syntax-common/tree_test.cc
@@ -61,6 +61,86 @@ BOOST_AUTO_TEST_CASE(pre_order_2) {
BOOST_REQUIRE(p == end);
}
+// Test Tree<>::ConstPreOrderIterator on this tree: (1 (2 (3 (4 (5) (6)))) (7))
+BOOST_AUTO_TEST_CASE(const_pre_order_1) {
+ boost::scoped_ptr<Tree<int> > root(new Tree<int>(1));
+ root->children().push_back(new Tree<int>(2));
+ root->children()[0]->children().push_back(new Tree<int>(3));
+ root->children()[0]->children()[0]->children().push_back(new Tree<int>(4));
+ root->children()[0]->children()[0]->children()[0]->children().push_back(
+ new Tree<int>(5));
+ root->children()[0]->children()[0]->children()[0]->children().push_back(
+ new Tree<int>(6));
+ root->children().push_back(new Tree<int>(7));
+ root->SetParents();
+
+ Tree<int>::ConstPreOrderIterator p(*root);
+ Tree<int>::ConstPreOrderIterator end;
+
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 1);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 2);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 3);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 4);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 5);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 6);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 7);
+ ++p;
+ BOOST_REQUIRE(p == end);
+}
+
+// Test Tree<>::LeafIterator with a trivial, single-node tree.
+BOOST_AUTO_TEST_CASE(leaf_1) {
+ boost::scoped_ptr<Tree<int> > root(new Tree<int>(123));
+ Tree<int>::LeafIterator p(*root);
+ BOOST_REQUIRE(p != Tree<int>::LeafIterator());
+ BOOST_REQUIRE(p->value() == 123);
+ ++p;
+ BOOST_REQUIRE(p == Tree<int>::LeafIterator());
+}
+
+// Test Tree<>::LeafIterator on this tree: (1 (2 3) (4) (5 6 (7 8)))
+BOOST_AUTO_TEST_CASE(leaf_2) {
+ boost::scoped_ptr<Tree<int> > root(new Tree<int>(1));
+ root->children().push_back(new Tree<int>(2));
+ root->children()[0]->children().push_back(new Tree<int>(3));
+ root->children().push_back(new Tree<int>(4));
+ root->children().push_back(new Tree<int>(5));
+ root->children()[2]->children().push_back(new Tree<int>(6));
+ root->children()[2]->children().push_back(new Tree<int>(7));
+ root->children()[2]->children()[1]->children().push_back(new Tree<int>(8));
+ root->SetParents();
+
+ Tree<int>::LeafIterator p(*root);
+ Tree<int>::LeafIterator end;
+
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 3);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 4);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 6);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 8);
+ ++p;
+ BOOST_REQUIRE(p == end);
+}
+
} // namespace
} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/vocabulary.h b/phrase-extract/syntax-common/vocabulary.h
new file mode 100644
index 000000000..119767245
--- /dev/null
+++ b/phrase-extract/syntax-common/vocabulary.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <string>
+
+#include "numbered_set.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+typedef NumberedSet<std::string> Vocabulary;
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc
index c4363a3e2..8bd511522 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.cc
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@@ -1,58 +1,64 @@
#include "xml_tree_parser.h"
+#include <cassert>
+#include <vector>
+
+#include "util/tokenize.hh"
+
+#include "SyntaxTree.h"
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
-#include <cassert>
-#include <vector>
+#include "exception.h"
namespace MosesTraining {
namespace Syntax {
-StringTree *XmlTreeParser::Parse(const std::string &line) {
- line_ = line;
- tree_.Clear();
+std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
+ bool unescape)
+{
+ sentence_ = line;
+ node_collection_.Clear();
try {
- if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_,
- false)) {
+ if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_,
+ top_label_set_, unescape)) {
throw Exception("");
}
} catch (const XmlException &e) {
throw Exception(e.getMsg());
}
- tree_.ConnectNodes();
- SyntaxNode *root = tree_.GetTop();
- assert(root);
- words_ = tokenize(line_.c_str());
- return ConvertTree(*root, words_);
+ std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
+ words_ = util::tokenize(sentence_);
+ AttachWords(words_, *root);
+ return root;
}
-// Converts a SyntaxNode tree to a StringTree.
-StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree,
- const std::vector<std::string> &words) {
- StringTree *root = new StringTree(tree.GetLabel());
- const std::vector<SyntaxNode*> &children = tree.GetChildren();
- if (children.empty()) {
- if (tree.GetStart() != tree.GetEnd()) {
+void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
+ SyntaxTree &root)
+{
+ std::vector<SyntaxTree*> leaves;
+ leaves.reserve(words.size());
+ for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) {
+ leaves.push_back(&*p);
+ }
+
+ std::vector<std::string>::const_iterator q = words.begin();
+ for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
+ ++p) {
+ SyntaxTree *leaf = *p;
+ const int start = leaf->value().start;
+ const int end = leaf->value().end;
+ if (start != end) {
std::ostringstream msg;
- msg << "leaf node covers multiple words (" << tree.GetStart()
- << "-" << tree.GetEnd() << "): this is currently unsupported";
+ msg << "leaf node covers multiple words (" << start << "-" << end
+ << "): this is currently unsupported";
throw Exception(msg.str());
}
- StringTree *leaf = new StringTree(words[tree.GetStart()]);
- leaf->parent() = root;
- root->children().push_back(leaf);
- } else {
- for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
- p != children.end(); ++p) {
- assert(*p);
- StringTree *child = ConvertTree(**p, words);
- child->parent() = root;
- root->children().push_back(child);
- }
+ SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end));
+ leaf->children().push_back(newLeaf);
+ newLeaf->parent() = leaf;
}
- return root;
}
} // namespace Syntax
diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h
index a5563f63a..48ea056b8 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.h
+++ b/phrase-extract/syntax-common/xml_tree_parser.h
@@ -1,32 +1,56 @@
#pragma once
#include <map>
+#include <memory>
#include <set>
#include <string>
#include <vector>
+#include "SyntaxNodeCollection.h"
#include "SyntaxTree.h"
-#include "exception.h"
-#include "string_tree.h"
-
namespace MosesTraining {
namespace Syntax {
-// Parses a string in Moses' XML parse tree format and returns a StringTree
-// object. This is a wrapper around the ProcessAndStripXMLTags function.
+/** Parses string representations of parse trees in Moses' XML format and
+ * converts them to SyntaxTree objects.
+ *
+ * This is a thin wrapper around the ProcessAndStripXMLTags function. After
+ * calling Parse(), the output of the ProcessAndStripXMLTags function (the
+ * sentence, node collection, label set, and top label set) are available via
+ * accessors.
+ */
class XmlTreeParser {
public:
- StringTree *Parse(const std::string &);
+ //! Parse a single sentence and return a SyntaxTree (with words attached).
+ std::auto_ptr<SyntaxTree> Parse(const std::string &, bool unescape=false);
+
+ //! Get the sentence string (as returned by ProcessAndStripXMLTags).
+ const std::string &sentence() const { return sentence_; }
+
+ //! Get the sentence as a vector of words.
+ const std::vector<std::string> &words() const { return words_; }
+
+ //! Get the node collection (as returned by ProcessAndStripXMLTags).
+ const SyntaxNodeCollection &node_collection() const {
+ return node_collection_;
+ }
+
+ //! Get the label set (as returned by ProcessAndStripXMLTags).
+ const std::set<std::string> &label_set() const { return label_set_; }
+
+ //! Get the top label set (as returned by ProcessAndStripXMLTags).
+ const std::map<std::string, int> &top_label_set() const {
+ return top_label_set_;
+ }
private:
- static StringTree *ConvertTree(const MosesTraining::SyntaxNode &,
- const std::vector<std::string> &);
+ void AttachWords(const std::vector<std::string> &, SyntaxTree &);
+ std::string sentence_;
+ SyntaxNodeCollection node_collection_;
std::set<std::string> label_set_;
std::map<std::string, int> top_label_set_;
- std::string line_;
- MosesTraining::SyntaxTree tree_;
std::vector<std::string> words_;
};
diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc
new file mode 100644
index 000000000..d17937fa8
--- /dev/null
+++ b/phrase-extract/syntax-common/xml_tree_writer.cc
@@ -0,0 +1,82 @@
+#include "xml_tree_writer.h"
+
+#include <cassert>
+#include <ostream>
+#include <vector>
+#include <string>
+
+#include "SyntaxTree.h"
+#include "XmlTree.h"
+
+
+namespace MosesTraining {
+namespace Syntax {
+
+void XmlTreeWriter::Write(const SyntaxTree &tree) const {
+ assert(!tree.IsLeaf());
+
+ // Opening tag
+ out_ << "<tree label=\"" << Escape(tree.value().label) << "\"";
+ for (SyntaxNode::AttributeMap::const_iterator
+ p = tree.value().attributes.begin();
+ p != tree.value().attributes.end(); ++p) {
+ if (p->first != "label") {
+ out_ << " " << p->first << "=\"" << p->second << "\"";
+ }
+ }
+ out_ << ">";
+
+ // Children
+ for (std::vector<SyntaxTree *>::const_iterator p = tree.children().begin();
+ p != tree.children().end(); ++p) {
+ SyntaxTree &child = **p;
+ if (child.IsLeaf()) {
+ out_ << " " << Escape(child.value().label);
+ } else {
+ out_ << " ";
+ Write(child);
+ }
+ }
+
+ // Closing tag
+ out_ << " </tree>";
+
+ if (tree.parent() == 0) {
+ out_ << std::endl;
+ }
+}
+
+// Escapes XML special characters.
+std::string XmlTreeWriter::Escape(const std::string &s) const {
+ if (!escape_) {
+ return s;
+ }
+ std::string t;
+ std::size_t len = s.size();
+ t.reserve(len);
+ for (std::size_t i = 0; i < len; ++i) {
+ if (s[i] == '<') {
+ t += "&lt;";
+ } else if (s[i] == '>') {
+ t += "&gt;";
+ } else if (s[i] == '[') {
+ t += "&#91;";
+ } else if (s[i] == ']') {
+ t += "&#93;";
+ } else if (s[i] == '|') {
+ t += "&#124;";
+ } else if (s[i] == '&') {
+ t += "&amp;";
+ } else if (s[i] == '\'') {
+ t += "&apos;";
+ } else if (s[i] == '"') {
+ t += "&quot;";
+ } else {
+ t += s[i];
+ }
+ }
+ return t;
+}
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/xml_tree_writer.h b/phrase-extract/syntax-common/xml_tree_writer.h
new file mode 100644
index 000000000..b39d01fab
--- /dev/null
+++ b/phrase-extract/syntax-common/xml_tree_writer.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "SyntaxTree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+class XmlTreeWriter {
+ public:
+ XmlTreeWriter(std::ostream &out, bool escape=true)
+ : out_(out)
+ , escape_(escape) {}
+
+ void Write(const SyntaxTree &) const;
+
+ private:
+ std::string Escape(const std::string &) const;
+
+ std::ostream &out_;
+ bool escape_;
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp
index 30c1544e9..4dd8e704a 100644
--- a/phrase-extract/tables-core.cpp
+++ b/phrase-extract/tables-core.cpp
@@ -1,5 +1,6 @@
// $Id$
//#include "beammain.h"
+#include "util/tokenize.hh"
#include "tables-core.h"
#define TABLE_LINE_MAX_LENGTH 1000
@@ -7,37 +8,9 @@
using namespace std;
-// as in beamdecoder/tables.cpp
-vector<string> tokenize( const char* input )
-{
- vector< string > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- } else if (isSpace && !betweenWords) {
- token.push_back( string( input+start, i-start ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( string( input+start, i-start ) );
- return token;
-}
-
namespace MosesTraining
{
-bool isNonTerminal( const WORD &symbol )
-{
- return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
-}
-
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
map<WORD, WORD_ID>::iterator i = lookup.find( word );
@@ -107,7 +80,7 @@ void DTable::load( const string& fileName )
abort();
}
- vector<string> token = tokenize(line.c_str());
+ const vector<string> token = util::tokenize(line);
if (token.size() < 2) {
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
continue;
diff --git a/phrase-extract/tables-core.h b/phrase-extract/tables-core.h
index 9662ced2a..011fe09e6 100644
--- a/phrase-extract/tables-core.h
+++ b/phrase-extract/tables-core.h
@@ -5,15 +5,13 @@
#include <iostream>
#include <fstream>
-#include <assert.h>
-#include <stdlib.h>
+#include <cassert>
+#include <cstdlib>
#include <string>
#include <queue>
#include <map>
#include <cmath>
-extern std::vector<std::string> tokenize( const char*);
-
namespace MosesTraining
{