Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarcin Junczys-Dowmunt <marcinjd@microsoft.com>2018-12-07 00:21:25 +0300
committerMarcin Junczys-Dowmunt <marcinjd@microsoft.com>2018-12-07 00:21:25 +0300
commit1b2968c8b9465ce2225f304f5deea7f642f3e533 (patch)
tree3dc8fd194c28a635ac57dea9951ba74f9c21cef6
parent9562338ff78e226caad84ac29aa0be4e8b344368 (diff)
parente78d805955a5613e91cc3f2af1db2776a6c6e3da (diff)
Merge branch 'master' into nccl
-rwxr-xr-x.gitignore2
-rw-r--r--.gitmodules3
-rw-r--r--CHANGELOG.md14
-rw-r--r--CMakeLists.txt38
-rw-r--r--VERSION2
m---------examples0
-rw-r--r--src/3rd_party/CMakeLists.txt5
-rw-r--r--src/3rd_party/pathie-cpp/CHANGELOG52
-rw-r--r--src/3rd_party/pathie-cpp/CMakeLists.txt8
-rw-r--r--src/3rd_party/pathie-cpp/LICENSE24
-rw-r--r--src/3rd_party/pathie-cpp/README.md359
-rw-r--r--src/3rd_party/pathie-cpp/include/entry_iterator.hpp119
-rw-r--r--src/3rd_party/pathie-cpp/include/errors.hpp119
-rw-r--r--src/3rd_party/pathie-cpp/include/path.hpp377
-rw-r--r--src/3rd_party/pathie-cpp/include/pathie.hpp67
-rw-r--r--src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp111
-rw-r--r--src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp192
-rw-r--r--src/3rd_party/pathie-cpp/include/temp.hpp83
-rw-r--r--src/3rd_party/pathie-cpp/src/entry_iterator.cpp279
-rw-r--r--src/3rd_party/pathie-cpp/src/errors.cpp150
-rw-r--r--src/3rd_party/pathie-cpp/src/path.cpp3348
-rw-r--r--src/3rd_party/pathie-cpp/src/pathie.cpp226
-rw-r--r--src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp320
-rw-r--r--src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp326
-rw-r--r--src/3rd_party/pathie-cpp/src/temp.cpp197
m---------src/3rd_party/sentencepiece0
-rw-r--r--src/3rd_party/zstr/LICENSE21
-rw-r--r--src/3rd_party/zstr/README.org54
-rw-r--r--src/3rd_party/zstr/strict_fstream.hpp202
-rw-r--r--src/3rd_party/zstr/zstr.hpp411
-rw-r--r--src/CMakeLists.txt10
-rwxr-xr-xsrc/command/marian_vocab.cpp4
-rwxr-xr-xsrc/common/cli_wrapper.cpp89
-rwxr-xr-xsrc/common/cli_wrapper.h143
-rwxr-xr-xsrc/common/config.cpp2
-rwxr-xr-xsrc/common/config_parser.cpp113
-rwxr-xr-xsrc/common/config_parser.h2
-rwxr-xr-xsrc/common/config_validator.cpp3
-rwxr-xr-xsrc/common/file_stream.h149
-rwxr-xr-xsrc/common/filesystem.h65
-rwxr-xr-xsrc/common/logging.cpp10
-rwxr-xr-xsrc/common/logging.h33
-rwxr-xr-xsrc/common/timer.h8
-rwxr-xr-xsrc/common/version.cpp8
-rwxr-xr-xsrc/common/version.h2
-rwxr-xr-xsrc/data/corpus.cpp10
-rwxr-xr-xsrc/data/corpus_base.cpp30
-rwxr-xr-xsrc/data/default_vocab.cpp101
-rwxr-xr-xsrc/data/sentencepiece_vocab.cpp179
-rw-r--r--src/data/types.h23
-rwxr-xr-xsrc/data/vocab.cpp45
-rwxr-xr-xsrc/data/vocab.h17
-rw-r--r--src/data/vocab_base.h10
-rw-r--r--src/examples/mnist/model_lenet.h4
-rwxr-xr-xsrc/graph/node_operators_binary.h51
-rw-r--r--src/graph/node_operators_unary.h6
-rwxr-xr-xsrc/layers/loss.cpp12
-rw-r--r--src/layers/loss.h6
-rwxr-xr-xsrc/layers/word2vec_reader.h10
-rw-r--r--src/models/char_s2s.h2
-rwxr-xr-xsrc/models/hardatt.h303
-rw-r--r--src/models/model_factory.cpp43
-rw-r--r--src/rescorer/rescorer.h32
-rw-r--r--src/rescorer/score_collector.cpp9
-rwxr-xr-xsrc/tensors/cpu/prod.cpp4
-rwxr-xr-xsrc/training/scheduler.h5
-rwxr-xr-xsrc/translator/output_collector.cpp4
-rwxr-xr-xsrc/translator/output_collector.h7
-rwxr-xr-xsrc/translator/translator.h20
69 files changed, 7821 insertions, 862 deletions
diff --git a/.gitignore b/.gitignore
index 2931bf0a..80080441 100755
--- a/.gitignore
+++ b/.gitignore
@@ -62,5 +62,3 @@ examples/mnist/*ubyte
.vs
.vscode
-# SentencePiece is automatically downloaded when requested
-src/3rd_party/sentencepiece/
diff --git a/.gitmodules b/.gitmodules
index 903659e7..623b7060 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
[submodule "examples"]
path = examples
url = https://github.com/marian-nmt/marian-examples
+[submodule "src/3rd_party/sentencepiece"]
+ path = src/3rd_party/sentencepiece
+ url = https://github.com/marian-nmt/sentencepiece
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce5fb3a4..a2c2e48d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
+### Fixed
+- Errors due to warnings
+
+### Changed
+- Set nearly all warnings as errors for Marian's own targets. Disable warnings for 3rd party.
+
+## [1.7.0] - 2018-11-27
+
### Added
- Word alignment generation in scorer
- Attention output generation in decoder and scorer with `--alignment soft`
+- Support for SentencePiece vocabularies and run-time segmentation/desegmentation
+- Support for SentencePiece vocabulary training during model training
+- Group training files by filename when creating vocabularies for joint vocabularies
+- Updated examples
+- Synchronous multi-node training (early version)
### Fixed
- Delayed output in line-by-line translation
@@ -17,6 +30,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Changed
- Generated word alignments include alignments for target EOS tokens
- Boost::program_options has been replaced by another CLI library
+- Replace boost::filesystem with Pathie
- Expansion of unambiguous command-line arguments is no longer supported
## [1.6.0] - 2018-08-08
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1293d39a..c585b9f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,11 @@
cmake_minimum_required(VERSION 3.5.1)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+if (POLICY CMP0074)
+ cmake_policy(SET CMP0074 NEW) # CMake 3.12
+endif ()
+
+
project(marian CXX C)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -38,33 +43,33 @@ if(MSVC)
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental")
else()
- set(CMAKE_CXX_FLAGS " -std=c++11 -O3 -Ofast -m64 -pthread -march=${BUILD_ARCH} -msse4.1 -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas")
+ set(DISABLE_GLOBALLY "-Wno-unused-result")
+
+ # These are used in src/CMakeLists.txt on a per-target basis
+ list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
+ -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers)
+
+ # This warning does not exist prior to gcc 5.0
+ if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+ list(APPEND ALL_WARNINGS -Wsuggest-override)
+ endif()
+
+ set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Ofast -m64 -pthread -march=${BUILD_ARCH} -msse4.1 -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -g -rdynamic")
- set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas")
+ set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas")
set(CMAKE_CXX_FLAGS_SLIM "${CMAKE_CXX_FLAGS} -DNDEBUG")
- set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic -Wall -Wextra -Wsuggest-override -Wno-unused-value -Wno-unknown-pragmas -Wno-sign-compare -Wno-missing-field-initializers")
+ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -g -rdynamic")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic")
set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
-endif()
+ endif()
# Downloading SentencePiece if requested and set to compile with it.
# Requires all the dependencies imposed by SentencePiece
if(USE_SENTENCEPIECE)
- message(STATUS "Using SentencePiece from our fork https://github.com/marian-nmt/sentencepiece.git")
- if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party/sentencepiece)
- execute_process(COMMAND git clone https://github.com/marian-nmt/sentencepiece.git
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party
- RESULT_VARIABLE git_result
- ERROR_QUIET)
- message(STATUS "Downloaded SentencePiece [code: ${git_result}]")
- else()
- message(STATUS "It seems that SentencePiece has already been downloaded. Reusing.")
- endif()
-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SENTENCEPIECE")
LIST(APPEND CUDA_NVCC_FLAGS -DUSE_SENTENCEPIECE; )
- set(EXT_LIBS ${EXT_LIBS} sentencepiece)
+ set(EXT_LIBS ${EXT_LIBS} sentencepiece sentencepiece_train)
endif()
@@ -121,6 +126,7 @@ else(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; -arch=sm_30; -gencode=arch=compute_30,code=sm_30; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_61,code=compute_61 ;)
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(NOT MSVC)
+ # @TODO: add warnings here too
list(APPEND CUDA_NVCC_FLAGS -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
else()
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; )
diff --git a/VERSION b/VERSION
index d4f6e2c5..a97fc441 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.6.2
+v1.7.1
diff --git a/examples b/examples
-Subproject 8c6f4ef6859ef224dbc7ff891884bf7050d718c
+Subproject 336740065d9c23e53e912a1befff18981d9d27a
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index faf37527..f7eabf54 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -3,9 +3,9 @@ include_directories(.)
add_subdirectory(./yaml-cpp)
add_subdirectory(./SQLiteCpp)
+add_subdirectory(./pathie-cpp)
if(USE_SENTENCEPIECE)
-
if(USE_STATIC_LIBS)
set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
if(WIN32)
@@ -29,9 +29,8 @@ if(USE_SENTENCEPIECE)
if(USE_STATIC_LIBS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
-
endif(USE_SENTENCEPIECE)
include_directories(./SQLiteCpp/include)
include_directories(./CLI)
-
+include_directories(./pathie-cpp/include)
diff --git a/src/3rd_party/pathie-cpp/CHANGELOG b/src/3rd_party/pathie-cpp/CHANGELOG
new file mode 100644
index 00000000..52942338
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/CHANGELOG
@@ -0,0 +1,52 @@
+-- Version 0.1.0 (2017-10-28) --
+
+* Add Pathie::Tempdir and Pathie::Tempfile classes for
+ creating temporary directories and files.
+* Add Pathie::entry_iterator, Path::begin_entries(), and
+ Path::end_entries(). These allow you to use real C++
+ iterators for working with directory entries.
+* Change Path::find() to take a callback instead of std::function to
+ make it compile under C++98.
+* Add Path::operator/=.
+* Add `const' qualifier to Path::fopen() and Path::touch() as these
+ methods leave the path itself unchanged.
+* Add C++98 compatibility (instead of just C++11).
+* Mark stream replacements as experimental. They are mostly untested
+ and I don't really use them.
+* Add PATHIE_BUILD_STREAM_REPLACEMENTS for building the stream
+ replacements.
+* Rename build option ASSUME_UTF8_ON_UNIX to
+ PATHIE_ASSUME_UTF8_ON_UNIX.
+* Switch license from GPL3 to BSD-2clause.
+* Drop support for expanding "~username/foo" constructs. This
+ nonstandard extension was unportable and caused problems when
+ linking Pathie statically.
+* Add Path::utf8_str() method.
+* Restructure header #include order. Pathie now requires you
+ to specify the exact header to include (e.g. <pathie/path.hpp>)
+ instead of one global header. There was no point in having the
+ stream replacements included if not required.
+* Fix compilation problem with _PATHIE_UNIX not being defined
+* Do not include <windows.h> in Pathie public headers. This caused
+ problems in some circumstances when a certain macro combination
+ of windows.h was needed.
+* Remove config.hpp. This caused confusion when the library was used.
+ Build configuration now only happens via command-line options.
+* Drop shaky support for NTFS symlinks. It never worked really well
+ anyway.
+
+-- Version 0.0.3 (2015-04-30) --
+
+* Don't use CMake's global configuration variables, allowing pathie to
+ be built as a subproject.
+* Fix compilation error on systems that do not automatically
+ #include <stdexcept>.
+
+-- Version 0.0.2 (2015-02-16) --
+
+* Fix installation error on config.hpp
+* Add message that C++11 is required for compilation
+
+-- Version 0.0.1 (2015-02-13) --
+
+First public release.
diff --git a/src/3rd_party/pathie-cpp/CMakeLists.txt b/src/3rd_party/pathie-cpp/CMakeLists.txt
new file mode 100644
index 00000000..db5744f5
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/CMakeLists.txt
@@ -0,0 +1,8 @@
+include_directories(..)
+include_directories(.)
+include_directories(include)
+
+FILE(GLOB PathieCppSources src/*.cpp)
+if (NOT TARGET pathie-cpp)
+ add_library(pathie-cpp OBJECT ${PathieCppSources})
+endif()
diff --git a/src/3rd_party/pathie-cpp/LICENSE b/src/3rd_party/pathie-cpp/LICENSE
new file mode 100644
index 00000000..f74dec43
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/LICENSE
@@ -0,0 +1,24 @@
+Copyright © 2015, 2017 Marvin Gülker
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+“AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/3rd_party/pathie-cpp/README.md b/src/3rd_party/pathie-cpp/README.md
new file mode 100644
index 00000000..80b68770
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/README.md
@@ -0,0 +1,359 @@
+PATHIE.
+=======
+
+This is the Pathie project. It aims to provide a C++ library that covers
+all needs of pathname manipulation and filename fiddling, without
+having to worry about the underlying platform. That is, it is a glue
+library that allows you to create platform-independent filename
+handling code with special regard to Unicode path names.
+
+Supported systems
+-----------------
+
+Currently supported platforms are Linux and Windows, the latter via
+MSYS2 GCC. Any other compiler or system might or might not work. Mac
+OS should work as well, but I cannot test this due to lack of a Mac. I
+gladly accept contributions for any system or compiler.
+
+Pathie's source code itself is written conforming to C++98. On UNIX
+systems, it assumes the system supports POSIX.1-2001. On Windows
+systems, the minimum supported Windows version is Windows Vista.
+
+Installation
+------------
+
+See INSTALL.md.
+
+The library
+-----------
+
+The entire world is using UTF-8 as the primary Unicode encoding. The
+entire world? No, a little company from Redmond resists the temptation
+and instead uses UTF-16LE, causing cross-platform handling of Unicode
+paths to be a nightmare.
+
+One of the main problems the author ran into was compiler-dependent
+code that was not marked as such. Many sites on the Internet claim
+Unicode path handling on Windows is easy, but in fact, it only is if
+you define “development for Windows” as “development with MSVC”,
+Microsoft’s proprietary C/C++ compiler, which provides nonstandard
+interfaces to allow for handling UTF-16LE filenames. The Pathie
+library has been developed with a focus on MinGW and crosscompilation
+from Linux to Windows and thus does not suffer from this problem.
+
+The Pathie library has been developed to release the programmer from
+the burden of handling the different encodings in use for filenames,
+and does so by focusing its API on UTF-8 regardless of the platform in
+use. Thus, if you use UTF-8 as your preferred encoding inside your
+program (take a look at the [UTF8 Everywhere
+website](http://www.utf8everywhere.org) for reasons why you should do
+that), Pathie will be of the most use for you, since it transparently
+converts whatever filesystem encoding is encountered to UTF-8 in its
+public interface. Likewise, any pathname you pass to the library is
+assumed to be UTF-8 and is transcoded transparently to the filesystem
+encoding before invoking the respective OS' filesystem access
+methods. Of course, explicit conversion functions are also provided,
+in case you do need a string in the native encoding or need to
+construct a path from a string in the native encoding.
+
+General Usage
+-------------
+
+First thing is to include the main header:
+
+~~~~~~~~~~~~~~~~~~{.cpp}
+#include <pathie/path.hpp>
+~~~~~~~~~~~~~~~~~~
+
+Now consider the simple task to get all children of a directory, which
+have Unicode filenames. Doing that manually will result in you having
+to convert between UTF-8 and UTF-16 all the time. With pathie, you can
+just do this:
+
+~~~~~~~~~~~~~~~~~~~{.cpp}
+std::vector<Pathie::Path> children = your_path.children();
+~~~~~~~~~~~~~~~~~~~
+
+Done. Retrieving the parent directory of your directory is pretty easy:
+
+~~~~~~~~~~~~~~~~~~~{.cpp}
+Pathie::Path yourpath("foo/bar/baz");
+Pathie::Path parent = yourpath.parent();
+~~~~~~~~~~~~~~~~~~~
+
+But Pathie is much more than just an abstraction of different filepath
+encodings. It is a utility library for pathname manipulation, i.e. it
+allows you to do things like finding the parent directory, expanding
+relative to absolute paths, decomposing a filename into basename,
+dirname, and extension, and so on. See the documentation of the
+central Pathie::Path class on what you can do.
+
+~~~~~~~~~~~~~~~~~~~~~~{.cpp}
+// Assume current directory is /tmp
+Pathie::Path p("foo/bar/../baz");
+p.expand(); // => /tmp/foo/baz
+~~~~~~~~~~~~~~~~~~~~~~
+
+Or my personal favourite:
+
+~~~~~~~~~~~~~~~~~~~{.cpp}
+Pathie::Path p1("/tmp/foo/bar");
+Pathie::Path p2("/tmp/bar/foo");
+Pathie::Path p3 = p1.relative(p2); // => ../../foo/bar
+~~~~~~~~~~~~~~~~~~~
+
+It also provides you with commonly used paths like the user’s
+configuration directory or the path to the running executable.
+
+~~~~~~~~~~~~~~~~~~~~{.cpp}
+Pathie::Path configdir = Pathie::Path::config_dir();
+Pathie::Path exepath = Pathie::Path::exe();
+~~~~~~~~~~~~~~~~~~~~
+
+Pathie assumes that all string arguments passed are in UTF-8 and
+transparently converts to the native filesystem encoding internally.
+
+Still, if you interface directly with the Windows API or other external
+libraries, you might want to retrieve the native representation from a
+Path or construct a Path from the native representation. Pathie
+doesn’t want to be in your way then. The following example constructs
+from and converts to the native representation on Windows, which is
+UTF-16LE:
+
+~~~~~~~~~~~~~~~~~~~~{.cpp}
+// Construct from native
+wchar_t* utf16 = Win32ApiCall();
+Path mypath = Path::from_native(utf16); // also accepts std::wstring
+
+// Retrieve native (Note C++’ish std::wstring rather than
+// raw wchar_t* on Windows)
+std::wstring native_utf16 = mypath.native();
+~~~~~~~~~~~~~~~~~~~~
+
+On UNIX, these methods work with normal strings (std::string instead
+of std::wstring) in the underlying filesystem encoding. In most cases,
+that will be UTF-8, but some legacy systems may still use something
+like ISO-8859-1 in which case that will differ.
+
+### Temporary files and directories
+
+There are two classes `Pathie::Tempdir` and `Pathie::Tempfile` that
+you can use if you need to work with temporary directories or files,
+respectively. Constructing instances of these classes creates a
+temporary entry, which is removed (recursively in case of directories)
+when the instance is destroyed again. Use TempEntry::path() to get
+access to the Path instance pointing to the created entry.
+
+~~~~~~~~~~~~~~~~~~~~{.cpp}
+#include <pathie/tempdir.hpp>
+
+//...
+
+{
+ srand(time(NULL)); // Needs random number generator
+ Pathie::Tempdir tmpdir("foo"); // Pass a fragment to use as part of filename
+ std::cout << "Temporary dir is: " << tmpdir.path() << std::endl;
+}
+// When `tmpdir' is destroyed, the destructor recursively
+// deletes the directory that was created.
+~~~~~~~~~~~~~~~~~~~~
+
+### Opening a file with a Unicode path name
+
+On Windows with GCC, it is [not possible to open a file with Unicode
+pathname](https://stackoverflow.com/questions/821873) via C++'s usual
+`std::ifstream` and `std::ofstream` mechanism. There's a nonstandard
+extension provided by Microsoft's proprietary compiler that does this,
+but GCC does not have this extension. Consequently, code that is
+intended to compile on GCC (like Pathie) has to avoid it.
+
+There *is* however a function in the Win32API that allows you to open a
+file with a Unicode pathname *and* that returns a standard C `FILE*`
+handle,
+[_wfopen()](http://msdn.microsoft.com/en-us/library/yeby3zcb.aspx). The
+method Path::fopen() uses this function on Windows and a regular C
+`fopen()` on all other platforms, thus allowing you to just deal with
+your Unicode filename via the regular C I/O interface. If you urgently
+need C++ I/O streams, read on.
+
+### Stream replacements
+
+Pathie mainly provides you with the means to handle paths, compose,
+and decompose them. There is an experimental feature however that
+provides replacements for C++ file streams that work with instances of
+Pathie::Path instead of strings for opening a file. These replacements
+are neither elegant nor portable, because they don't nicely honour the
+template concept the STL is based on by directly subclassing the
+standard streams in the manner needed most frequently and additionally
+relying on vendor-specific details. For GCC, an internal (but at least
+documented) interface is used to exchange the file descriptor inside a
+stream, and for MSVC, a nonstandard (but documented) constructor is
+used. Other compilers are not supported by this feature (which most
+notably affects clang, where I have no idea on the interfaces I need
+to use for such a trick).
+
+In one word, these replacements are hacky and I consider them
+experimental. If that does not strike you as problematic, you can
+enable this feature by passing `-DPATHIE_BUILD_STREAM_REPLACEMENTS=ON`
+when invoking `cmake` during the build process.
+
+In order to use the replacements, include the respective header
+(either `pathie_ifstream` or `pathie_ofstream`) and use the
+`Pathie::ifstream` and `Pathie::ofstream` classes just like you would
+use `std::ifstream` and `std::ofstream`, with the only difference
+being that you construct them from a Pathie::Path instance instead of
+a string. See the documentation of Pathie::ofstream for more
+information.
+
+~~~~~~~~~~~~~~~~~{.cpp}
+#include <pathie/pathie_ofstream>
+
+// ...
+
+Pathie::Path p("Bärenstark.txt");
+Pathie::ofstream file(p);
+file << "Some content" << std::endl;
+file.close()
+~~~~~~~~~~~~~~~~~
+
+There's also the unofficial
+[boost::nowide](http://cppcms.com/files/nowide/html/), which is
+similar to this feature and maybe more reliable. It has [recently been
+accepted into
+boost](https://lists.boost.org/boost-announce/2017/06/0516.php).
+
+Dependencies and linking
+------------------------
+
+Pathie is standalone, that is, it requires no other libraries except
+for those provided by your operating system. Note that there’s a
+caveat with this on Windows, which does provide the `Shlwapi` library
+by default, but MinGW's GCC does not automatically link it in. Be sure
+to link to this library explicitly when compiling for MinGW Windows
+by appending `-lShlwapi` to the end of your linking command line.
+
+It is recommended to link in pathie as a dynamic library, because
+there are some problems with it when linked statically on certain
+operating systems (see _Caveats_ below). If you are sure you aren’t
+affected by those problems, it is possible to link in pathie
+statically.
+
+Caveats
+-------
+
+This library assumes that under all UNIX systems out there (I also
+consider Mac OSX to be a UNIX system) the file system root always is
+`/` and the directory separator also always is `/`. This structure is
+mandatory as per POSIX -- in POSIX.1-2008, it’s specified in section
+10.1. Systems which neither follow the POSIX directory structure nor
+are Windows are unsupported.
+
+On POSIX-compliant systems other than Mac OS X, the filesystem
+encoding [generally is
+unspecified](https://unix.stackexchange.com/questions/2089/what-charset-encoding-is-used-for-filenames-and-paths-on-linux).
+Pathnames are merely byte blobs which do not contain NUL bytes, and
+components are separated by `/`. It’s up to the applications,
+including utilities like a shell or the ls(1) program, to make
+something of those byte streams. Therefore, it is perfectly possible
+that on one system, user A uses ISO-8859-1 filenames and user B uses
+UTF-8 filenames. Even the same user could use differently encoded
+filenames. Programs that have to interpret the byte blobs in pathnames
+on these systems look at the locale environment variables, namely
+`LANG` and `LC_ALL`, see section 7 of POSIX.1-2008. As a consequence,
+it may happen you want to create filenames with characters not
+supported in the user’s pathname encoding. For example, if you want to
+create a file with a Hebrew filename and the user’s pathname encoding
+is ISO-8859-1, there’s a problem, because ISO-8859-1 has no Hebrew
+characters in it, but in UTF-8, which is the encoding you are advised
+to use and which is what Pathie’s API expects from you, they are
+available. There is no sensible solution to this problem that the
+Pathie library could dictate; the `iconv()` function used by pathie
+just replaces characters that are unavailable in the target encoding
+with a system-defined default (probably “?”). Note that on systems
+which have a Unicode pathname encoding, especially modern Linuxes with
+UTF-8, such a situation can’t ever arise, because the Unicode
+encodings (UTF-*) cover all characters you can ever use.
+
+At least on FreeBSD, calling the POSIX `iconv()` function fails with
+the cryptic error message “Service unavailable” if a program is linked
+statically. I’ve reported [a bug on
+this](https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=196567). This
+means that you currently can’t link in pathie statically on FreeBSD
+and systems which don’t allow statically linked executables to call
+`iconv()`.
+
+On Linux systems, it is recommended to set your program’s locale to the
+environment’s locale before you call any functions the Pathie library
+provides, because this will allow Pathie to use the correct encoding
+for filenames. This is relevant where the environment’s encoding is
+not UTF-8, e.g. with $LANG set to `de_DE.ISO-8859-1`. You can do this
+as follows (the `""` locale always refers to the locale of the
+environment):
+
+~~~~~~~~~~~~~~~~~~~~~{.cpp}
+#include <locale>
+std::locale::global(std::locale(""));
+~~~~~~~~~~~~~~~~~~~~~
+
+This is not required on Windows nor on Mac OS X, because these
+operating systems always use UTF-16LE (Windows) or UTF-8 (Mac OS X) as
+the filesystem encoding, regardless of the user's locale. It however
+does not hurt to call this either, it simply makes no difference for
+Pathie on these systems. If you urgently need to avoid this call on
+Linux, you need to compile pathie with the special build option
+PATHIE_ASSUME_UTF8_ON_UNIX, which will force Pathie to assume that
+UTF-8 is used as the filesystem encoding under any UNIX-based system.
+
+Links
+-----
+
+* Project page: https://www.guelkerdev.de/projects/pathie/
+* GitHub mirror: https://github.com/Quintus/pathie-cpp
+* Issue tracker: https://github.com/Quintus/pathie-cpp/issues
+
+Contributing
+------------
+
+Feel free to submit any contributions you deem useful. Try to make
+separate branches for your new features, give a description on what
+you changed, etc.
+
+Don’t you duplicate boost::filesystem?
+-------------------------------------
+
+Yes and
+no. [boost::filesystem](http://www.boost.org/doc/libs/1_56_0/libs/filesystem/doc/index.htm)
+provides many methods pathie provides, but has a major problem with
+Unicode path handling if you are not willing to do the UTF-8/UTF-16
+conversion manually. boost::filesystem always uses UTF-8 to store the
+paths on UNIX, and, which is the problem, always uses UTF-16LE to
+store the paths on a Windows system. There is no way to override
+this, although there is a [hidden documentation
+page](http://www.boost.org/doc/libs/1_51_0/libs/locale/doc/html/default_encoding_under_windows.html)
+that claims to solve the problem. I have wasted a great amount of time
+to persuade boost::filesystem to automatically convert all
+`std::string` input it receives into UTF-16LE, but failed to
+succeed. Each time I wanted to create a file with a Unicode filename,
+the test failed on Windows by producing garbage filenames. Finally I
+found out that the neat trick shown in the documentation above indeed
+does work -- but only if you use the Microsoft Visual C++ compiler
+(MSVC) to compile your code. I don’t, I generally use g++ via the
+[MinGW](http://www.mingw.org) toolchain. boost::filesystem fails with
+g++ via MinGW with regard to Unicode filenames on Windows as of this
+writing (September 2014).
+
+Apart from that, pathie provides some additional methods, especially
+with regard to finding out where the user’s paths are. It is modelled
+after Ruby’s popular
+[Pathname](http://ruby-doc.org/stdlib-2.1.2/libdoc/pathname/rdoc/Pathname.html#method-i-rmtree)
+class, but it doesn’t entirely duplicate its interface (which wouldn’t
+be idiomatic C++).
+
+Also, pathie is a small library. Adding it to your project shouldn’t
+hurt too much, while boost::filesystem is quite a large dependency.
+
+License
+-------
+
+Pathie is BSD-licensed; see the file “LICENSE” for the exact license
+conditions.
diff --git a/src/3rd_party/pathie-cpp/include/entry_iterator.hpp b/src/3rd_party/pathie-cpp/include/entry_iterator.hpp
new file mode 100644
index 00000000..85a53b1f
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/entry_iterator.hpp
@@ -0,0 +1,119 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PATHIE_ENTRY_ITERATOR_HPP
+#define PATHIE_ENTRY_ITERATOR_HPP
+#include <iterator>
+
+namespace Pathie {
+
+ class Path;
+
+ /**
+ * An iterator class for reading the entries in a directory.
+ * Note that the entries of a directory always include the
+ * "." (current directory) and ".." (parent directory) entries
+ * unresolved, and that the order in which the entries in the
+ * directory are returned is undefined (actually, the order
+ * depends on the filesystem used).
+ *
+ * The iterators of this class are always const. You cannot change
+ * the values referenced.
+ *
+ * It is unspecified behaviour what happens if a directory entry is
+ * added or removed to/from the directory while you are iterating
+ * it. Thus, keep iterations short in time.
+ *
+ * Instances of this class wrap an ephemeral handle like for example
+ * a directory descriptor on Linux. This handle is not copiable,
+ * which should normally mean that instances of this class cannot be
+ * copied. However, the `std::iterator` interface mandates that
+ * iterator instances are copiable (see "Requirements" here:
+ * <http://en.cppreference.com/w/cpp/concept/Iterator>) and in fact
+ * the language copies iterators all the time if you use them for
+ * example in a for loop. Consequently, this class implements the
+ * copy constructor and the copy assignment. However, these operations
+ * do *not* actually copy the instance, but instead *move* the content
+ * from the source instance to the target instance. The source instance
+ * is afterwards unusable and looks like a finished iterator. The
+ * `const` qualifiers in the copy operations are explicitly cast
+ * away inside the functions to allow this, so they don't mean anything
+ * for them. This works fairly nice for the ordinary use case (where
+ * the language creates implicit copies), but the API may look as if
+ * copying instances is allowed. It is not. *Do not copy* instances of
+ * this class even though it looks as if it's possible. Implicit
+ * copies automatically done by C++ as in for loops are okay, but
+ * that's it. That is, you *can* do this:
+ *
+ * ~~~~{.cpp}
+ * entry_iterator iter;
+ * for(iter=my_path.begin_entries(); iter != my_path.end_entries(); iter++) {
+ * // Work with iter...
+ * }
+ * ~~~~
+ *
+ * But you *cannot* do this:
+ *
+ * ~~~~{.cpp}
+ * entry_iterator iter=my_path.begin_entries();
+ * entry_iterator iter2(iter);
+ * ~~~~
+ *
+ * This example does compile, but `iter` will be unusable after
+ * `iter2` has been constructed.
+ */
+ class entry_iterator: public std::iterator<std::input_iterator_tag, Path, int> // NOTE(review): std::iterator is deprecated since C++17.
+ {
+ public:
+ entry_iterator();
+ entry_iterator(const Path* p_top);
+ ~entry_iterator();
+ entry_iterator& operator=(const Path* p_top); // Restart assignment
+ operator bool() const;
+ bool operator==(const entry_iterator& other) const;
+ bool operator!=(const entry_iterator& other) const;
+ entry_iterator& operator++(int); // NOTE(review): postfix form returns a reference, not the usual by-value copy.
+ entry_iterator& operator++();
+ const Path& operator*() const; // Read-only access: the referenced Path is const.
+ const Path* operator->() const; // Read-only access: the referenced Path is const.
+
+ // "Copy" operations that really move the content out of `other`, see class docs
+ entry_iterator(const entry_iterator& other);
+ entry_iterator& operator=(const entry_iterator& other);
+ private:
+ void open_native_handle();
+ void close_native_handle();
+
+ const Path* mp_directory; ///< Path requested to read from.
+ void* mp_cur; ///< Native handle to the opened directory.
+ Path* mp_cur_path; ///< Path instance of the path pointed to by mp_cur (only a pointer to allow forward-declaration of Path).
+ };
+}
+
+#endif /* PATHIE_ENTRY_ITERATOR_HPP */
diff --git a/src/3rd_party/pathie-cpp/include/errors.hpp b/src/3rd_party/pathie-cpp/include/errors.hpp
new file mode 100644
index 00000000..d79fb3c3
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/errors.hpp
@@ -0,0 +1,119 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PATHIE_ERRORS_HPP
+#define PATHIE_ERRORS_HPP
+#include <exception>
+#include <string>
+#include <cstdlib>
+
+/* DWORD is typedef'ed from unsigned long, see
+ * <https://msdn.microsoft.com/en-us/library/cc230318.aspx>
+ * HRESULT is typedef'ed from LONG, which in turn is a typedef
+ * of long, see <https://msdn.microsoft.com/en-us/library/cc230330.aspx>.
+ * I spell the types out here in this header to avoid having to
+ * include windows.h, which might interfere with programmes using
+ * pathie that want to include windows.h themselves. */
+
+#include "pathie.hpp"
+
+namespace Pathie {
+
+ /// Base class for all exceptions in this library.
+ class PathieError: public std::exception {
+ public:
+ PathieError(); ///< Constructs a new instance.
+ PathieError(std::string message); ///< Constructs a new instance with the given what() message.
+ virtual ~PathieError() throw();
+
+ virtual const char* what() const throw(); ///< The error message.
+ protected:
+ std::string m_pathie_errmsg; ///< The error message given in the constructor.
+ };
+
+
+ /// This exception is thrown when a call to a C/system function results
+ /// in `errno` being set.
+ class ErrnoError: public PathieError {
+ public:
+ ErrnoError(int val); ///< Constructs a new instance from the given `errno` value.
+ virtual ~ErrnoError() throw();
+
+ inline int get_val() const {return m_val;} ///< The `errno` value; const so it is callable on an exception caught by const reference.
+ private:
+ int m_val; ///< Value reported by get_val().
+ };
+
+#ifdef _WIN32
+
+ /// This exception is thrown only on Windows, when a call to the Win32API
+ /// fails.
+ /// The "unsigned long" type here is actually DWORD (which it is a
+ /// typedef of in Win32).
+ class WindowsError: public PathieError {
+ public:
+ WindowsError(unsigned long val); ///< Constructs a new instance from the given GetLastError() value.
+ virtual ~WindowsError() throw();
+
+ inline unsigned long get_val() const {return m_val;} ///< The GetLastError() value, returned with the type it is stored in (no signed conversion).
+ private:
+ unsigned long m_val; ///< Value reported by get_val().
+ };
+
+ /// Similar to WindowsError, this exception is thrown when a HANDLE function
+ /// from the Win32API fails.
+ /// The "long" type here is actually HRESULT (which it is a typedef of in Win32).
+ class WindowsHresultError: public PathieError {
+ public:
+ WindowsHresultError(long value); ///< Constructs a new instance from the given handle function result.
+ virtual ~WindowsHresultError() throw();
+
+ inline long get_val() const {return m_val;} ///< The handle function result; const so it is callable on an exception caught by const reference.
+ private:
+ long m_val; ///< Value reported by get_val(); `long` to match the constructor argument and the accessor.
+ };
+#endif
+
+#ifdef _PATHIE_UNIX
+
+ /// This exception is thrown only on UNIX, when a call to the POSIX glob(3)
+ /// function fails.
+ class GlobError: public PathieError {
+ public:
+ GlobError(int val); ///< Constructs a new instance from the given glob(3) error code.
+ virtual ~GlobError() throw();
+
+ inline int get_val() const {return m_val;} ///< The glob(3) error code; const so it is callable on an exception caught by const reference.
+ private:
+ int m_val; ///< Value reported by get_val().
+ };
+#endif
+
+}
+#endif
diff --git a/src/3rd_party/pathie-cpp/include/path.hpp b/src/3rd_party/pathie-cpp/include/path.hpp
new file mode 100644
index 00000000..90729709
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/path.hpp
@@ -0,0 +1,377 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PATHIE_PATH_HPP
+#define PATHIE_PATH_HPP
+#include <string>
+#include <iostream>
+#include <vector>
+#include <sys/stat.h>
+
+#include "pathie.hpp"
+#include "entry_iterator.hpp"
+
+namespace Pathie {
+
+ // Forward-declare, defined in pathie.cpp.
+#if defined(_WIN32)
+ std::string utf16_to_utf8(std::wstring);
+ std::wstring utf8_to_utf16(std::string);
+#elif defined(_PATHIE_UNIX)
+ std::string utf8_to_filename(const std::string& utf8);
+ std::string filename_to_utf8(const std::string& native_filename);
+#endif
+
+ /**
+ * \brief Main class, describing paths.
+ *
+ * This class represents a single path on the filesystem.
+ * The path does not have to exist, but this class provides
+ * you with means to create it.
+ *
+ * Note on predefined directories
+ * ------------------------------
+ *
+ * This class provides a lot of methods for retrieving information about
+ * system and user predefined directories. Note however that the
+ * referenced directories may or may not exist.
+ *
+ * See the pathlist.md document for an overview of possible path
+ * return values.
+ *
+ * Note on XDG directories on UNIX
+ * -------------------------------
+ *
+ * Nowadays UNIX systems have adopted the Freedesktop.org
+ * XDG standards, and it is highly recommended to follow them
+ * when you write an application that stores user-specific data.
+ * XDG directories fall in two groups: Core data directories, covered
+ * by the main XDG specification, and user-dir directories, described
+ * in the documentation of the XDG user-dirs software. Directories of
+ * the first group are available today on all Linux systems, examples
+ * for them are ~/.config, ~/.local/share, and others. Directories
+ * of the latter group are typically found on desktop systems and
+ * are missing on servers, examples include ~/Documents and ~/Downloads.
+ *
+ * The following XDG specifications are followed:
+ *
+ * * XDG main specification: http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
+ * * XDG user-dirs specification: http://www.freedesktop.org/wiki/Software/xdg-user-dirs/
+ *
+ * Pathie is not a Shellscript parser, so it will fail if your XDG configuration
+ * files do not follow the usually found format. Especially no other variable
+ * substitution except from exactly one $HOME is understood.
+ *
+ * While the XDG specification for the core directories clearly says
+ * which directory to use if the administrator/user has not specified
+ * in his system configuration (by setting the appropriate environment
+ * variables), the user-dirs isn’t that easy. Or rather, it is, but not
+ * all desktop environments bother to follow it. The XDG user-dirs spec
+ * requires a file `~/.config/user-dirs.dirs` to exist, generated by the
+ * program xdg-user-dirs-update(1), which is run by all major desktop
+ * environments. Smaller ones don’t always do that, resulting in the file
+ * missing. The spec leaves open what should happen in such a case, i.e.
+ * it’s implementation-defined behaviour. I have chosen to return the
+ * user’s $HOME directory in such a case. The methods affected by this
+ * decision are the following ones:
+ *
+ * * documents_dir()
+ * * download_dir()
+ * * music_dir()
+ * * pictures_dir()
+ * * publicshare_dir()
+ * * templates_dir()
+ * * videos_dir()
+ *
+ * Other notes
+ * -----------
+ *
+ * On UNIX, this library follows the Filesystem Hierarchy Standard,
+ * version 2.3 (http://refspecs.linuxfoundation.org/FHS_2.3/fhs-2.3.html).
+ *
+ * On UNIX, the FHS defines a "normal" file hierarchy and a "local" one; for
+ * example, /usr/share is part of the "normal" file hierarchy, which is mirrored
+ * to the "local" one in /usr/local/share. The "local" hierarchy is intended to be
+ * used by programs that the system administrator manually installed without resorting
+ * to the system’s default package manager. Such a difference does not exist on Windows.
+ * Pathie allows you to decide yourself which information you want to query when calling
+ * one of the following functions:
+ *
+ * * global_mutable_data_dir()
+ * * global_immutable_data_dir()
+ * * global_config_dir()
+ * * global_cache_dir()
+ * * global_appentries_dir()
+ *
+ * Each of these functions takes an argument that allows you to specify whether
+ * you want the "local" or the "normal" hierarchy’s paths returned. The argument
+ * however is optional, and you can use the set_global_dir_default() method to
+ * specify what should happen if no argument is specified. By default, paths of
+ * the "local" hierarchy are returned. For example:
+ *
+ * ~~~~~~~~~~~~~~~~~~~ c++
+ * Path p1 = Path::global_immutable_data_dir(); // /usr/local/share
+ * Path p2 = Path::global_immutable_data_dir(Path::LOCALPATH_NORMAL); // /usr/share
+ * Path p3 = Path::global_immutable_data_dir(Path::LOCALPATH_LOCAL); // /usr/local/share
+ *
+ * Path::set_global_dir_default(Path::LOCALPATH_NORMAL);
+ * Path p4 = Path::global_immutable_data_dir(); // /usr/share
+ * Path p5 = Path::global_immutable_data_dir(Path::LOCALPATH_LOCAL); // /usr/local/share
+ * Path p6 = Path::global_immutable_data_dir(Path::LOCALPATH_NORMAL); // /usr/share
+ * ~~~~~~~~~~~~~~~~~~~
+ *
+ * As you can see, the argument, if given, always takes precedence over the
+ * default set with set_global_dir_default().
+ */
+ class Path
+ {
+ public:
+
+ /**
+ * Specifies the argument type for the `global_*_dir()` functions.
+ * `LOCALPATH_DEFAULT` means fall back to the default set with `set_global_dir_default()`,
+ * `LOCALPATH_NORMAL` means to use the normal FHS paths, and `LOCALPATH_LOCAL` means to use
+ * the paths the FHS specifies for local additions.
+ */
+ enum localpathtype {
+ LOCALPATH_DEFAULT = 1,
+ LOCALPATH_NORMAL,
+ LOCALPATH_LOCAL
+ };
+
+ /// Default constructor.
+ Path();
+ /// Copy constructor.
+ Path(const Path& path);
+ /// Construct a path from a string.
+ Path(std::string path);
+ /// Construct a path from components.
+ Path(const std::vector<Path>& components);
+
+#if defined(_PATHIE_UNIX)
+ static inline Path from_native(const std::string& native_filename)
+ { return Path(filename_to_utf8(native_filename)); }
+#elif defined(_WIN32)
+ /** Convert a path that is in the native representation of
+ * the system into a Path instance. The argument will be
+ * transcoded from the system’s native encoding to UTF-8;
+ * on Windows, the argument is expected to be UTF-16LE therefore,
+ * while on UNIX, it is expected to be encoded in the environment’s
+ * locale. */
+ static inline Path from_native(const std::wstring& native_filename)
+ { return Path(utf16_to_utf8(native_filename)); }
+#else
+#error Unsupported system.
+#endif
+
+ /// Returns the current working directory.
+ static Path pwd();
+ /// Returns the path to the running executable.
+ static Path exe();
+ /// Returns the home directory.
+ static Path home();
+
+ static Path data_dir(); ///< Directory for permanent user data
+ static Path config_dir(); ///< Directory for permanent user configuration files
+ static Path cache_dir(); ///< Directory for cached user data
+ static Path runtime_dir(); ///< Directory for volatile information
+ static Path temp_dir(); ///< Directory for temporary data
+ static Path desktop_dir(); ///< User’s desktop directory
+ static Path documents_dir(); ///< User’s documents directory
+ static Path download_dir(); ///< User’s download directory
+ static Path music_dir(); ///< User’s music directory
+ static Path pictures_dir(); ///< User’s pictures directory
+ static Path publicshare_dir(); ///< User’s networking directory
+ static Path templates_dir(); ///< User’s document templates directory
+ static Path videos_dir(); ///< User’s video directory
+ static Path appentries_dir(); ///< User’s application starters directory
+
+ static Path global_mutable_data_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for mutable permanent data
+ static Path global_immutable_data_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for immutable permanent data
+ static Path global_config_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for configuration files
+ static Path global_cache_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for cached data
+ static Path global_runtime_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for volatile information
+ static Path global_appentries_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global application starters directory
+ static Path global_programs_dir(); ///< Global directory for selfcontained programs
+
+ static Path mktmpdir(const std::string& name = "tmpd"); ///< Create a temporary directory
+
+ static inline void set_global_dir_default(localpathtype localdefault){ c_localdefault = localdefault; } ///< Specify what to do for the `global_*_dir()` methods if no argument is passed to them.
+ static inline localpathtype get_global_dir_default(){ return c_localdefault; } ///< Returns what was set with set_global_dir_default().
+
+#ifdef _PATHIE_UNIX
+ static std::vector<Path> data_dirs();
+ static std::vector<Path> config_dirs();
+#endif
+
+ /// Shell-like glob.
+ static std::vector<Path> glob(const std::string& pattern, int flags = 0);
+ /// Traverse directory recursively.
+ void find(bool (*cb)(const Path& entry)) const;
+
+ /// Return the path as a raw std::string.
+ std::string str() const;
+ /// Alias for str().
+ std::string utf8_str() const;
+ /// Assign the given string to the underlying path.
+ void assign(std::string str);
+
+#if defined(_PATHIE_UNIX)
+ std::string native() const;
+#elif defined(_WIN32)
+ /// Return the path in the native format.
+ std::wstring native() const;
+#else
+#error Unsupported system.
+#endif
+
+ void swap(Path& path) throw();
+
+ /// Number of components in the path string.
+ size_t component_count() const;
+ /// Burst path into components.
+ std::vector<Path> burst(bool descend = false) const;
+ /// Shell-like globbing.
+ std::vector<Path> dglob(const std::string& pattern, int flags = 0) const;
+ /// Glob pattern check without filesystem access.
+ bool fnmatch(const std::string& pattern, int flags = 0) const;
+
+ Path& operator=(const Path& path);
+ Path& operator=(const std::string& str);
+ /// Access single component in the path.
+ Path operator[](size_t index) const;
+ bool operator==(const Path& path) const;
+ bool operator!=(const Path& path) const;
+ bool operator<(const Path& path) const;
+ bool operator>(const Path& path) const;
+ bool operator<=(const Path& path) const;
+ bool operator>=(const Path& path) const;
+
+ Path operator/(Path path) const;
+ Path operator/(std::string str) const;
+ Path& operator/=(Path path);
+ Path& operator/=(std::string str);
+ Path join(Path path) const;
+ Path join(std::string path) const;
+ Path sub_ext(std::string new_extension) const;
+
+ /// Platform-independent C fopen().
+ FILE* fopen(const char* mode) const;
+ /// Update modification and access time to now.
+ void touch() const;
+
+ bool is_absolute() const; ///< Checks if a path is absolute.
+ bool is_relative() const; ///< Checks if a path is relative.
+ bool is_root() const; ///< Checks if a path is the file system root.
+
+ /// Remove all . and .. occurrences.
+ Path prune() const;
+ /// Creates an absolute path for this path.
+ Path absolute(const Path& base = Path::pwd()) const;
+ /// Creates a relative path from an absolute one.
+ Path relative(Path base) const;
+ /// Expands all shortcuts plus create an absolute path for this path.
+ Path expand() const;
+ /// Get the one real path for this path.
+ Path real() const;
+
+ Path parent() const;
+ Path root() const;
+ Path basename() const;
+ Path dirname() const;
+ std::string extension() const;
+ void split(Path& dirname, Path& basename) const;
+
+ /// C stat information.
+#if defined(_PATHIE_UNIX)
+ struct stat* stat() const;
+#elif defined(_WIN32)
+ struct _stat* stat() const;
+#else
+#error Unsupported system.
+#endif
+
+ /// File size.
+ long size() const;
+ time_t atime() const;
+ time_t mtime() const;
+ time_t ctime() const;
+
+ /// List of entries.
+ std::vector<Path> entries() const;
+ /// List of children.
+ std::vector<Path> children() const;
+
+ bool exists() const;
+ bool is_directory() const;
+ bool is_file() const;
+ bool is_symlink() const;
+
+ Path readlink() const;
+ /// Create a symbolic link.
+ void make_symlink(const Path& target) const;
+ void mkdir() const;
+ void rmdir() const;
+ void unlink() const;
+ void remove() const;
+ /// "mkdir -p"-like functionality.
+ void mktree() const;
+ /// "rm -r"-like functionality.
+ void rmtree() const;
+ /// Change file names.
+ void rename(Path& newname) const;
+
+ entry_iterator begin_entries() const;
+ entry_iterator end_entries() const;
+
+ private:
+ static std::string make_tempname(const std::string& namepart);
+ // Remove double // and trailing /, replace \ with /.
+ void sanitize();
+
+#if defined(_PATHIE_UNIX)
+ static Path get_xdg_dir(const std::string& envvarname, const std::string& defaultpath);
+ static std::vector<Path> get_xdg_dirlist(const std::string& envvarname, const std::string& defaultlist);
+ static std::string get_xdg_userdir_setting(const std::string& setting);
+ static std::string get_home(std::string username);
+#elif defined(_WIN32)
+ bool is_ntfs_symlink(const wchar_t* path) const;
+ wchar_t* read_ntfs_symlink(const wchar_t* path) const;
+#endif
+
+ static localpathtype c_localdefault;
+ std::string m_path;
+ };
+
+}
+
+/// std::cout compatibility.
+std::ostream& operator<<(std::ostream& stream, const Pathie::Path& p);
+
+#endif
diff --git a/src/3rd_party/pathie-cpp/include/pathie.hpp b/src/3rd_party/pathie-cpp/include/pathie.hpp
new file mode 100644
index 00000000..6afbf5b0
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/pathie.hpp
@@ -0,0 +1,67 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PATHIE_PATHIE_HPP
+#define PATHIE_PATHIE_HPP
+#if __cplusplus < 199711L
+#error Pathie requires C++98 support. Please use an option such as -std=c++98 to enable it.
+#endif
+
+#if !defined(_PATHIE_UNIX) && (defined(unix) || defined(__unix__) || defined(__unix) || defined(__APPLE__) || defined(BSD))
+#define _PATHIE_UNIX
+#endif
+
+#include <string>
+
+/// Namespace for this library.
+namespace Pathie {
+
+ /// Returns the version number as MAJOR.MINOR.TINY.
+ std::string version();
+
+ /**
+ * Returns the Git commit this was built from.
+ * Empty string if built without Git.
+ */
+ std::string gitrevision();
+
+#ifdef _WIN32
+ std::string utf16_to_utf8(std::wstring);
+ std::wstring utf8_to_utf16(std::string);
+#endif
+
+#ifdef _PATHIE_UNIX
+ std::string utf8_to_filename(const std::string& utf8);
+ std::string filename_to_utf8(const std::string& native_filename);
+ std::string convert_encodings(const char* from_encoding, const char* to_encoding, const std::string& string);
+#endif
+
+}
+
+#endif
diff --git a/src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp b/src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp
new file mode 100644
index 00000000..c5736b37
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp
@@ -0,0 +1,111 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PATHIE_IFSTREAM_HPP
+#define PATHIE_IFSTREAM_HPP
+#include <fstream>
+
+#if defined(_WIN32) && defined(__GNUC__)
+#include <ext/stdio_filebuf.h>
+#endif
+
+#include "path.hpp"
+
+namespace Pathie {
+
+#if defined(_PATHIE_UNIX)
+ class ifstream: public std::ifstream { // Variant declared when _PATHIE_UNIX is defined.
+ public:
+ ifstream();
+ ifstream(char* path, std::ios_base::openmode = std::ios_base::in); // NOTE(review): takes non-const char*, unlike open(const char*) below.
+ ifstream(std::string path, std::ios_base::openmode = std::ios_base::in);
+ ifstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::in);
+
+ void open(const char* filename, ios_base::openmode mode = ios_base::in);
+ void open(const std::string& filename, ios_base::openmode mode = ios_base::in);
+ void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::in);
+ };
+
+#elif defined (_WIN32)
+# if defined(__GNUC__)
+ /**
+ * \brief Input stream for UTF-8-encoded filenames.
+ *
+ * This class implements an interface like `std::ifstream` that works
+ * with Unicode paths regardless of the platform. Please refer to
+ * the documentation of Pathie::ofstream for more information on
+ * rationale and usage; this class works the same way as Pathie::ofstream,
+ * just for input rather than output file streams.
+ */
+ class ifstream: public std::basic_istream<char, std::char_traits<char> > // Variant declared for _WIN32 with __GNUC__.
+ {
+ public:
+ typedef char char_type; ///< Type used inside the stream.
+ typedef std::char_traits<char> traits_type; ///< Traits type
+ typedef typename traits_type::int_type int_type; ///< Int type
+ typedef typename traits_type::pos_type pos_type; ///< pos type
+ typedef typename traits_type::off_type off_type; ///< offset type
+
+ ifstream();
+ explicit ifstream(const char* filename, ios_base::openmode mode = ios_base::in);
+ explicit ifstream(const std::string& filename, ios_base::openmode mode = ios_base::in);
+ explicit ifstream(const Pathie::Path& filename, ios_base::openmode mode = ios_base::in);
+ ~ifstream();
+
+ __gnu_cxx::stdio_filebuf<char>* rdbuf() const;
+ bool is_open() const; // C++11 mandates const this, C++98 hadn’t that
+ void open(const char* filename, ios_base::openmode mode = ios_base::in);
+ void open(const std::string& filename, ios_base::openmode mode = ios_base::in);
+ void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::in);
+ void close();
+
+ private:
+ FILE* mp_file; // presumably the C stream backing mp_filebuffer — confirm in pathie_ifstream.cpp
+ __gnu_cxx::stdio_filebuf<char>* mp_filebuffer; // same pointer type as returned by rdbuf()
+ bool m_buffer_allocated; // presumably tracks whether mp_filebuffer is currently allocated — confirm in pathie_ifstream.cpp
+ };
+
+# elif defined(_MSC_VER)
+ class ifstream: public std::ifstream { // Variant declared for _WIN32 with _MSC_VER; declares no open() overloads of its own.
+ public:
+ ifstream();
+ ifstream(char* path, std::ios_base::openmode = std::ios_base::in);
+ ifstream(std::string path, std::ios_base::openmode = std::ios_base::in);
+ ifstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::in);
+ };
+
+# else
+# error Unsupported compiler: do not know how to open C++ stream on Unicode file.
+# endif
+#else
+# error Unsupported system.
+#endif
+
+}
+#endif
diff --git a/src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp b/src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp
new file mode 100644
index 00000000..1ff43e6c
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp
@@ -0,0 +1,192 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PATHIE_OFSTREAM_HPP
+#define PATHIE_OFSTREAM_HPP
+
+#if defined(_WIN32) && defined(__GNUC__)
+#include <ostream>
+#include <ext/stdio_filebuf.h>
+#else
+#include <fstream>
+#endif
+
+#include "path.hpp"
+
+namespace Pathie {
+
+#if defined(_PATHIE_UNIX)
+ class ofstream: public std::ofstream {
+ public:
+ // UNIX variant: a subclass of std::ofstream that adds constructors and
+ // open() overloads accepting a Pathie::Path.
+ ofstream();
+ // NOTE(review): this overload takes a non-const char*, so a string literal
+ // does not bind to it in C++11 and later -- confirm whether const char*
+ // was intended (open() below uses const char*).
+ ofstream(char* path, std::ios_base::openmode = std::ios_base::out);
+ ofstream(std::string path, std::ios_base::openmode = std::ios_base::out);
+ ofstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::out);
+
+ // Note the different defaults: the constructors default to `out`, while
+ // the open() overloads default to `out | trunc`.
+ void open(const char* filename, ios_base::openmode mode = ios_base::out | ios_base::trunc);
+ void open(const std::string& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc);
+ void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc);
+ };
+#elif defined (_WIN32)
+# if defined(__GNUC__)
+ /**
+ * \brief Output stream for UTF-8-encoded filenames.
+ *
+ * Unicode filenames with C++ are horrible, and this is why the Pathie library
+ * was written in the first place. However, working with paths may be nice,
+ * but what does this mean for you if you cannot actually open the file
+ * whose path you have been manipulating? On UNIX, the `std::ofstream` class
+ * will work just as expected if you pass it a UTF-8 unicode filename and it
+ * will open exactly the path you specified. Windows however uses UTF-16LE
+ * as the encoding for pathnames, and the same code that runs on UNIX will
+ * produce garbage filenames on Windows. Take this as an example:
+ *
+ * ~~~~~~~~~~~~~~~~~ c++
+ * std::ofstream file("Bärenstark.txt");
+ * file << "Some content" << std::endl;
+ * file.close();
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * The file will appear as expected on UNIX, but on Windows it will have
+ * a garbage filename because Windows interprets filenames based on the
+ * `char` type as in the local encoding (Windows-1252 on a Western European
+ * Windows system). You have to use filenames based on `wchar_t` on Windows
+ * to get the desired effect. This, however, doesn’t work either:
+ *
+ * ~~~~~~~~~~~~~~~~~ c++
+ * std::ofstream file(L"Bärenstark.txt");
+ * file << "Some content" << std::endl;
+ * file.close();
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * That is, it works only with the Microsoft Visual C++ Compiler (MSVC). The reason
+ * for this is that the ISO C++ standard does not specify a constructor
+ * that takes filenames based on `wchar_t`, but only on `char`, which Windows
+ * interprets as described above. That’s a nice proof of how Windows tries
+ * to be inherently different from all other modern OSes in this world, and
+ * how it makes simple tasks a pain if you want cross-platform behaviour.
+ * GCC on Windows, as distributed by the MinGW project, does not support the
+ * nonstandard constructor. As it stands, you **cannot** create Unicode files
+ * via the standard C++ interface with MinGW GCC. There is, however, a special
+ * function in the Windows API called `_wfopen()` that lets you at least open
+ * a file via a `fopen()`-like C API. Thankfully GCC provides a (also nonstandard)
+ * measure to create a filebuffer (this is what is used by the C++ streams
+ * under the hood to access the files) from a C `FILE*`. This class wraps
+ * that GNU C++ extension (`__gnu_cxx::stdio_filebuf`) on Windows, as well as it wraps
+ * the standard stream API on other platforms. It therefore unites the different
+ * access methods under a single uniform interface that allows you to
+ * create Unicode filenames regardless of the platform you run on.
+ *
+ * Let’s revisit the previous example, now with Pathie’s streams:
+ *
+ * ~~~~~~~~~~~~~~~~~ c++
+ * Pathie::ofstream file("Bärenstark.txt");
+ * file << "Some content" << std::endl;
+ * file.close();
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * The `Pathie::ofstream` constructor takes a UTF-8 string and does the
+ * necessary conversion to UTF-16, uses `_wfopen()` under the hood to access
+ * the file, and then wraps a C++ stream around the already opened file
+ * descriptor. On platforms other than MiNGW Windows, the `Pathie::ofstream` class
+ * will just delegate to the standard `std::ofstream` class. As a bonus,
+ * if you compile with MSVC the nonstandard constructor described above
+ * is used.
+ *
+ * Of course, there’s also a constructor that will make it work directly
+ * with instances of Pathie::Path:
+ *
+ * ~~~~~~~~~~~~~~~~~ c++
+ * Pathie::Path p("Bärenstark.txt");
+ * Pathie::ofstream file(p);
+ * file << "Some content" << std::endl;
+ * file.close();
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * That is, you can stay with UTF-8 `char`-based strings (like `std::string`)
+ * for anything you use. Ain’t that great?
+ *
+ * \warning On Windows, this class tries to behave as similar as the standard
+ * `std::ofstream` as possible. Due to the file descriptor magic it does under
+ * the hood, however, there is a little difference: If you construct an
+ * instance of this class without associating it immediately with a filename
+ * (the constructor without arguments), using any methods apart from `is_open()`
+ * (which is specifically implemented for that purpose) that use the underlying
+ * filebuffer will result in segmentation faults, because the filebuffer has
+ * not yet been constructed (the area where it will be constructed into is
+ * full of NUL bytes if you wonder).
+ *
+ * \note Please refer to your preferred C++ STL documentation for the
+ * `std::ofstream` class for general usage of C++ file streams.
+ */
+ class ofstream: public std::basic_ostream<char, std::char_traits<char> >
+ {
+ public:
+ typedef char char_type; ///< Type used inside the stream.
+ typedef std::char_traits<char> traits_type; ///< Traits type
+ typedef typename traits_type::int_type int_type; ///< Int type
+ typedef typename traits_type::pos_type pos_type; ///< pos type
+ typedef typename traits_type::off_type off_type; ///< offset type
+
+ /// Constructs a stream without naming a file to open.
+ ofstream();
+ /// The three constructors below take the file name as a C string, a
+ /// std::string or a Pathie::Path. `mode` defaults to `out | trunc`. They
+ /// are `explicit`, so a file name never converts to a stream implicitly.
+ explicit ofstream(const char* filename, ios_base::openmode mode = ios_base::out|ios_base::trunc);
+ explicit ofstream(const std::string& filename, ios_base::openmode mode = ios_base::out|ios_base::trunc);
+ explicit ofstream(const Pathie::Path& filename, ios_base::openmode mode = ios_base::out|ios_base::trunc);
+ ~ofstream();
+
+ /// Declared with the GNU stdio_filebuf pointer type as return type
+ /// instead of the generic std::streambuf pointer.
+ __gnu_cxx::stdio_filebuf<char>* rdbuf() const;
+ bool is_open() const; // C++11 mandates const this, C++98 hadn’t that
+ /// open() mirrors the constructors: same three file name types and the
+ /// same default mode (`out | trunc`).
+ void open(const char* filename, ios_base::openmode mode = ios_base::out | ios_base::trunc);
+ void open(const std::string& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc);
+ void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc);
+ void close();
+
+ private:
+ // NOTE(review): presumably the C stream the file is opened through and
+ // the GNU filebuffer wrapped around it -- confirm in pathie_ofstream.cpp.
+ FILE* mp_file;
+ __gnu_cxx::stdio_filebuf<char>* mp_filebuffer;
+ // NOTE(review): presumably records whether mp_filebuffer currently
+ // points to an allocated buffer -- confirm in pathie_ofstream.cpp.
+ bool m_buffer_allocated;
+ };
+
+# elif defined(_MSC_VER)
+ class ofstream: public std::ofstream {
+ public:
+ // MSVC variant: a subclass of std::ofstream that only declares constructors.
+ ofstream();
+ // NOTE(review): this overload takes a non-const char*, so a string literal
+ // does not bind to it in C++11 and later -- confirm whether const char*
+ // was intended (the MinGW variant uses const char*).
+ ofstream(char* path, std::ios_base::openmode = std::ios_base::out);
+ // The two overloads below take their argument by value; the open mode
+ // defaults to std::ios_base::out for all of them.
+ ofstream(std::string path, std::ios_base::openmode = std::ios_base::out);
+ ofstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::out);
+ };
+# else
+# error Unsupported compiler: do not know how to open C++ stream on Unicode file.
+# endif
+#else
+# error Unsupported system.
+#endif
+
+}
+#endif
diff --git a/src/3rd_party/pathie-cpp/include/temp.hpp b/src/3rd_party/pathie-cpp/include/temp.hpp
new file mode 100644
index 00000000..02a35879
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/include/temp.hpp
@@ -0,0 +1,83 @@
+#ifndef PATHIE_TEMPDIR_HPP
+#define PATHIE_TEMPDIR_HPP
+#include "path.hpp"
+
+namespace Pathie {
+
+ /**
+ * A class for working with temporary entries; this is the
+ * superclass of Tempdir and Tempfile that encapsulates the common
+ * logic between the two. This class cannot be instantiated
+ * directly, instead use Tempdir and Tempfile.
+ *
+ * This class relies on `rand()` when generating the temporary
+ * path name. Therefore, it is recommended to initialise the
+ * random number generator before creating instances of this class
+ * by calling the `srand()` function.
+ *
+ * In a multithreaded environment, this class generates conflicting
+ * directory names if the C random number generator is in the same state
+ * in two threads and an instance of Tempdir is constructed in these two
+ * threads in the very same second. You should not use an instance of
+ * this class in multiple threads.
+ */
+ class TempEntry
+ {
+ public:
+ // NOTE(review): `namepart` is presumably a fragment that ends up in the
+ // generated temporary name -- confirm in temp.cpp.
+ TempEntry(std::string namepart);
+ virtual ~TempEntry();
+
+ // Pure virtual: each concrete subclass supplies its own removal logic.
+ virtual void remove() const = 0;
+ // Sets the keep flag; calling keep() without an argument passes `true`.
+ void keep(bool k = true);
+
+ // Accessors; path() returns a Path by value.
+ Path path() const;
+ bool is_kept() const;
+ protected:
+ // NOTE(review): presumably the state behind keep()/is_kept() and path()
+ // respectively -- confirm in temp.cpp.
+ bool m_keep;
+ Path m_path;
+ };
+
+ /**
+ * Class for working with temporary directories. Creating
+ * an instance of this class creates a temporary directory,
+ * which is removed again when the object is destroyed.
+ * If you want to keep the directory for whatever reason,
+ * call TempEntry::keep().
+ *
+ * Call TempEntry::path() to retrieve the path of the
+ * generated directory.
+ *
+ * See the docs for the TempEntry class for information
+ * on how the temporary names are generated.
+ */
+ class Tempdir: public TempEntry
+ {
+ public:
+ // Same constructor signature as TempEntry: a by-value std::string.
+ Tempdir(std::string namepart);
+ virtual ~Tempdir();
+ // Overrides the pure virtual TempEntry::remove().
+ virtual void remove() const;
+ };
+
+ /**
+ * Class for working with temporary files. Creating
+ * an instance of this class creates a temporary file,
+ * which is removed again when the object is destroyed.
+ * If you want to keep the file for whatever reason,
+ * call TempEntry::keep().
+ *
+ * Call TempEntry::path() to retrieve the path of the
+ * generated file.
+ *
+ * See the docs for the TempEntry class for information
+ * on how the temporary names are generated.
+ */
+ class Tempfile: public TempEntry
+ {
+ public:
+ // Same constructor signature as TempEntry: a by-value std::string.
+ Tempfile(std::string namepart);
+ virtual ~Tempfile();
+ // Overrides the pure virtual TempEntry::remove().
+ virtual void remove() const;
+ };
+}
+
+#endif /* PATHIE_TEMPDIR_HPP */
diff --git a/src/3rd_party/pathie-cpp/src/entry_iterator.cpp b/src/3rd_party/pathie-cpp/src/entry_iterator.cpp
new file mode 100644
index 00000000..e2ecb2fe
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/entry_iterator.cpp
@@ -0,0 +1,279 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../include/entry_iterator.hpp"
+#include "../include/path.hpp"
+#include "../include/errors.hpp"
+
+#if defined(__unix__)
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <stdexcept>
+#elif defined(_WIN32)
+#include <Windows.h>
+#else
+#error Unsupported system
+#endif
+
+using namespace Pathie;
+
+/**
+ * Builds the terminal ("end") iterator: it has neither a directory nor
+ * a native handle. Compare a running iterator against an instance
+ * created this way to find out whether the iteration is over.
+ */
+entry_iterator::entry_iterator()
+    : mp_directory(NULL)
+    , mp_cur(NULL)
+    , mp_cur_path(new Path)
+{
+    // Nothing is opened here; only the current-path slot is allocated.
+}
+
+/**
+ * Construct an iterator that reads the entries in the given directory
+ * and position it on the first entry.
+ *
+ * \param[in] p_directory Directory to iterate. The iterator only borrows
+ * this pointer; it is never deleted by this class.
+ *
+ * Whatever open_native_handle() throws is propagated to the caller after
+ * the resources acquired so far have been released.
+ */
+entry_iterator::entry_iterator(const Path* p_directory)
+    : mp_directory(p_directory),
+      mp_cur(NULL),
+      mp_cur_path(new Path())
+{
+    try {
+        open_native_handle();
+    }
+    catch (...) {
+        // The destructor does not run for an object whose constructor
+        // throws, so the handle and the Path must be released here.
+        close_native_handle();
+        delete mp_cur_path;
+        throw;
+    }
+}
+
+/**
+ * Destructor. Releases the native handle (if one is open) and the
+ * Path object holding the current entry.
+ */
+entry_iterator::~entry_iterator()
+{
+    close_native_handle();
+
+    // Deleting a null pointer is a no-op, so no explicit check is needed.
+    delete mp_cur_path;
+
+    // `mp_directory' is left alone: this class only borrows it.
+}
+
+/**
+ * Opens the native handle to the directory `mp_directory` and reads the
+ * first entry from it into `*mp_cur_path`.
+ *
+ * If the handle cannot be opened, `mp_cur` stays NULL and an ErrnoError
+ * (UNIX) or WindowsError (Windows) is thrown. On UNIX, if the handle
+ * opens but readdir() delivers no entry at all, the handle is closed
+ * again, which leaves the iterator in the finished state.
+ */
+void entry_iterator::open_native_handle()
+{
+#if defined(_PATHIE_UNIX)
+    std::string nstr = mp_directory->native();
+    mp_cur = opendir(nstr.c_str());
+
+    if (!mp_cur) {
+        throw(Pathie::ErrnoError(errno));
+    }
+
+    // readdir() returns NULL at end-of-directory as well as on error, so
+    // the result has to be checked before `d_name' is read.
+    struct dirent* p_dirent = readdir(static_cast<DIR*>(mp_cur));
+    if (p_dirent) {
+        *mp_cur_path = filename_to_utf8(p_dirent->d_name);
+    }
+    else {
+        close_native_handle(); // nothing to iterate: finished right away
+    }
+#elif defined(_WIN32)
+    std::wstring utf16 = utf8_to_utf16(mp_directory->str() + "/*");
+    WIN32_FIND_DATAW finddata;
+
+    mp_cur = FindFirstFileW(utf16.c_str(), &finddata);
+    if (static_cast<HANDLE>(mp_cur) == INVALID_HANDLE_VALUE) {
+        // Fetch the error code first, then reset the handle so that
+        // close_native_handle() never sees INVALID_HANDLE_VALUE.
+        DWORD err = GetLastError();
+        mp_cur = NULL;
+        throw(Pathie::WindowsError(err));
+    }
+    else {
+        *mp_cur_path = utf16_to_utf8(finddata.cFileName);
+    }
+#else
+#error Unsupported system
+#endif
+}
+
+/// Closes the native handle, if any, and resets the current entry.
+/// Calling this on an iterator without an open handle does nothing.
+void entry_iterator::close_native_handle()
+{
+    if (mp_cur) {
+#if defined(_PATHIE_UNIX)
+        closedir(static_cast<DIR*>(mp_cur));
+#elif defined(_WIN32)
+        FindClose(static_cast<HANDLE>(mp_cur));
+#endif
+
+        // Back to the "finished" state: default path, no handle.
+        *mp_cur_path = Path();
+        mp_cur = NULL;
+    }
+}
+
+/**
+ * Increment operator. Moves the iterator on to the next directory
+ * entry. Once no further entry is available, the native handle is
+ * closed; from then on the iterator compares equal to a
+ * default-constructed one and dereferencing it yields an undefined
+ * result.
+ *
+ * \remark Unlike a conventional postfix increment, this operator does
+ * *not* return the value the iterator had before, because that would
+ * require a real copy of the receiver, which this class cannot provide.
+ * *Do not rely* on the return value of this method.
+ *
+ * Throws std::range_error if the iterator has already finished.
+ */
+entry_iterator& entry_iterator::operator++(int)
+{
+    // A finished iterator has no handle left to read from.
+    if (!mp_cur) {
+        throw(std::range_error("Tried to advance a finished entry_iterator!"));
+    }
+
+#if defined(_PATHIE_UNIX)
+    struct dirent* p_next = readdir(static_cast<DIR*>(mp_cur));
+    if (!p_next) {
+        close_native_handle();
+    }
+    else {
+        *mp_cur_path = filename_to_utf8(p_next->d_name);
+    }
+#elif defined(_WIN32)
+    WIN32_FIND_DATAW next_entry;
+    if (!FindNextFileW(static_cast<HANDLE>(mp_cur), &next_entry)) {
+        close_native_handle();
+    }
+    else {
+        *mp_cur_path = utf16_to_utf8(next_entry.cFileName);
+    }
+#else
+#error Unsupported system
+#endif
+
+    return *this;
+}
+
+/// Prefix increment. Behaves exactly like the postfix form operator++(int).
+entry_iterator& entry_iterator::operator++()
+{
+    // The dummy int argument selects the postfix overload. Without it
+    // this function would call itself and recurse without end.
+    return operator++(0);
+}
+
+/**
+ * Dereference operator. Gives access to the entry the iterator
+ * currently points at.
+ */
+const Path& entry_iterator::operator*() const
+{
+    const Path& current_entry = *mp_cur_path;
+    return current_entry;
+}
+
+/**
+ * Restarts this iterator on another directory: the current native
+ * handle is closed, `p_directory` becomes the new directory and its
+ * first entry is read.
+ */
+entry_iterator& entry_iterator::operator=(const Path* p_directory)
+{
+    this->close_native_handle();
+
+    this->mp_directory = p_directory;
+    this->open_native_handle();
+
+    return *this;
+}
+
+/**
+ * Boolean operator. In comparisons, this iterator is true if
+ * it has not yet finished, false otherwise.
+ *
+ * "Finished" is tracked through the native handle `mp_cur`, which is
+ * reset to NULL when the last entry has been passed. `mp_directory` is
+ * not suitable for this test because it keeps its value after the
+ * iteration has ended.
+ */
+entry_iterator::operator bool() const
+{
+    return mp_cur != NULL;
+}
+
+/**
+ * Equality test. The outcome depends on what `other` is:
+ *
+ * 1. `other` is a terminal iterator (built by the parameterless
+ *    constructor): the result tells whether the receiver has finished
+ *    iterating its directory.
+ * 2. `other` is any other iterator: both must refer to the same top
+ *    directory and hold the same native handle (hint: this is not
+ *    going to happen under normal circumstances).
+ */
+bool entry_iterator::operator==(const entry_iterator& other) const
+{
+    /* Only the terminal iterator has a null `mp_directory'. Comparing
+     * against it is really the question "am I done?", and an iterator
+     * is done exactly when its own `mp_cur' is null. */
+    const bool other_is_terminal = (other.mp_directory == NULL);
+    if (other_is_terminal)
+        return !mp_cur;
+
+    return mp_directory == other.mp_directory && mp_cur == other.mp_cur;
+}
+
+/// Negation of operator==(); see there for the comparison rules.
+bool entry_iterator::operator!=(const entry_iterator& other) const
+{
+    const bool same = operator==(other);
+    return !same;
+}
+
+/**
+ * Member access operator. Hands out a pointer to the entry the
+ * iterator currently points at.
+ */
+const Path* entry_iterator::operator->() const
+{
+    const Path* p_current_entry = mp_cur_path;
+    return p_current_entry;
+}
+
+/// "Copy" constructor -- see class docs for more info. The new object
+/// takes over directory, handle and current path of `other`; `other`
+/// is reset to the state of a terminal iterator.
+entry_iterator::entry_iterator(const entry_iterator& other)
+    : mp_directory(other.mp_directory)
+    , mp_cur(other.mp_cur)
+    , mp_cur_path(other.mp_cur_path)
+{
+    // `other' is declared const but has to give up its resources.
+    entry_iterator* p_source = const_cast<entry_iterator*>(&other);
+    p_source->mp_directory = NULL;
+    p_source->mp_cur = NULL;
+    p_source->mp_cur_path = new Path;
+}
+
+/// "Copy" assignment -- see class docs for more info. The receiver first
+/// releases what it currently owns (native handle and current path),
+/// then takes over directory, handle and current path of `other`;
+/// `other` is reset to the state of a terminal iterator. Assigning an
+/// iterator to itself leaves it untouched.
+entry_iterator& entry_iterator::operator=(const entry_iterator& other)
+{
+    // Self-assignment must not reset (and thereby lose) our own state.
+    if (this == &other)
+        return *this;
+
+    // Release our own resources before overwriting the members,
+    // otherwise the open handle and the Path object would leak.
+    close_native_handle();
+    delete mp_cur_path;
+
+    mp_directory = other.mp_directory;
+    mp_cur = other.mp_cur;
+    mp_cur_path = other.mp_cur_path;
+
+    // `other' is declared const but has to give up its resources.
+    entry_iterator& e = const_cast<entry_iterator&>(other);
+    e.mp_directory = NULL;
+    e.mp_cur = NULL;
+    e.mp_cur_path = new Path();
+
+    return *this;
+}
+
diff --git a/src/3rd_party/pathie-cpp/src/errors.cpp b/src/3rd_party/pathie-cpp/src/errors.cpp
new file mode 100644
index 00000000..f5e406b1
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/errors.cpp
@@ -0,0 +1,150 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../include/errors.hpp"
+
+#include <cerrno>
+#include <cstring>
+#include <sstream>
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(_PATHIE_UNIX)
+#include <glob.h>
+#endif
+
+using namespace Pathie;
+
+/// Default constructor; installs a generic fallback message.
+PathieError::PathieError()
+{
+    const std::string fallback("Unknown pathie exception.");
+    m_pathie_errmsg = fallback;
+}
+
+/// Constructs the error with `message` as the text reported by what().
+PathieError::PathieError(std::string message)
+{
+    this->m_pathie_errmsg = message;
+}
+
+/// Destructor; there is nothing to release explicitly.
+PathieError::~PathieError() throw()
+{
+}
+
+/// Returns the stored message as a C string. The pointer refers to the
+/// buffer of the message member.
+const char* PathieError::what() const throw()
+{
+    const char* p_text = m_pathie_errmsg.c_str();
+    return p_text;
+}
+
+/// Builds an error from an errno value. The message has the form
+/// "Errno <number>: <strerror() text>".
+ErrnoError::ErrnoError(int val)
+{
+    m_val = val;
+
+    std::ostringstream code;
+    code << val;
+
+    std::string text("Errno ");
+    text += code.str();
+    text += ": ";
+    text += strerror(val);
+    m_pathie_errmsg = text;
+}
+
+/// Destructor; there is nothing to release explicitly.
+ErrnoError::~ErrnoError() throw()
+{
+}
+
+#ifdef _WIN32
+/// Builds an error from a Windows error code (as returned by
+/// GetLastError()). The message has the form
+/// "Windows Error Code <number>: <system message>". If the system has no
+/// message for the code, a placeholder is used instead.
+WindowsError::WindowsError(DWORD val)
+{
+    std::stringstream ss;
+    ss << val;
+
+    m_val = val;
+    m_pathie_errmsg = std::string("Windows Error Code ") + ss.str() + ": ";
+
+    // With FORMAT_MESSAGE_ALLOCATE_BUFFER the "buffer" argument is really
+    // the address of a pointer that receives a LocalAlloc()ed buffer,
+    // hence the cast.
+    wchar_t* buf = NULL;
+    DWORD len = FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                               NULL,
+                               val,
+                               LANG_USER_DEFAULT,
+                               reinterpret_cast<wchar_t*>(&buf),
+                               0,
+                               NULL);
+
+    // FormatMessageW() returns 0 on failure and then leaves `buf' NULL;
+    // a string must not be constructed from it in that case.
+    if (len == 0 || !buf) {
+        m_pathie_errmsg += "(no system message available)";
+        return;
+    }
+
+    // Copy the text and free the system buffer right away, so it cannot
+    // leak if the UTF-8 conversion below throws.
+    std::wstring wide_message(buf);
+    LocalFree(buf);
+
+    m_pathie_errmsg += utf16_to_utf8(wide_message);
+}
+
+/// Destructor; there is nothing to release explicitly.
+WindowsError::~WindowsError() throw()
+{
+}
+
+/// Builds an error from an HRESULT. The message consists of a fixed
+/// prefix followed by the numeric value; no system text is looked up.
+WindowsHresultError::WindowsHresultError(HRESULT val)
+{
+    m_val = val;
+
+    std::ostringstream code;
+    code << val;
+
+    std::string text("Windows HRESULT Error Code :");
+    text += code.str();
+    m_pathie_errmsg = text;
+}
+
+/// Destructor; there is nothing to release explicitly.
+WindowsHresultError::~WindowsHresultError() throw()
+{
+}
+
+#endif
+
+#ifdef _PATHIE_UNIX
+/// Builds an error from a glob() return code. The message has the form
+/// "Glob error code <number>: <name>", where <name> is the symbolic
+/// constant for the three known codes and a generic text otherwise.
+GlobError::GlobError(int val)
+{
+    m_val = val;
+
+    std::ostringstream code;
+    code << val;
+
+    // Map the numeric code to its symbolic name.
+    const char* p_name = "Unknown glob error";
+    if (val == GLOB_NOSPACE)
+        p_name = "GLOB_NOSPACE";
+    else if (val == GLOB_ABORTED)
+        p_name = "GLOB_ABORTED";
+    else if (val == GLOB_NOMATCH)
+        p_name = "GLOB_NOMATCH";
+
+    m_pathie_errmsg = "Glob error code " + code.str() + ": ";
+    m_pathie_errmsg += p_name;
+}
+
+/// Destructor; there is nothing to release explicitly.
+GlobError::~GlobError() throw()
+{
+}
+#endif
diff --git a/src/3rd_party/pathie-cpp/src/path.cpp b/src/3rd_party/pathie-cpp/src/path.cpp
new file mode 100644
index 00000000..99185085
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/path.cpp
@@ -0,0 +1,3348 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../include/path.hpp"
+#include "../include/pathie.hpp"
+#include "../include/errors.hpp"
+
+#include <cstdlib>
+#include <cstdio>
+#include <ctime>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdexcept>
+#include <errno.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <winioctl.h>
+#include <direct.h>
+#include <shlobj.h>
+#include <shlwapi.h>
+//#include <ntifs.h> // Currently not in msys2
+
+#elif defined(_PATHIE_UNIX)
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/param.h> // defines "BSD" macro on BSD systems
+#include <pwd.h>
+#include <glob.h>
+#include <fnmatch.h>
+
+#else
+#error Unsupported system.
+#endif
+
+#ifdef BSD
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#endif
+
+using namespace Pathie;
+using namespace std;
+
+// Out-of-class definition of the static member Path::c_localdefault.
+// Its initial value is LOCALPATH_LOCAL.
+Path::localpathtype Path::c_localdefault = LOCALPATH_LOCAL;
+
+/**
+ * Default constructor. The resulting path is **not** empty: its value
+ * is ".", i.e. the current working directory written as a relative
+ * path (see also pwd()).
+ */
+Path::Path()
+    : m_path(".")
+{
+}
+
+/**
+ * Copy constructor. The new instance holds the same path string as
+ * the given one.
+ *
+ * \param[in] path The Path instance to copy.
+ */
+Path::Path(const Path& path)
+    : m_path(path.m_path)
+{
+}
+
+/**
+ * Constructs a path from a std::string. The string is stored and then
+ * normalised by sanitize().
+ *
+ * \param path String to construct from. Must be encoded in UTF-8.
+ *
+ * \returns a new instance of class Path.
+ */
+Path::Path(std::string path)
+    : m_path(path)
+{
+    sanitize();
+}
+
+/**
+ * Constructs a Path instance from a list of path components.
+ * This is the inverse of the burst() method.
+ *
+ * The components are joined with a single "/" between them. No
+ * separator is added after a component that already ends in a slash,
+ * which is the case for the roots "/" and "X:/". An empty list
+ * yields ".", the same value the default constructor produces.
+ *
+ * \param[in] components List of components to join.
+ *
+ * \returns A new instance.
+ */
+Path::Path(const std::vector<Path>& components)
+{
+    // front()/begin() must not be dereferenced on an empty vector.
+    if (components.empty()) {
+        m_path = ".";
+        return;
+    }
+
+    std::vector<Path>::const_iterator iter = components.begin();
+    m_path = iter->m_path;
+
+    for (++iter; iter != components.end(); ++iter) {
+        // Look at the END of what has been built so far: testing the
+        // first character instead would turn "X:/" + "foo" into
+        // "X://foo" and "/usr" + "bin" into "/usrbin".
+        if (m_path.empty() || m_path[m_path.length() - 1] != '/') {
+            m_path += "/";
+        }
+        m_path += iter->m_path;
+    }
+}
+
+/**
+ * Sanitizes the path. It:
+ *
+ * 1. Replaces any backslashes with forward slashes (read Windows).
+ * 2. Collapses every run of forward slashes into a single one.
+ * 3. Deletes a trailing slash, if any, except where the slash is the
+ *    filesystem root. On Windows, the drive-only forms "X:" and "X:f"
+ *    additionally get the missing slash after the colon.
+ */
+void Path::sanitize()
+{
+    // Step 1: backslashes become forward slashes.
+    for (std::string::size_type at = m_path.find('\\');
+         at != std::string::npos;
+         at = m_path.find('\\', at + 1)) {
+        m_path[at] = '/';
+    }
+
+    // Step 2: drop one slash of every "//" pair until none is left.
+    // Searching on from the same position is enough, because no pair
+    // can exist in front of it any more.
+    std::string::size_type pair = 0;
+    while ((pair = m_path.find("//", pair)) != std::string::npos) { // assignment intended
+        m_path.erase(pair, 1);
+    }
+
+    // Step 3: trailing slash (except for the filesystem root)
+    const std::string::size_type len = m_path.length();
+#if defined(_PATHIE_UNIX)
+    if (len > 1 && m_path[len - 1] == '/')
+        m_path = m_path.substr(0, len - 1);
+#elif defined(_WIN32)
+    if (len > 1) { // / is root of current drive, "x" is the relative path "./x"
+        const bool trailing_slash = (m_path[len - 1] == '/');
+
+        if (len > 3 && trailing_slash) {
+            // X:/foo/bar/ -- more than 3 chars cannot be a root
+            m_path = m_path.substr(0, len - 1);
+        }
+        else if (m_path[1] != ':') {
+            // No drive letter involved: only strip a trailing slash
+            if (trailing_slash) {
+                m_path = m_path.substr(0, len - 1);
+            }
+        }
+        else if (len == 2) {
+            // "X:" lacks the slash of the drive root, append it
+            m_path.append("/");
+        }
+        else if (len == 3 && m_path[2] != '/') {
+            // "X:f" lacks the slash of the root directory, insert it
+            m_path.insert(2, "/");
+        }
+        // Remaining case is "X:/", which is fine and stays untouched.
+    }
+#else
+#error Unsupported system
+#endif
+}
+
+/** \name Conversion methods
+ *
+ * Convert a path to other objects.
+ */
+///@{
+
+/**
+ * Hands out a copy of the underlying `std::string`, which is encoded
+ * in UTF-8 on every operating system.
+ *
+ * \see native() utf8_str()
+ */
+std::string Path::str() const
+{
+    return std::string(m_path);
+}
+
+/**
+ * Alias for str(). It exists so that calling code can spell out that
+ * it wants the UTF-8 variant, since it is easy to forget whether
+ * str() returns the native or the UTF-8 form.
+ *
+ * \see native() str()
+ */
+std::string Path::utf8_str() const
+{
+    return str();
+}
+
+#if defined(_PATHIE_UNIX)
+std::string Path::native() const
+{
+    // UNIX: hand the UTF-8 path to utf8_to_filename() and return its result.
+    const std::string native_form = utf8_to_filename(m_path);
+    return native_form;
+}
+
+#elif defined(_WIN32)
+/**
+ * Returns the path in the platform’s native format. The return type
+ * differs per platform: `std::string` on UNIX, `std::wstring` on
+ * Windows.
+ *
+ * On Windows, every forward slash is turned into a backslash and the
+ * result is encoded in UTF-16LE.
+ *
+ * On UNIX, the returned string is in the encoding dictated by the locale
+ * ($LANG and $LC_ALL variables).
+ */
+std::wstring Path::native() const
+{
+    std::string backslashed(m_path);
+
+    // Swap the separators character by character.
+    for (std::string::size_type i = 0; i < backslashed.length(); ++i) {
+        if (backslashed[i] == '/')
+            backslashed[i] = '\\';
+    }
+
+    return utf8_to_utf16(backslashed);
+}
+#else
+#error Unsupported system.
+#endif
+
+///@}
+
+
+/** \name Path decomposition
+ *
+ * Retrieve the parts of the path you want.
+ */
+///@{
+
+/**
+ * Returns the path’s basename, i.e. the last component
+ * of the path, including the file extension.
+ *
+ * For example, "/foo/bar.txt" has a basename of "bar.txt",
+ * and "/foo/bar" has a basename of "bar". The special paths
+ * ".", ".." and the filesystem root are returned as they are.
+ *
+ * \returns a new Path instance with only the basename.
+ *
+ * \see dirname()
+ */
+Path Path::basename() const
+{
+    // ".", ".." and a root have no separate last component.
+    if (m_path == "." || m_path == ".." || is_root())
+        return Path(m_path);
+
+    const size_t last_slash = m_path.rfind("/");
+    if (last_slash == string::npos)
+        return Path(m_path); // single component, nothing to cut off
+
+    return Path(m_path.substr(last_slash + 1));
+}
+
+/**
+ * Returns the path’s dirname, i.e. all components of the
+ * path except for the basename component (see basename()).
+ *
+ * For example, "/foo/bar/baz.txt" has a dirname of "/foo/bar",
+ * and "/foo/bar/baz" has a dirname of "/foo/bar". The paths "." and
+ * ".." as well as any path without a slash yield "."; a filesystem
+ * root yields itself; an entry directly below a root ("/usr",
+ * "X:/foo") yields root().
+ *
+ * \returns a new Path instance with only the dirname.
+ *
+ * \see basename() parent()
+ */
+Path Path::dirname() const
+{
+    if (m_path == "." || m_path == "..")
+        return Path(".");
+    if (is_root())
+        return Path(m_path);
+
+    size_t pos = m_path.rfind("/");
+    if (pos == string::npos) // single relative directory
+        return Path(".");
+
+    if (pos == 0) // /usr
+        return root();
+#ifdef _WIN32
+    // In "X:/foo" the only slash sits at index 2, directly after the
+    // colon at index 1. A test for index 1 could never match, because
+    // that position would have to hold both '/' and ':'.
+    if (pos == 2 && m_path[1] == ':') // X:/foo
+        return root();
+#endif
+
+    // regular/path or /regular/path
+    return Path(m_path.substr(0, pos));
+}
+
+/**
+ * Convenience method that delivers dirname() and basename() with a
+ * single call. The dirname is stored first, then the basename.
+ *
+ * \param[out] dname Receives the dirname() value.
+ * \param[out] bname Receives the basename() value.
+ */
+void Path::split(Path& dname, Path& bname) const
+{
+    dname = this->dirname();
+    bname = this->basename();
+}
+
+/**
+ * Returns the file extension of the path's last component including
+ * the leading dot (e.g. ".txt"), or an empty string if there is none.
+ *
+ * Only the last component is inspected, so a dot in a parent
+ * directory name ("/foo.bar/baz") does not count as an extension.
+ * Filenames that consist entirely of a "file extension", i.e. ".txt"
+ * or "/foo/.txt", return an empty string, as do names ending in a
+ * dot ("foo.") and the special paths "." and "..".
+ *
+ * \returns the extension with its leading dot, or "".
+ */
+std::string Path::extension() const
+{
+  if (m_path == "." || m_path == "..")
+    return "";
+
+  // Index of the first character of the last component.
+  const size_t slash = m_path.rfind("/");
+  const size_t start = (slash == string::npos) ? 0 : slash + 1;
+
+  const size_t dot = m_path.rfind(".");
+  if (dot == string::npos || dot < start) // no dot inside the last component
+    return "";
+
+  if (dot == start || dot == m_path.length() - 1) // .foo, foo/.txt and foo.
+    return "";
+
+  return m_path.substr(dot);
+}
+
+/**
+ * Alias for dirname(), provided only for convenience. It forwards
+ * to dirname() and returns its result unchanged.
+ *
+ * \returns a new Path instance with only the dirname.
+ *
+ * \see dirname()
+ */
+Path Path::parent() const
+{
+ return dirname();
+}
+
+/**
+ * Returns the number of components in the path string. Put
+ * differently: the slashes are counted and one is added for the
+ * last element. A filesystem root (see is_root()) always counts
+ * as exactly one component.
+ *
+ * The return value of this method minus one is the last
+ * possible index for operator[].
+ */
+size_t Path::component_count() const
+{
+  if (is_root())
+    return 1;
+
+  size_t count = 1; // accounts for the element behind the last slash
+  for (size_t idx = m_path.find("/"); idx != string::npos; idx = m_path.find("/", idx + 1))
+    count++;
+
+  return count;
+}
+
+/**
+ * Returns the filesystem root for this path. On UNIX,
+ * this will always return /, but on Windows it will
+ * return X:/ if the referenced path is an absolute path
+ * with drive letter, and / if the referenced path is
+ * a relative path or an absolute path on the current
+ * drive.
+ */
+Path Path::root() const
+{
+#if defined(_PATHIE_UNIX)
+  return Path("/");
+#elif defined(_WIN32)
+  // A colon is only allowed at index 1 on Windows, where it marks the
+  // preceding character as a drive letter, i.e. an absolute path with
+  // drive. The length check keeps the index access inside the string
+  // for empty paths.
+  if (m_path.length() > 1 && m_path[1] == ':')
+    return Path(m_path.substr(0, 3));
+
+  // Otherwise return the root of the current drive.
+  return Path("/");
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Splits the path into its separate components, i.e. at every /.
+ * The leading / of an absolute path is a component of its own and
+ * thus becomes the first element of a bursted absolute path.
+ *
+ * \param descend (`false`) If this is true, each element keeps all
+ * of its parent components instead of being a bare name.
+ *
+ * \returns A vector of Path instances, where each instance
+ * corresponds to one component of the Path.
+ *
+ * Example:
+ *
+ * ~~~~~~~~~~~~~~~~~~~~ c++
+ * Path p("/tmp/foo/bar");
+ * p.burst(); // => /, tmp, foo, bar
+ * p.burst(true); // => /, /tmp, /tmp/foo, /tmp/foo/bar
+ * ~~~~~~~~~~~~~~~~~~~~
+ */
+std::vector<Path> Path::burst(bool descend /* = false */) const
+{
+  std::vector<Path> parts;
+  std::string parents; // everything before the current component; used when descending
+  size_t begin = 0;    // index at which the current component starts
+
+  // The leading / of an absolute path is a component of its own.
+  if (m_path[0] == '/') {
+    parts.push_back(Path("/"));
+    parents.append("/");
+    begin = 1; // skip it, so the search below does not find it again
+  }
+
+  for (size_t slash = m_path.find("/", begin); slash != string::npos; slash = m_path.find("/", begin)) {
+    const std::string piece = m_path.substr(begin, slash - begin);
+
+    if (descend) {
+      parts.push_back(Path(parents + piece));
+      parents.append(piece);
+      parents.append("/");
+    }
+    else {
+      parts.push_back(Path(piece));
+    }
+
+    begin = slash + 1;
+  }
+
+  // Whatever follows the last slash is the final component.
+  const std::string tail = m_path.substr(begin);
+  if (descend)
+    parts.push_back(Path(parents + tail)); // Note no trailing /
+  else
+    parts.push_back(Path(tail));
+
+  return parts;
+}
+
+///@}
+
+/** \name Path expansion
+ *
+ * Expand paths to a fuller version without shortcuts.
+ */
+
+///@{
+
+/**
+ * This method removes all occurrences of . and .. from the path,
+ * leaving a clean filesystem path.
+ *
+ * Note that neither an absolute path is created, nor
+ * are shortcuts other than . and .. expanded.
+ *
+ * This method does not access the filesystem, and thus does not
+ * know about symbolic links. Therefore, if the path contains symlinks,
+ * the result may not be the way you expect it. Use real() if
+ * you need to resolve all your symbolic links in the path.
+ *
+ * For example, if you have a directory `/tmp/foo`, which contains a
+ * symbolic link `bar` that points to `/tmp/bar`, then a path of
+ * `/tmp/foo/bar/..` will be prune()d to `/tmp/foo`, although the
+ * canonically correct result is `/tmp`. The latter is what you will
+ * get if you use real().
+ *
+ * A path that collapses down to the root stays the root: `/foo/..`
+ * yields `/`, and on Windows `X:/foo/..` yields `X:/`.
+ *
+ * \returns A new Path with . and .. removed.
+ *
+ * \see expand() real()
+ */
+Path Path::prune() const
+{
+  std::string newpath(m_path); // copy
+  size_t pos = 0;
+  while ((pos = newpath.find("/.", pos)) != string::npos) { // assignment intended
+    if (newpath.substr(pos, 3) == "/..") {
+
+      // Weird path like /..foo or foo/..bar, which are NOT relative paths
+      if (newpath.length() > pos + 3 && newpath[pos + 3] != '/') {
+        // Do not reset `pos' -- this has to stay. Advance to the next char.
+        pos++;
+        continue;
+      }
+
+      if (pos == 0) {
+        // /.. at beginning of string, replace with root / (/ on Windows is root on current drive)
+        newpath.erase(pos, 3);
+
+        // Whoops -- the entire string was just "/.."
+        if (newpath.empty())
+          newpath.append("/");
+      }
+#ifdef _WIN32
+      // Cater for paths with drive X:/ on Windows. ":" is only allowed
+      // at index 1, where it marks the preceding char as a drive letter.
+      else if (pos == 2 && newpath[1] == ':') {
+        // X:/.. or X:/../foo/bar at beginning of string: the parent of the
+        // drive root is the drive root. The "X:/..foo" case was already
+        // skipped above, so the erase can happen unconditionally here
+        // (skipping it would leave the string unchanged and loop forever).
+        newpath.erase(pos, 3);
+
+        // Whoops -- the entire string was just "X:/.."
+        if (newpath.length() == 2)
+          newpath.append("/");
+      }
+#endif
+      else {
+        const size_t pos2 = newpath.rfind("/", pos - 1);
+        if (pos2 != string::npos) {
+          // Remove parent directory.
+          newpath.erase(pos2, pos - pos2 + 3);
+
+          // The entire string was "/foo/.." -- what remains is the root.
+          if (newpath.empty())
+            newpath.append("/");
+#ifdef _WIN32
+          // Same for "X:/foo/..", which leaves the bare drive "X:" behind.
+          else if (newpath.length() == 2 && newpath[1] == ':')
+            newpath.append("/");
+#endif
+        }
+        else { // ../ for relative path (as in foo/../baz.txt)
+          newpath.erase(0, pos + 4);
+        }
+      }
+    }
+    else { // Single /.
+
+      // Weird path like /.foo or foo/.bar, which are regular (hidden) entries
+      if (newpath.length() > pos + 2 && newpath[pos + 2] != '/') {
+        // Do not reset `pos' -- this has to stay. Advance to the next char.
+        pos++;
+        continue;
+      }
+
+      newpath.erase(pos, 2);
+
+      // Whoops -- the entire string was just "/."
+      if (newpath.empty())
+        newpath.append("/");
+#ifdef _WIN32
+      // ... or just "X:/.", which leaves the bare drive "X:" behind.
+      else if (newpath.length() == 2 && newpath[1] == ':')
+        newpath.append("/");
+#endif
+    }
+
+    // Reset as we have modified the string and might need to go again over it
+    pos = 0;
+  }
+
+  /* If we are empty now, the original string was a one-element
+   * relative path with .. appended. We cannot know what to set
+   * without referring to pwd(), which is external access and
+   * forbidden for this method. So instead, we do the one sane thing
+   * and just use ".". */
+  if (newpath.empty())
+    newpath = ".";
+
+  return Path(newpath);
+}
+
+/**
+ * \note Under specific circumstances (see below), this method
+ * accesses the file system.
+ *
+ * Creates a clean absolute path: a path that does not start with
+ * "~" is made absolute with absolute(), and the result is finally
+ * passed through prune(). In addition, the following substitution
+ * is applied:
+ *
+ * A leading "~" that is the entire path or is directly followed by
+ * a slash is replaced with the user's home directory, see home().
+ * Any other path starting with "~" is left without substitution.
+ *
+ * \returns a new instance with everything expanded.
+ *
+ * \remark This method uses prune() to expand ".." entries, therefore
+ * it will not consider symbolic links when resolving those. Use
+ * real() if you need to do that.
+ *
+ * \see prune() real()
+ */
+Path Path::expand() const
+{
+  Path result(*this); // copy
+
+  // Tilde paths are handled by the substitution below instead.
+  if (m_path[0] != '~')
+    result = result.absolute();
+
+  std::string text = result.str();
+  if (text[0] == '~') {
+    Path homedir = home();
+
+    // Only "~" itself and "~/..." refer to the home directory
+    if (text.length() == 1 || text[1] == '/')
+      text.replace(0, 1, homedir.m_path);
+
+    result = Path(text);
+  }
+
+  return result.prune();
+}
+
+/**
+ * \note This method accesses the filesystem.
+ *
+ * This is the bruteforce method for determining the real path
+ * of the entry in question on the filesystem. On UNIX the work is
+ * delegated to realpath(), which resolves symbolic links.
+ *
+ * This method supports symbolic link resolving only on UNIX. On
+ * Windows the resolving code is currently disabled, so the path is
+ * merely split into its components and put together again.
+ *
+ * It still does not consider hardlinks, mountpoints, and junctions,
+ * though. However, a hardlink is a real second valid name for an
+ * object; in contrast to a symbolic link, if one hardlink gets
+ * removed, the other one stays still valid. If you remove the file a
+ * symbolic link points to, the link breaks. Thus, it is not even
+ * possible to determine which of two hardlinks to a file is the
+ * "primary" one. Mountpoints and junctions (junctions are on Windows
+ * what mountpoints are on UNIX) behave similar with respect to
+ * entire directory hierarchies.
+ *
+ * \throws Pathie::ErrnoError if realpath() fails (UNIX only).
+ *
+ * \see expand() prune()
+ */
+Path Path::real() const
+{
+#if defined(_PATHIE_UNIX)
+  std::string nstr = native();
+  char path[PATH_MAX];
+  if (!realpath(nstr.c_str(), path))
+    throw(Pathie::ErrnoError(errno));
+
+  return Path(filename_to_utf8(path));
+#elif defined(_WIN32)
+  // On Windows there sadly is no easy way to do this. We can
+  // only determine if a given path is a symlink and resolve it...
+  // Instructions taken from: http://msdn.microsoft.com/en-us/library/windows/desktop/aa363940%28v=vs.85%29.aspx
+  std::vector<Path> components = burst();
+  unsigned int pos = 0;
+
+  while (pos < components.size()) {
+    // Build path consisting of all elements up to our position pointer
+    Path reduced_path(components.front());
+    if (components.size() - pos > 1) {
+      for (unsigned int i=1; i <= pos; i++) { // i=0 is already in the initialization above
+        reduced_path = reduced_path.join(components[i]);
+      }
+    }
+
+    // If that's a symlink, resolve it and replace our path until
+    // the symlink with the symlink's target.
+    /*std::wstring reduced_path_utf16 = utf8_to_utf16(reduced_path.m_path);
+    if (is_ntfs_symlink(reduced_path_utf16.c_str())) {
+      wchar_t* target_utf16 = read_ntfs_symlink(reduced_path_utf16.c_str());
+      Path target(utf16_to_utf8(target_utf16));
+      std::vector<Path> target_components = target.burst();
+      free(target_utf16);
+
+      // Replace all components up to pos with the symlink target
+      components.erase(components.begin(), components.begin() + pos);
+      std::vector<Path> temp(components);
+      components.clear();
+      for(auto iter=target_components.begin(); iter != target_components.end(); iter++)
+        components.push_back(*iter);
+      for(auto iter=temp.begin(); iter != temp.end(); iter++)
+        components.push_back(*iter);
+    }
+    else {*/
+      // Note a symlink can point to another symlink, so we can only
+      // advance to the next element if this element has been tested
+      // for not being a symlink.
+      pos++;
+    //}
+  }
+
+  // Build a new path from the now resolved components. The first
+  // component seeds the result, so joining starts at the second one;
+  // starting at the first would append it to itself.
+  Path result(components.front());
+  for(std::vector<Path>::const_iterator iter=components.begin() + 1;
+      iter != components.end(); iter++) {
+    result = result.join(*iter);
+  }
+
+  return result;
+#else
+#error Unsupported system.
+#endif
+}
+
+// Msys2 does currently not have ntifs.h windows header, which
+// is required for reading NTFS symlinks.
+#if 0
+//#ifdef __WIN32
+/*
+ * Checking if a file is a symlink under Windows is insane.
+ * See http://msdn.microsoft.com/en-us/library/windows/desktop/aa363940%28v=vs.85%29.aspx
+ * for the detailed instructions by Microsoft on how to do
+ * that.
+ *
+ * Returns true only for reparse points whose tag is exactly
+ * IO_REPARSE_TAG_SYMLINK; junctions and other reparse points as
+ * well as regular entries give false. Throws Pathie::WindowsError
+ * if the attributes cannot be queried.
+ */
+bool Path::is_ntfs_symlink(const wchar_t* path) const
+{
+  // First we need to obtain the file attributes.
+  DWORD attrs = GetFileAttributesW(path);
+  if (attrs == INVALID_FILE_ATTRIBUTES) {
+    DWORD err = GetLastError();
+    throw(Pathie::WindowsError(err));
+  }
+
+  /* These file attributes must contain the REPARSE_POINT attribute
+   * that marks the file as being symlink, junction, or similar.
+   * Actually, reparse points can contain much more custom data, but
+   * we are not interested in that. */
+  if (!(attrs & FILE_ATTRIBUTE_REPARSE_POINT))
+    return false; // Regular file
+
+  // Now we have to retrieve the extended find data for the file.
+  WIN32_FIND_DATAW finddata;
+  HANDLE findhandle = FindFirstFileW(path, &finddata);
+  if (findhandle == INVALID_HANDLE_VALUE) {
+    DWORD err = GetLastError();
+    throw(Pathie::WindowsError(err));
+  }
+  FindClose(findhandle);
+
+  // For reparse points dwReserved0 holds the reparse tag. The tag is a
+  // plain value, not a set of flags, so it has to be compared for
+  // equality; a bitwise AND would match other tags (junctions) as well.
+  return finddata.dwReserved0 == IO_REPARSE_TAG_SYMLINK;
+}
+
+/*
+ * Reading the link target also is insanely hard.
+ * The process is documented at http://msdn.microsoft.com/en-us/library/windows/desktop/aa365503%28v=vs.85%29.aspx
+ * in general. The key function is DeviceIoControl(), documented
+ * at http://msdn.microsoft.com/en-us/library/windows/desktop/aa363216%28v=vs.85%29.aspx
+ * .
+ *
+ * This function does not check if `path` is a symlink, but assumes it.
+ * If the reparse point turns out not to be a symlink (or the result
+ * buffer cannot be allocated), NULL is returned.
+ *
+ * Throws Pathie::WindowsError if the file cannot be opened or the
+ * reparse data cannot be read. The file handle and the temporary
+ * reparse buffer are released on every path out of this function.
+ *
+ * The returned pointer must be freed by you with free().
+ */
+wchar_t* Path::read_ntfs_symlink(const wchar_t* path) const
+{
+  // We have to open the file (directories are files on Windows also) first.
+  HANDLE filehandle = CreateFileW(path, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_FLAG_OPEN_REPARSE_POINT, NULL);
+  if (filehandle == INVALID_HANDLE_VALUE) {
+    DWORD err = GetLastError();
+    throw(Pathie::WindowsError(err));
+  }
+
+  // This infamous structure is documented here: http://msdn.microsoft.com/en-us/library/ff552012.aspx
+  unsigned long reparsebufsize = REPARSE_GUID_DATA_BUFFER_HEADER_SIZE; // According to docs this is the minimum size
+  REPARSE_DATA_BUFFER* p_reparse_data = NULL;
+  while (true) {
+    reparsebufsize += 4096; // Do you have a better guess?
+
+    // Keep the old pointer until realloc() succeeded, otherwise it would leak.
+    void* p_grown = realloc(p_reparse_data, reparsebufsize);
+    if (!p_grown) {
+      free(p_reparse_data);
+      CloseHandle(filehandle);
+      throw(std::runtime_error("Out of memory while reading reparse point."));
+    }
+    p_reparse_data = (REPARSE_DATA_BUFFER*) p_grown;
+    memset(p_reparse_data, '\0', reparsebufsize);
+
+    DWORD bytecount = 0;
+    // Obtain the reparse tag. FSCTL_GET_REPARSE_POINT is documented here: http://msdn.microsoft.com/en-us/library/windows/desktop/aa364571(v=vs.85).aspx
+    if (DeviceIoControl(filehandle, FSCTL_GET_REPARSE_POINT, NULL, 0, p_reparse_data, reparsebufsize, &bytecount, NULL) != 0)
+      break; // success
+
+    DWORD errsav = GetLastError();
+    if (errsav != ERROR_INSUFFICIENT_BUFFER) {
+      free(p_reparse_data);
+      CloseHandle(filehandle);
+      throw(Pathie::WindowsError(errsav));
+    }
+    // Buffer was too small, try again with a bigger one
+  }
+
+  // The handle is not needed anymore once the reparse data is in memory.
+  CloseHandle(filehandle);
+
+  // See also http://msdn.microsoft.com/en-us/library/windows/desktop/aa365511(v=vs.85).aspx
+  // And this one: http://www.codeproject.com/Articles/21202/Reparse-Points-in-Vista
+  wchar_t* printname = NULL;
+  if (p_reparse_data->ReparseTag == IO_REPARSE_TAG_SYMLINK) {
+    // Offset and length are given in bytes, whereas PathBuffer is an
+    // array of WCHAR, so both have to be converted to character counts.
+    const size_t namechars  = p_reparse_data->SymbolicLinkReparseBuffer.PrintNameLength / sizeof(WCHAR);
+    const size_t nameoffset = p_reparse_data->SymbolicLinkReparseBuffer.PrintNameOffset / sizeof(WCHAR);
+
+    // One extra, zeroed element serves as the UTF-16 NUL terminator.
+    printname = (wchar_t*) calloc(namechars + 1, sizeof(wchar_t));
+    if (printname)
+      wcsncpy(printname, &p_reparse_data->SymbolicLinkReparseBuffer.PathBuffer[nameoffset], namechars);
+  }
+
+  free(p_reparse_data);
+  return printname;
+}
+#endif
+
+///@}
+
+/** \name Special files and directories
+ *
+ * Files and directories with a special meaning that did not
+ * fit in the other groups.
+ */
+///@{
+
+/**
+ * Determines the working directory of the current process and
+ * returns it as an absolute path. Contains a leading drive letter
+ * on Windows.
+ *
+ * \throws std::runtime_error if the working directory cannot be
+ * retrieved.
+ */
+Path Path::pwd()
+{
+#if defined(_PATHIE_UNIX)
+  char buffer[PATH_MAX];
+  if (getcwd(buffer, PATH_MAX) == NULL)
+    throw(std::runtime_error("Failed to retrieve current working directory."));
+
+  return Path(filename_to_utf8(buffer));
+#elif defined(_WIN32)
+  wchar_t buffer[MAX_PATH];
+  if (GetCurrentDirectoryW(MAX_PATH, buffer) == 0)
+    throw(std::runtime_error("Failed to retrieve current working directory."));
+
+  return Path(utf16_to_utf8(std::wstring(buffer)));
+#else
+#error Unsupported platform.
+#endif
+}
+
+/**
+ * \note On Linux, this method accesses the `/proc` filesystem.
+ *
+ * Returns the full absolute path to the currently running
+ * executable. Linux reads the `/proc/self/exe` link, BSD queries
+ * sysctl(3), and Windows asks GetModuleFileNameW().
+ *
+ * \throws Pathie::ErrnoError on failure on Linux and BSD,
+ * Pathie::WindowsError on failure on Windows.
+ */
+Path Path::exe()
+{
+#if defined(__linux__)
+  char target[PATH_MAX];
+  ssize_t length = ::readlink("/proc/self/exe", target, PATH_MAX);
+
+  if (length < 0)
+    throw(Pathie::ErrnoError(errno));
+
+  // Only `length` bytes of the buffer are valid, so pass the length explicitly.
+  return Path(filename_to_utf8(std::string(target, length)));
+#elif defined(BSD)
+  // BSD does not have /proc mounted by default. However, using raw syscalls,
+  // we can figure out what would have been in /proc/curproc/file. See
+  // sysctl(3) for the management info base identifiers that are used here.
+  char target[PATH_MAX];
+  size_t length = PATH_MAX;
+  int mib[4];
+  mib[0] = CTL_KERN;
+  mib[1] = KERN_PROC;
+  mib[2] = KERN_PROC_PATHNAME;
+  mib[3] = -1; // According to sysctl(3), -1 means the current process.
+
+  // On success sysctl() stores the number of chars copied in `length'
+  if (sysctl(mib, 4, target, &length, NULL, 0) != 0)
+    throw(Pathie::ErrnoError(errno));
+
+  return Path(filename_to_utf8(std::string(target, length - 1))); // Exclude terminating NUL
+#elif defined(_WIN32)
+  wchar_t target[MAX_PATH];
+  if (GetModuleFileNameW(NULL, target, MAX_PATH) == 0) {
+    DWORD err = GetLastError();
+    throw(Pathie::WindowsError(err));
+  }
+
+  return Path(utf16_to_utf8(target));
+#else
+#error Unsupported platform.
+#endif
+}
+
+/**
+ * Returns the current user's home directory. On UNIX systems, the
+ * $HOME environment variable is consulted, whereas on Windows the
+ * Windows API is queried for the profile directory.
+ *
+ * \throws std::runtime_error if $HOME is not defined on UNIX, or if
+ * the Windows API query fails.
+ */
+Path Path::home()
+{
+#if defined(_PATHIE_UNIX)
+  const char* envhome = getenv("HOME");
+  if (!envhome)
+    throw(std::runtime_error("$HOME not defined."));
+
+  return Path(filename_to_utf8(envhome));
+#elif defined(_WIN32)
+  /* TODO: Switch to KNOWNFOLDERID system as explained
+   * on http://msdn.microsoft.com/en-us/library/windows/desktop/bb762494%28v=vs.85%29.aspx
+   * and http://msdn.microsoft.com/en-us/library/windows/desktop/bb762181%28v=vs.85%29.aspx
+   * . However, MinGW does currently (September 2014) not have
+   * the new KNOWNFOLDERID declarations.
+   */
+
+  wchar_t profiledir[MAX_PATH];
+  if (SHGetFolderPathW(NULL, CSIDL_PROFILE, NULL, SHGFP_TYPE_CURRENT, profiledir) != S_OK)
+    throw(std::runtime_error("Home directory not defined."));
+
+  return Path(utf16_to_utf8(profiledir));
+#else
+#error Unsupported system.
+#endif
+}
+
+///@}
+
+/** \name Handling of absolute and relative paths
+ *
+ * Converting relative paths to absolute ones and vice-versa.
+ */
+///@{
+
+/**
+ * Builds an absolute path from the referenced path by prefixing
+ * it with a `base` path, which defaults to the current working
+ * directory. A path that is absolute already (see is_absolute())
+ * is not touched; a copy of it is returned.
+ *
+ * \param[in] base Base path. Default is the return value of Path::pwd().
+ *
+ * \returns A new instance that is absolute.
+ *
+ * \see relative()
+ */
+Path Path::absolute(const Path& base /* = Path::pwd() */) const
+{
+  // Only relative paths need the base in front of them.
+  if (!is_absolute())
+    return base.join(m_path);
+
+  return Path(m_path);
+}
+
+/**
+ * The referenced path has to be absolute; by doing pure string
+ * manipulation (read: no symlinks), it will then be determined how to
+ * go from the (also absolute) `base` path to the referenced path. The
+ * result is a relative path, which will be returned by this method.
+ *
+ * The two paths are compared component-wise: directories that merely
+ * share some leading characters (`/tmp/foo` and `/tmp/fob`) are not
+ * treated as having anything in common beyond `/tmp`.
+ *
+ * If either the referenced or the passed path is relative,
+ * std::invalid_argument will be thrown.
+ *
+ * NOTE(review): on Windows, paths on different drives are not
+ * rejected by this implementation; confirm whether callers rely on
+ * an exception in that case.
+ *
+ * \param base Base path from which to start. Must also be absolute.
+ *
+ * \returns A new instance as a relative path; "." if both paths are
+ * equal after pruning.
+ *
+ * Example:
+ *
+ * ~~~~~~~~~~~~~~~~~~~~ c++
+ * Path p1("/tmp/foo/bar/baz");
+ * Path p2("/tmp/xxx/yyy");
+ *
+ * p1.relative(p2); // => ../../foo/bar/baz
+ * p2.relative(p1); // => ../../../xxx/yyy
+ * Path("/tmp/foo").relative(Path("/tmp")); // => foo
+ * Path("/tmp").relative(Path("/tmp/foo")); // => ..
+ * ~~~~~~~~~~~~~~~~~~~~
+ *
+ * \remark Both the referenced path and the `base` argument
+ * are prune()d before they are worked with.
+ *
+ * \see absolute()
+ */
+Path Path::relative(Path base) const
+{
+  if (is_relative())
+    throw(std::invalid_argument("Referenced path must be absolute."));
+  if (base.is_relative())
+    throw(std::invalid_argument("Argument path must be absolute."));
+
+  // Wipe all ".." and ".", this would break the below algorithm
+  base = base.prune();
+  Path refpath = prune();
+
+  // Shortcut for equal paths
+  if (base.m_path == refpath.m_path)
+    return Path(".");
+
+  // Shortcut if base is the root
+  if (base.is_root()) {
+#if defined(_PATHIE_UNIX)
+    return Path(refpath.m_path.substr(1)); // Skip leading /
+#elif defined(_WIN32)
+    return Path(refpath.m_path.substr(root().m_path.length())); // Skip leading / or X:/
+#else
+#error Unsupported system.
+#endif
+  }
+
+  const std::string& basestr = base.m_path;
+  const std::string& refstr  = refpath.m_path;
+  const size_t baselength = basestr.length();
+  const size_t reflength  = refstr.length();
+
+  // Walk along the common leading characters and remember where the
+  // last complete common component ended.
+  size_t pos = 0;      // first index at which the strings differ or one of them ends
+  size_t boundary = 0; // index directly behind the last / inside the common part
+  while (pos < baselength && pos < reflength && basestr[pos] == refstr[pos]) {
+    if (basestr[pos] == '/')
+      boundary = pos + 1;
+    pos++;
+  }
+
+  // Split both paths into the parts behind the common components.
+  std::string baserest;
+  std::string refrest;
+  const bool base_ends_component = (pos == baselength || basestr[pos] == '/');
+  const bool ref_ends_component  = (pos == reflength || refstr[pos] == '/');
+  if (base_ends_component && ref_ends_component) {
+    // One path is an ancestor of the other: everything up to `pos' is
+    // shared, and the longer path continues behind the slash at `pos'.
+    if (pos < baselength)
+      baserest = basestr.substr(pos + 1);
+    if (pos < reflength)
+      refrest = refstr.substr(pos + 1);
+  }
+  else {
+    // The strings diverge in the middle of a component; only the part
+    // up to the last common slash is really shared.
+    baserest = basestr.substr(boundary);
+    refrest  = refstr.substr(boundary);
+  }
+
+  // For each component in base that is not part of refpath, add a "..".
+  std::string resultstr;
+  if (!baserest.empty()) {
+    resultstr.append("../");
+    for (size_t i = 0; i < baserest.length(); i++) {
+      if (baserest[i] == '/')
+        resultstr.append("../");
+    }
+  }
+
+  // Now append the part of refpath that is not part of base to the result.
+  resultstr.append(refrest);
+
+  // If refpath contributed nothing, the last "../" leaves a dangling slash.
+  if (resultstr.length() > 1 && resultstr[resultstr.length() - 1] == '/')
+    resultstr.erase(resultstr.length() - 1);
+
+  if (resultstr.empty())
+    resultstr = ".";
+
+  // Done.
+  return Path(resultstr);
+}
+
+/**
+ * Checks if this is an absolute path, i.e. one that
+ * starts with a / on all systems or with X:
+ * only on Windows, where `X` is a drive letter.
+ *
+ * Note that / on Windows is the root of the current drive
+ * and hence also an absolute path.
+ *
+ * \returns true for an absolute path, false otherwise (including
+ * the empty path).
+ */
+bool Path::is_absolute() const
+{
+#if defined(_PATHIE_UNIX)
+  return m_path[0] == '/';
+#elif defined(_WIN32)
+  // / is root on current drive
+  if (m_path[0] == '/')
+    return true;
+
+  // Index 1 is the only position where : is allowed on Windows, and if
+  // it is there, the path is absolute with a drive letter (X:/). The
+  // length check keeps the index access inside the string for empty paths.
+  return m_path.length() > 1 && m_path[1] == ':';
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * The inverse of is_absolute(): returns true exactly when
+ * is_absolute() returns false.
+ *
+ * \see is_absolute()
+ */
+bool Path::is_relative() const
+{
+ return !is_absolute();
+}
+
+/**
+ * Checks if this path is a filesystem root. On UNIX, this
+ * is the case if the path consists solely of one slash. On
+ * Windows a single slash (root of the current drive) counts as
+ * well as a three-character path with a colon in second position,
+ * i.e. one that looks like "<letter>:/".
+ */
+bool Path::is_root() const
+{
+#if defined(_PATHIE_UNIX)
+  return m_path.length() == 1 && m_path[0] == '/';
+#elif defined(_WIN32)
+  const size_t len = m_path.length();
+
+  // Either / (root on current drive) or X:/ (root including drive letter)
+  return (len == 1 && m_path[0] == '/') || (len == 3 && m_path[1] == ':');
+#else
+#error Unsupported platform.
+#endif
+}
+
+///@}
+
+/** \name In-place substitution
+ *
+ * These methods change the underlying path string.
+ */
+///@{
+
+/**
+ * Replaces the underlying path string with `str`. The string is
+ * stored exactly as passed in.
+ *
+ * \param str The new path string.
+ */
+void Path::assign(std::string str)
+{
+  m_path.assign(str);
+}
+
+/**
+ * Exchanges the underlying path strings of this instance and `path`.
+ *
+ * \param path The instance to exchange the path string with.
+ */
+void Path::swap(Path& path) throw()
+{
+  path.m_path.swap(m_path);
+}
+
+///@}
+
+/** \name File attributes
+ *
+ * Functions that work on file attributes like timestamps.
+ */
+///@{
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Returns a pointer to a C `stat` struct that describes the
+ * given file. You have to free() the pointer manually yourself.
+ * If the call fails, the struct is released before the exception
+ * is thrown, so nothing has to be freed in that case.
+ *
+ * \returns A `struct stat` pointer on UNIX, and a `struct _stat`
+ * pointer on Windows.
+ *
+ * \throws Pathie::ErrnoError if the struct cannot be allocated or
+ * the underlying stat call fails.
+ */
+#if defined(_PATHIE_UNIX)
+struct stat* Path::stat() const
+{
+  // Convert first: nothing is allocated yet should the conversion throw.
+  std::string nstr = native();
+
+  struct stat* s = (struct stat*) malloc(sizeof(struct stat));
+  if (!s)
+    throw(Pathie::ErrnoError(ENOMEM));
+
+  if (::stat(nstr.c_str(), s) < 0) {
+    int errsav = errno; // save before free() gets a chance to touch it
+    free(s);
+    throw(Pathie::ErrnoError(errsav));
+  }
+
+  return s;
+}
+#elif defined(_WIN32)
+struct _stat* Path::stat() const
+{
+  // Convert first: nothing is allocated yet should the conversion throw.
+  std::wstring utf16 = utf8_to_utf16(m_path);
+
+  struct _stat* s = (struct _stat*) malloc(sizeof(struct _stat));
+  if (!s)
+    throw(Pathie::ErrnoError(ENOMEM));
+
+  if (_wstat(utf16.c_str(), s) < 0) {
+    int errsav = errno; // save before free() gets a chance to touch it
+    free(s);
+    throw(Pathie::ErrnoError(errsav));
+  }
+
+  return s;
+}
+#else
+#error Unsupported system.
+#endif
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Returns the file size as reported in the `st_size` field of the
+ * platform's stat call.
+ *
+ * \throws Pathie::ErrnoError if the stat call fails.
+ */
+long Path::size() const
+{
+#if defined(_PATHIE_UNIX)
+  struct stat info;
+  std::string native_path = native();
+
+  if (::stat(native_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#elif defined(_WIN32)
+  struct _stat info;
+  std::wstring wide_path = utf8_to_utf16(m_path);
+
+  if (_wstat(wide_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#else
+#error Unsupported system.
+#endif
+
+  return info.st_size;
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Returns the file's last access time (the `st_atime` field of the
+ * platform's stat call). The value is not really reliable.
+ *
+ * \throws Pathie::ErrnoError if the stat call fails.
+ */
+time_t Path::atime() const
+{
+#if defined(_PATHIE_UNIX)
+  struct stat info;
+  std::string native_path = native();
+
+  if (::stat(native_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#elif defined(_WIN32)
+  struct _stat info;
+  std::wstring wide_path = utf8_to_utf16(m_path);
+
+  if (_wstat(wide_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#else
+#error Unsupported system.
+#endif
+
+  return info.st_atime;
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Returns the file's last modification time (the `st_mtime` field
+ * of the platform's stat call).
+ *
+ * \throws Pathie::ErrnoError if the stat call fails.
+ */
+time_t Path::mtime() const
+{
+#if defined(_PATHIE_UNIX)
+  struct stat info;
+  std::string native_path = native();
+
+  if (::stat(native_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#elif defined(_WIN32)
+  struct _stat info;
+  std::wstring wide_path = utf8_to_utf16(m_path);
+
+  if (_wstat(wide_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#else
+#error Unsupported system.
+#endif
+
+  return info.st_mtime;
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Returns the `st_ctime` field of the platform's stat call. On
+ * Windows this is the file's creation time; on POSIX systems
+ * `st_ctime` is specified as the time of the last status change
+ * rather than the creation time.
+ *
+ * \throws Pathie::ErrnoError if the stat call fails.
+ */
+time_t Path::ctime() const
+{
+#if defined(_PATHIE_UNIX)
+  struct stat info;
+  std::string native_path = native();
+
+  if (::stat(native_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#elif defined(_WIN32)
+  struct _stat info;
+  std::wstring wide_path = utf8_to_utf16(m_path);
+
+  if (_wstat(wide_path.c_str(), &info) < 0)
+    throw(Pathie::ErrnoError(errno));
+#else
+#error Unsupported system.
+#endif
+
+  return info.st_ctime;
+}
+
+///@}
+
+/** \name Path traversal
+ *
+ * What’s in this directory?
+ */
+///@{
+
+/**
+ * Returns an entry_iterator instance, constructed from this path,
+ * that you can use to iterate the entries in a directory. Note that
+ * the list always includes, at some position, the "." (current
+ * directory) and ".." (parent directory) entries.
+ *
+ * \see end_entries()
+ */
+entry_iterator Path::begin_entries() const
+{
+ return entry_iterator(this);
+}
+
+/**
+ * Returns the terminal iterator (a default-constructed
+ * entry_iterator) you compare against in order to find out
+ * whether the iteration is complete.
+ *
+ * \see begin_entries()
+ */
+entry_iterator Path::end_entries() const
+{
+ return entry_iterator();
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Assumes the path is a directory and collects all entries in it
+ * into a list. The items follow the order in which the directory
+ * iteration delivers them, i.e. for most applications they are to
+ * be considered unsorted.
+ *
+ * \returns Every entry produced by the iteration from
+ * begin_entries() to end_entries().
+ *
+ * \see children()
+ */
+std::vector<Path> Path::entries() const
+{
+  std::vector<Path> collected;
+
+  entry_iterator iter = begin_entries();
+  while (iter != end_entries()) {
+    collected.push_back(*iter);
+    iter++;
+  }
+
+  return collected;
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Assumes the path is a directory and returns a list of all its
+ * children. Children are all entries in the directory *except*
+ * for the entries for the directory itself and its parent
+ * directory.
+ *
+ * Or for short, this method is the same as entries() except
+ * the return value does not include the "." and ".." entries.
+ *
+ * \see entries()
+ */
+std::vector<Path> Path::children() const
+{
+  std::vector<Path> collected;
+
+  entry_iterator iter = begin_entries();
+  while (iter != end_entries()) {
+    // Leave out the directory itself and its parent
+    if (*iter != Path(".")) {
+      if (*iter != Path(".."))
+        collected.push_back(*iter);
+    }
+    iter++;
+  }
+
+  return collected;
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Recursively traverses the directory structure below the referenced
+ * path. Each entry is passed to the callback while traversing from
+ * top to bottom. If the entry passed is a directory, return true
+ * from the callback to descend into it or false to skip it. If the
+ * entry passed is not a directory, the callback's return value is
+ * ignored.
+ *
+ * The callback will never be passed "." and ".." entries. All paths
+ * passed to the callback retain the full prefix, i.e. if you
+ * have this structure:
+ *
+ * ~~~~~~~~~~~~~~~~
+ * foo
+ * bar/
+ * baz.txt
+ * ~~~~~~~~~~~~~~~~
+ *
+ * Then find() will give you these paths in this order: `foo`,
+ * `foo/bar`, and `foo/bar/baz.txt`, rather than just the sole
+ * basename (which you can still obtain by calling basename() on the
+ * argument).
+ *
+ * \param cb Callback that takes the currently examined path.
+ *
+ * \remark Do not assume any order for the paths you receive,
+ * except that you will be given a directory entry before you
+ * are given its child entries.
+ */
+void Path::find(bool (*cb)(const Path& entry)) const
+{
+  for(entry_iterator iter=begin_entries(); iter != end_entries(); iter++) {
+    // Skip . and ..
+    if (iter->str() == "." || iter->str() == "..")
+      continue;
+
+    // Prefix the entry with this path before handing it out
+    Path child = join(*iter);
+
+    // The callback is always invoked; the directory check only happens
+    // if it asked to descend.
+    if (cb(child) && child.is_directory())
+      child.find(cb);
+  }
+}
+
+///@}
+
+/** \name Path status information
+ *
+ * Query information on the path.
+ */
+///@{
+
+
+/**
+ * \note This method accesses the filesystem.
+ *
+ * Checks if the file exists. A "not found" error (ENOENT) yields
+ * false; any other failure of the check, e.g. because you do not
+ * have sufficient rights on the given path, is reported by
+ * throwing an exception.
+ *
+ * \throws Pathie::ErrnoError if the check fails for a reason other
+ * than the path not existing.
+ */
+bool Path::exists() const
+{
+#if defined(_PATHIE_UNIX)
+  std::string native_path = native();
+
+  if (access(native_path.c_str(), F_OK) != -1)
+    return true;
+
+  int errsav = errno;
+  if (errsav != ENOENT)
+    throw(Pathie::ErrnoError(errsav));
+
+  return false;
+#elif defined(_WIN32)
+  std::wstring wide_path = utf8_to_utf16(m_path);
+
+  if (_waccess(wide_path.c_str(), F_OK) != -1)
+    return true;
+
+  int errsav = errno;
+  if (errsav != ENOENT)
+    throw(Pathie::ErrnoError(errsav));
+
+  return false;
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method acceses the filesystem.
+ *
+ * Checks if this file is a symbolic link; also
+ * works with NTFS symlinks on Windows. Returns false
+ * rather than erroring out if the referenced path does
+ * not exist.
+ */
+bool Path::is_symlink() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    struct stat info;
+
+    // lstat() inspects the link itself instead of following it.
+    if (lstat(native_path.c_str(), &info) < 0) {
+        const int saved_errno = errno;
+        if (saved_errno != ENOENT)
+            throw(Pathie::ErrnoError(saved_errno));
+
+        return false; // Nothing there, hence not a symlink either.
+    }
+
+    return S_ISLNK(info.st_mode) ? true : false;
+#elif defined(_WIN32)
+    // exists() is still evaluated so that any exception it throws
+    // reaches the caller; the answer is false in either case.
+    (void) exists();
+
+    return false;
+    // ntifs.h is currently not included in msys2
+    //std::wstring path = utf8_to_utf16(m_path);
+    //return is_ntfs_symlink(path.c_str());
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method accesses the filesystem.
+ *
+ * Checks if this is a directory. Returns false if the
+ * referenced path does not exist rather than erroring out.
+ */
+bool Path::is_directory() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    struct stat info;
+
+    if (::stat(native_path.c_str(), &info) >= 0)
+        return S_ISDIR(info.st_mode) ? true : false;
+#elif defined(_WIN32)
+    const std::wstring wide_path = utf8_to_utf16(m_path);
+    struct _stat info;
+
+    if (_wstat(wide_path.c_str(), &info) >= 0)
+        return (info.st_mode & S_IFDIR) != 0;
+#else
+#error Unsupported system.
+#endif
+
+    // The stat call failed. "Not found" means it isn't a directory;
+    // anything else is passed on as an exception.
+    const int saved_errno = errno;
+    if (saved_errno != ENOENT)
+        throw(Pathie::ErrnoError(saved_errno));
+
+    return false;
+}
+
+/**
+ * \note This method accesses the filesystem.
+ *
+ * Checks if this is a file. Returns false
+ * if the referenced path does not exist rather
+ * than erroring out.
+ */
+bool Path::is_file() const
+{
+#if defined(_PATHIE_UNIX)
+    struct stat s;
+    std::string nstr = native();
+
+    // stat() follows symbolic links, so a link to a regular file counts.
+    if (::stat(nstr.c_str(), &s) < 0) {
+        int errsav = errno;
+
+        // "Not found" means it isn't a file.
+        if (errsav == ENOENT)
+            return false;
+        else
+            throw(Pathie::ErrnoError(errsav));
+    }
+
+    return S_ISREG(s.st_mode) ? true : false;
+#elif defined(_WIN32)
+    struct _stat s;
+    std::wstring utf16 = utf8_to_utf16(m_path);
+    if (_wstat(utf16.c_str(), &s) < 0) {
+        int errsav = errno;
+
+        if (errsav == ENOENT)
+            return false;
+        else
+            throw(Pathie::ErrnoError(errsav)); // Use the saved value, as is_directory() does
+    }
+
+    return (s.st_mode & S_IFREG) != 0;
+#else
+#error Unsupported system.
+#endif
+}
+
+///@}
+
+/** \name Utility methods
+ *
+ * These methods operate on the file or directory referenced
+ * by the path.
+ */
+/// @{
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * Creates the referenced directory non-recursively,
+ * i.e. parent directories are not created. Trying
+ * to create a directory below a nonexistent directory
+ * will result in an ErrnoError exception.
+ *
+ * \remark UNIX note: The directory is created with RWX permissions
+ * for everyone, but filtered by your current `umask` before applied
+ * to disk.
+ *
+ * \see mktree()
+ */
+void Path::mkdir() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    // RWX for user, group and others.
+    const int status = ::mkdir(native_path.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
+#elif defined(_WIN32)
+    const std::wstring wide_path = utf8_to_utf16(m_path);
+    const int status = _wmkdir(wide_path.c_str());
+#else
+#error Unsupported system.
+#endif
+
+    if (status < 0)
+        throw(Pathie::ErrnoError(errno));
+}
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * Deletes the referenced directory, which is required
+ * to be empty, if not, an ErrnoError will be thrown.
+ *
+ * This cannot be used to delete a file rather than a
+ * directory.
+ *
+ * \see remove() unlink()
+ */
+void Path::rmdir() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    const int status = ::rmdir(native_path.c_str());
+#elif defined(_WIN32)
+    const std::wstring wide_path = utf8_to_utf16(m_path);
+    const int status = _wrmdir(wide_path.c_str());
+#else
+#error Unsupported system.
+#endif
+
+    // Report a failed removal with the errno set by the call above.
+    if (status < 0)
+        throw(Pathie::ErrnoError(errno));
+}
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * Deletes the referenced file. This cannot be used to
+ * delete a directory rather than a file.
+ *
+ * \see remove() rmdir()
+ */
+void Path::unlink() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    const int status = ::unlink(native_path.c_str());
+#elif defined(_WIN32)
+    const std::wstring wide_path = utf8_to_utf16(m_path);
+    const int status = _wunlink(wide_path.c_str());
+#else
+#error Unsupported system.
+#endif
+
+    // Report a failed deletion with the errno set by the call above.
+    if (status < 0)
+        throw(Pathie::ErrnoError(errno));
+}
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * Delete this path, regardless of whether it is a file
+ * or an empty directory. This method can’t be used to
+ * delete a directory that isn’t empty.
+ *
+ * \see rmdir() unlink()
+ */
+void Path::remove() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    const int status = ::remove(native_path.c_str());
+
+    if (status < 0)
+        throw(Pathie::ErrnoError(errno));
+#elif defined(_WIN32)
+    const std::wstring wide_path = utf8_to_utf16(m_path);
+
+    /* On Windows, `_wremove()` doesn't work on directories, so the
+     * appropriate native Win32API function is picked by hand. */
+    const bool removed = is_directory()
+        ? RemoveDirectoryW(wide_path.c_str())
+        : DeleteFileW(wide_path.c_str());
+
+    if (removed)
+        return;
+
+    const DWORD error_code = GetLastError();
+    throw(Pathie::WindowsError(error_code));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method writes to the file system.
+ *
+ * This method provides a functionality akin to the UNIX `mkdir -p`
+ * command, i.e. it creates the referenced directory, and if necessary,
+ * also creates all parent directories. Note this method does not
+ * throw an ErrnoError if the referenced directory already exists;
+ * it just does nothing.
+ *
+ * \see mkdir()
+ */
+void Path::mktree() const
+{
+    // The root is required to exist, and an already existing
+    // directory needs no work at all.
+    if (is_root() || is_directory())
+        return;
+
+    // Make sure the chain of parents exists first ...
+    Path parent_dir = parent();
+    if (!parent_dir.is_directory())
+        parent_dir.mktree();
+
+    // ... then create the referenced directory itself.
+    mkdir();
+}
+
+/**
+ * \note This method accesses the filesystem.
+ *
+ * Open the referenced path as a file with the given mode.
+ * Refer to your preferred C documentation for the value
+ * of the `mode` parameter.
+ *
+ * As with all methods of this library, Unicode filenames
+ * are handled properly on both UNIX and Windows by transcoding
+ * to UTF-16LE on Windows. Therefore, on UNIX the file
+ * is opened using `fopen()`, and on Windows it is opened
+ * using `_wfopen()`. Thankfully, as an exception
+ * to Microsoft’s wchar-them-all rule, it is possible to close
+ * a file that is opened with `_wfopen()` by means of the
+ * regular `fclose()` function, which saves me from implementing
+ * a wrapper around the C `FILE*` pointer to abstract the problem.
+ *
+ * In contrast to original `fopen()`, this method throws an
+ * ErrnoError exception if the call fails, i.e. if `fopen()`
+ * returns NULL. As a result, this method will _never_ return
+ * a NULL pointer.
+ *
+ * Here’s an example of how to use this method (with error checking
+ * omitted):
+ *
+ * ~~~~~~~~~~~~~~~~~ c++
+ * Path p("Unicöde file.txt");
+ * FILE* p_file = p.fopen("w");
+ * fwrite("A", 1, 1, p_file);
+ * fclose(p_file);
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * This will create a file named "Unicöde file.txt" both on UNIX and Windows.
+ *
+ * \param[in] mode File open mode as per the C `fopen()` documentation.
+ *
+ * \remark Don’t forget you have to close the file using `fclose()`, which
+ * works, as explained, both on UNIX and Windows. `fclose()` is
+ * not wrapped by this library, use your C libraries’ implementation
+ * directly.
+ *
+ * \remark The file’s actual _contents_ are not affected in any way
+ * by this method. They are outside the scope of this library; note
+ * however that with regard to line endings you might want to consider
+ * the "b" mode modifier for binary files.
+ *
+ * \see [Microsoft’s documentation on `fopen()` and `_wfopen()`](http://msdn.microsoft.com/en-us/library/yeby3zcb.aspx)
+ */
+FILE* Path::fopen(const char* mode) const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+    FILE* handle = ::fopen(native_path.c_str(), mode);
+#elif defined(_WIN32)
+    // Both the path and the mode string have to be UTF-16 for _wfopen().
+    const std::wstring wide_path = utf8_to_utf16(m_path);
+    const std::wstring wide_mode = utf8_to_utf16(mode);
+    FILE* handle = _wfopen(wide_path.c_str(), wide_mode.c_str());
+#else
+#error Unsupported system.
+#endif
+
+    // Never hand out a NULL pointer; failure becomes an exception.
+    if (!handle)
+        throw(Pathie::ErrnoError(errno));
+
+    return handle;
+}
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * Sets the file’s modification and access times to the
+ * current time. If the file does not yet exist, it is created.
+ *
+ * This is akin to the UNIX `touch` command.
+ */
+void Path::touch() const
+{
+#if defined(BSD) // FreeBSD didn’t have futimens() yet as of testing (december 2014)
+    // Opening in append mode creates the file if it is missing and
+    // leaves existing content alone.
+    FILE* p_file = Path::fopen("a");
+    if (futimes(fileno(p_file), NULL) < 0) {
+        int errsav = errno; // Save first: fclose() may overwrite errno
+        fclose(p_file);
+        throw(Pathie::ErrnoError(errsav));
+    }
+
+    fclose(p_file);
+#elif defined(_PATHIE_UNIX)
+    // Opening in append mode creates the file if it is missing and
+    // leaves existing content alone.
+    FILE* p_file = Path::fopen("a");
+    // futimens() is considered the modern variant of doing this
+    // (at least according to utimes(2) on my Linux system).
+    // A NULL times argument sets both timestamps to the current time.
+    if (futimens(fileno(p_file), NULL) < 0) {
+        int errsav = errno; // Save first: fclose() may overwrite errno
+        fclose(p_file);
+        throw(Pathie::ErrnoError(errsav));
+    }
+
+    fclose(p_file);
+#elif defined(_WIN32)
+    // Create file if it does not exist yet
+    if (!exists()) {
+        FILE* p_file = Path::fopen("a");
+        fclose(p_file);
+    }
+
+    SYSTEMTIME currenttime;
+    GetSystemTime(&currenttime);
+
+    FILETIME newtime;
+    if (SystemTimeToFileTime(&currenttime, &newtime) == 0) {
+        DWORD err = GetLastError();
+        throw(Pathie::WindowsError(err));
+    }
+
+    std::wstring utf16 = utf8_to_utf16(m_path);
+    HANDLE filehandle = CreateFileW(utf16.c_str(), FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING, 0, NULL);
+    if (filehandle == INVALID_HANDLE_VALUE) {
+        DWORD err = GetLastError();
+        throw(Pathie::WindowsError(err));
+    }
+
+    // Creation time is left untouched (NULL); access and modification
+    // time are both set to "now".
+    if (SetFileTime(filehandle, NULL, &newtime, &newtime) == 0) {
+        DWORD errsav = GetLastError(); // Save before CloseHandle() can change it
+        CloseHandle(filehandle);
+        throw(Pathie::WindowsError(errsav));
+    }
+
+    CloseHandle(filehandle);
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * This method, which is akin to the UNIX "rm -r" command, removes
+ * the entire referenced directory hierarchy recursively, including
+ * any files and directories contained therein.
+ */
+void Path::rmtree() const
+{
+    // is_directory() follows symbolic links. A link that points to a
+    // directory must be removed as a link (like "rm -r" does) and never
+    // be descended into, otherwise the link target's contents would be
+    // deleted and rmdir() would then fail on the link itself.
+    if (!is_symlink() && is_directory()) {
+        std::vector<Path> kids = children();
+
+        // Empty the directory first; rmdir() requires it to be empty.
+        for(std::vector<Path>::iterator iter=kids.begin(); iter != kids.end(); iter++) {
+            join(*iter).rmtree();
+        }
+
+        rmdir();
+    }
+    else { // file, symlink or similar
+        unlink();
+    }
+}
+
+/**
+ * \note This method writes to the filesystem.
+ *
+ * This method makes the referenced file a symbolic link
+ * to the path passed as an argument. On Windows, an
+ * NTFS symlink is created.
+ *
+ * \remark On Windows, this function requires that the process holds
+ * the `SE_CREATE_SYMBOLIC_LINK_NAME` privilege or it will fail with a
+ * WindowsError exception whose error code is 1314
+ * (`ERROR_PRIVILEGE_NOT_HELD`).
+ */
+void Path::make_symlink(const Path& target) const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string link_target = target.native();
+    const std::string link_name = native();
+
+    // symlink(2) takes the target first and the name of the link second.
+    const int status = symlink(link_target.c_str(), link_name.c_str());
+    if (status < 0)
+        throw(Pathie::ErrnoError(errno));
+#elif defined(_WIN32)
+    const std::wstring link_name = utf8_to_utf16(m_path);
+    const std::wstring link_target = utf8_to_utf16(target.m_path);
+
+    // Directory links have to be flagged as such on creation.
+    const DWORD flags = target.is_directory() ? SYMBOLIC_LINK_FLAG_DIRECTORY : 0;
+
+    if (CreateSymbolicLinkW(link_name.c_str(), link_target.c_str(), flags) != 0)
+        return;
+
+    const DWORD error_code = GetLastError();
+    throw(Pathie::WindowsError(error_code));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method accesses the file system.
+ *
+ * Treats the referenced path as a symlink and reads in its target,
+ * returning it as a new Path instance. Supports NTFS symlinks.
+ */
+Path Path::readlink() const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string native_path = native();
+
+    char target[PATH_MAX];
+    memset(target, '\0', PATH_MAX);
+
+    // readlink(2) does not NUL-terminate, so the returned length is
+    // what delimits the target string.
+    const ssize_t length = ::readlink(native_path.c_str(), target, PATH_MAX);
+    if (length < 0)
+        throw(Pathie::ErrnoError(errno));
+
+    const std::string raw_target(target, length);
+    return Path(filename_to_utf8(raw_target));
+#elif defined(_WIN32)
+    std::wstring wide_path = utf8_to_utf16(m_path);
+
+    throw(std::runtime_error("NTFS symlinks currently not supported."));
+
+    // ntifs.h currently not included in msys2.h
+    //if (!is_ntfs_symlink(wide_path.c_str()))
+    //    throw(std::runtime_error("Not an NTFS symlink."));
+    //
+    //wchar_t* utf16_target = NULL;
+    //utf16_target = read_ntfs_symlink(wide_path.c_str());
+    //
+    //Path result(utf16_to_utf8(utf16_target));
+    //free(utf16_target);
+    //
+    //return result;
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method writes to the file system.
+ *
+ * Renames a file to another name without involving file streams.
+ *
+ * \param[in] newname The new name of the file.
+ */
+void Path::rename(Path& newname) const
+{
+#if defined(_PATHIE_UNIX)
+    const std::string from = native();
+    const std::string to = newname.native();
+    const int status = ::rename(from.c_str(), to.c_str());
+#elif defined(_WIN32)
+    const std::wstring from = utf8_to_utf16(m_path);
+    const std::wstring to = utf8_to_utf16(newname.m_path);
+    const int status = _wrename(from.c_str(), to.c_str());
+#else
+#error Unsupported system.
+#endif
+
+    // Both calls signal failure with a nonzero result and set errno.
+    if (status != 0)
+        throw Pathie::ErrnoError(errno);
+}
+
+///@}
+
+/** \name Operators
+ *
+ * C++ operators.
+ */
+///@{
+
+Path& Path::operator=(const Path& path)
+{
+    // Copy the path string unless this is a self-assignment.
+    if (this != &path)
+        m_path = path.m_path;
+
+    return *this;
+}
+
+Path& Path::operator=(const std::string& str)
+{
+    // The string is taken over as the new path as-is.
+    m_path.assign(str);
+    return *this;
+}
+
+/**
+ * Compares two Path instances. Two paths are considered equal
+ * if their underlying path std::strings are equal.
+ */
+bool Path::operator==(const Path& other) const
+{
+    // Plain string comparison of the stored paths.
+    return m_path.compare(other.m_path) == 0;
+}
+
+/**
+ * Compares two Path instances. Two paths are considered unequal
+ * if their underlying path std::strings are unequal.
+ */
+bool Path::operator!=(const Path& other) const
+{
+    // Plain string comparison of the stored paths.
+    return m_path.compare(other.m_path) != 0;
+}
+
+/**
+ * Compares two Path instances. The referenced path is
+ * considered smaller than `other` if the underlying path
+ * std::string of the referenced path is smaller than the
+ * one of `other`.
+ */
+bool Path::operator<(const Path& other) const
+{
+    // Lexicographical order of the stored path strings.
+    return m_path.compare(other.m_path) < 0;
+}
+
+/**
+ * Compares two Path instances. The referenced path is
+ * considered greater than `other` if the underlying path
+ * std::string of the referenced path is greater than the
+ * one of `other`.
+ */
+bool Path::operator>(const Path& other) const
+{
+    // Lexicographical order of the stored path strings.
+    return m_path.compare(other.m_path) > 0;
+}
+
+/**
+ * Compares two Path instances. The referenced path is
+ * considered smaller than or equal to `other` if the underlying path
+ * std::string of the referenced path is smaller than or equal to the
+ * one of `other`.
+ */
+bool Path::operator<=(const Path& other) const
+{
+    // Lexicographical order of the stored path strings.
+    return m_path.compare(other.m_path) <= 0;
+}
+
+/**
+ * Compares two Path instances. The referenced path is
+ * considered greater than or equal to `other` if the underlying path
+ * std::string of the referenced path is greater than or equal to the
+ * one of `other`.
+ */
+bool Path::operator>=(const Path& other) const
+{
+    // Lexicographical order of the stored path strings.
+    return m_path.compare(other.m_path) >= 0;
+}
+
+/**
+ * This method allows you to access a specific component in the
+ * path string. The first component has the index 0; for an
+ * absolute path, it will be the / entry.
+ *
+ * If you specify an index that is beyond the end of the path,
+ * an std::out_of_range exception will be thrown.
+ *
+ * \param index Index of the component to retrieve.
+ *
+ * \see component_count()
+ *
+ * \remark This operator loops over the path string internally
+ * each time you request an element. If you want to index the
+ * path consecutively, you might consider using burst(), which
+ * can be more performant as it only loops once over the path
+ * string.
+ */
+Path Path::operator[](size_t index) const
+{
+    // The leading slash of an absolute path is its component 0.
+    if (index == 0 && m_path[0] == '/')
+        return Path("/");
+
+    size_t component_start = 0; // First character of the current component
+    size_t current = 0;         // Index of the current component
+
+    // Hop from slash to slash; each slash terminates one component.
+    for (size_t slash = m_path.find("/", 0);
+         slash != std::string::npos;
+         slash = m_path.find("/", slash + 1)) {
+        if (current == index)
+            return Path(m_path.substr(component_start, slash - component_start));
+
+        component_start = slash + 1;
+        current++;
+    }
+
+    // Whatever follows the final slash is the last component.
+    if (current == index)
+        return Path(m_path.substr(component_start));
+
+    // Out of range
+    throw(std::out_of_range("Index out of path range"));
+}
+
+/**
+ * Appends a /, then the new component, then calls expand(), and
+ * finally returns a new Path instance.
+ *
+ * \param path New component.
+ *
+ * \returns New Path instance.
+ */
+Path Path::operator/(Path path) const
+{
+    // Pure delegation: join() builds the combined path.
+    Path combined = join(path);
+    return combined;
+}
+
+/**
+ * Appends a /, then the new component, and
+ * finally returns a new Path instance.
+ *
+ * \param str New component.
+ *
+ * \returns New Path instance.
+ */
+Path Path::operator/(std::string str) const
+{
+    // Pure delegation: join() builds the combined path.
+    Path combined = join(str);
+    return combined;
+}
+
+/**
+ * Appends a / followed by the new component `path` onto this
+ * instance and returns this instance.
+ *
+ * \param path New component.
+ *
+ * \returns The receiver.
+ */
+Path& Path::operator/=(Path path)
+{
+    // Replace this instance by the joined path and hand it back.
+    return *this = join(path);
+}
+
+/**
+ * Appends a / followed by the new component `path` onto this
+ * instance and returns this instance.
+ *
+ * \param str New component.
+ *
+ * \returns The receiver.
+ */
+Path& Path::operator/=(std::string str)
+{
+    // Replace this instance by the joined path and hand it back.
+    return *this = join(str);
+}
+
+/**
+ * Allows you to insert Pathie::Path instances into `std::cout`.
+ *
+ * ~~~~~~~~~~ c++
+ * Pathie::Path p("foo/bar");
+ * std::cout << p << std::endl;
+ * ~~~~~~~~~~
+ */
+std::ostream& operator<<(std::ostream& stream, const Path& p)
+{
+    // Only the path string is written; no decoration is added.
+    stream << p.str();
+    return stream;
+}
+
+///@}
+
+#ifdef _PATHIE_UNIX
+/*
+ * Returns the XDG directory for the given environment variable,
+ * if defined, otherwise returns home() with `defaultpath`
+ * appended.
+ *
+ * See http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
+ * for values.
+ */
+Path Path::get_xdg_dir(const std::string& envvarname, const std::string& defaultpath)
+{
+    // environment is encoded the same as the filenames
+    const std::string native_name = utf8_to_filename(envvarname);
+    const char* value = getenv(native_name.c_str());
+
+    // Variable not defined: fall back to the default below home().
+    if (!value)
+        return Path::home().join(defaultpath);
+
+    return Path(filename_to_utf8(value));
+}
+
+/*
+ * Returns the colon-separated directory list stored in the given
+ * environment variable as a vector of Path instances. If the
+ * variable is undefined or empty, `defaultlist` (same format) is
+ * split instead.
+ */
+std::vector<Path> Path::get_xdg_dirlist(const std::string& envvarname, const std::string& defaultlist)
+{
+    std::string env_nstr = utf8_to_filename(envvarname); // environment is encoded the same as the filenames
+    char* env_value = getenv(env_nstr.c_str());
+    std::string envstr;
+    if (env_value && strcmp(env_value, "") != 0)
+        envstr = filename_to_utf8(env_value); // Encode entire env string to UTF-8
+    else
+        envstr = defaultlist;
+
+    size_t pos = 0;
+    size_t lastpos = 0;
+    std::vector<Path> results;
+
+    // Each search has to resume behind the previous separator;
+    // restarting at the front would find the same colon forever.
+    while ((pos = envstr.find(':', lastpos)) != std::string::npos) {
+        // substr() takes a length, not an end position.
+        results.push_back(Path(envstr.substr(lastpos, pos - lastpos))); // envstr is already UTF-8
+
+        lastpos = pos + 1;
+    }
+
+    // Remainder behind the last colon (or the entire string if there is none).
+    results.push_back(Path(envstr.substr(lastpos)));
+
+    return results;
+}
+
+/*
+ * Looks up the user directory `setting` (e.g. "DOWNLOAD" for the
+ * line `XDG_DOWNLOAD_DIR="$HOME/Downloads"`) in the user-dirs.dirs
+ * file below config_dir() and returns the quoted value with the
+ * first `$HOME` replaced by the HOME environment variable.
+ *
+ * If the file does not exist, the string of home() is returned.
+ * Throws std::runtime_error if the setting is not in the file, if
+ * its line lacks a pair of double quotes, or if `$HOME` is needed
+ * but not set in the environment.
+ */
+std::string Path::get_xdg_userdir_setting(const std::string& setting)
+{
+    // XDG user-dirs spec recommends (only) checking for $XDG_CONFIG_HOME/user-dirs.dirs,
+    // the files under $XDG_CONFIG_DIRS are not to consider.
+    Path userconfig = Path::config_dir().join("user-dirs.dirs");
+
+    // No XDG configuration on this system, use $HOME.
+    if (!userconfig.is_file())
+        return Path::home().str();
+
+    FILE* p_file = userconfig.fopen("r");
+
+    char raw[256];
+    std::string line;
+    bool found = false;
+
+    // fgets() returns NULL on end of file and on error, so the loop
+    // never works on a line that was not actually read.
+    while (fgets(raw, sizeof(raw), p_file)) {
+        line = raw;
+
+        // Ignore comments and empty lines
+        if (line[0] == '#' || line[0] == '\n')
+            continue;
+
+        // Compare the setting name against what follows "XDG_" (4
+        // characters), e.g. "DOWNLOAD" in "XDG_DOWNLOAD_DIR=...".
+        if (line.length() >= 4 && line.compare(4, setting.length(), setting) == 0) {
+            found = true;
+            break;
+        }
+    }
+
+    fclose(p_file);
+
+    // Error out if not found
+    if (!found) {
+        std::string msg = "Unknown XDG directory '";
+        msg += setting + "' requested.";
+        throw(std::runtime_error(msg));
+    }
+
+    // OK, we have found the correct setting. Extract the value now.
+    // »XDG_DOWNLOAD_DIR="$HOME/Downloads"«
+    const std::string::size_type open_quote = line.find('"');
+    if (open_quote == std::string::npos) // Malformed
+        throw(std::runtime_error("Malformed XDG config file (quote mismatch/missing quotes)!"));
+
+    const std::string::size_type close_quote = line.find('"', open_quote + 1);
+    if (close_quote == std::string::npos) // Malformed
+        throw(std::runtime_error("Malformed XDG config file (quote mismatch/missing quotes)!"));
+
+    // The part between the quotes, excluding the quotes themselves.
+    std::string result = line.substr(open_quote + 1, close_quote - open_quote - 1);
+
+    // Replace $HOME with env value of $HOME
+    const std::string::size_type home_pos = result.find("$HOME");
+    if (home_pos != std::string::npos) { // Contains $HOME
+        const char* homestr = getenv("HOME");
+        if (!homestr)
+            throw(std::runtime_error("$HOME not set!"));
+
+        // $HOME is exactly 5 chars long
+        result.replace(home_pos, 5, homestr);
+    }
+
+    return result;
+}
+#endif
+
+/** \name Program data directories
+ *
+ * Directories containing program data other than files the
+ * user works with (e.g. configuration files).
+ */
+///@{
+
+/**
+ * Returns the directory for application- and user-specific permanent
+ * data.
+ *
+ * On UNIX, this returns $XDG_DATA_HOME, defaulting to ~/.local/share.
+ *
+ * On Windows, this returns the roaming appdata folder, which defaults
+ * to `C:/Users/username/AppData/Roaming`.
+ */
+Path Path::data_dir()
+{
+#if defined(_PATHIE_UNIX)
+    const std::string variable("XDG_DATA_HOME");
+    const std::string fallback(".local/share");
+    return get_xdg_dir(variable, fallback);
+#elif defined(_WIN32)
+    // CSIDL_APPDATA is the roaming application data folder.
+    wchar_t folder[MAX_PATH];
+    const HRESULT status = SHGetFolderPathW(NULL, CSIDL_APPDATA, NULL, SHGFP_TYPE_CURRENT, folder);
+    if (status == S_OK)
+        return Path(utf16_to_utf8(folder));
+
+    throw(Pathie::WindowsHresultError(status));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \warning This method may behave unexpectedly on Windows; see below.
+ *
+ * Returns the directory for application- and user-specific configuration
+ * files.
+ *
+ * On UNIX, this returns $XDG_CONFIG_HOME, defaulting to ~/.config.
+ *
+ * Windows does not have a notion of a directory for configuration
+ * files, hence some return value for this method had to be chosen. I
+ * think it is best to not clutter a user’s home directory with config
+ * files, and [this stackoverflow thread](https://stackoverflow.com/questions/2243895/location-to-put-user-configuration-files-in-windows)
+ * suggests to place the files in the data_dir(). That however yields
+ * the problem of possible name clashes when you want to name a file
+ * the same in data_dir() and config_dir(). It is not an option to
+ * fall back to the "LocalSettings" directory instead, because 99% of
+ * the applications written are "roaming" applications rather than
+ * "local" ones, and any use of the "LocalSettings" directory
+ * (available via cache_dir()) must be a specific decision of the
+ * programmer therefore. The decision was made that this method on
+ * Windows should return the same as data_dir() without a specific
+ * enforcing reason, but, as said, some decision needed to be
+ * made. As a consequence, you have to be careful to not accidentally
+ * place equally named files in data_dir() and config_dir() as they
+ * would conflict.
+ *
+ * I want to point out that on Windows, configuration files are rather
+ * unusual. The normal way to save configuration settings on Windows
+ * is use of the Windows Registry, which is beyond the scope of a
+ * path manipulation library like Pathie.
+ */
+Path Path::config_dir()
+{
+#if defined(_PATHIE_UNIX)
+    const std::string variable("XDG_CONFIG_HOME");
+    const std::string fallback(".config");
+    return get_xdg_dir(variable, fallback);
+#elif defined(_WIN32)
+    // No dedicated configuration directory here; share data_dir().
+    Path shared = data_dir();
+    return shared;
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Returns the directory for application- and user-specific cache files, i.e.
+ * files that, when deleted, do not impact the application apart from resetting
+ * it to some default values. A typical example for cache data is saving the
+ * folder where the user last opened a file, so that when he starts the application
+ * the next time and wants to open a file, is directly taken to the directory
+ * where he last picked a file from. Positions of windows could also be saved
+ * here, allowing application windows to be placed exactly where they were
+ * when the application was closed last time. In short, store the unimportant
+ * stuff here and be prepared the data is gone on application startup.
+ *
+ * On UNIX, this returns $XDG_CACHE_HOME, defaulting to ~/.cache.
+ *
+ * On Windows, this method returns the LOCAL_APPDATA folder, which means that
+ * in corporate setups using Windows roaming the data will not be available
+ * if you log in on another machine (which is expected, cf. the directory
+ * saving example above, which would break if this was saved into the roaming
+ * folder). This defaults to `C:/Users/username/AppData/Local`.
+ */
+Path Path::cache_dir()
+{
+#if defined(_PATHIE_UNIX)
+    const std::string variable("XDG_CACHE_HOME");
+    const std::string fallback(".cache");
+    return get_xdg_dir(variable, fallback);
+#elif defined(_WIN32)
+    // CSIDL_LOCAL_APPDATA is the non-roaming application data folder.
+    wchar_t folder[MAX_PATH];
+    const HRESULT status = SHGetFolderPathW(NULL, CSIDL_LOCAL_APPDATA, NULL, SHGFP_TYPE_CURRENT, folder);
+    if (status == S_OK)
+        return Path(utf16_to_utf8(folder));
+
+    throw(Pathie::WindowsHresultError(status));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Returns the directory for application- and user-specific volatile
+ * runtime data, i.e. data that WILL be deleted once the user logs
+ * off.
+ *
+ * On UNIX, this returns $XDG_RUNTIME_DIR. That environment variable is
+ * required to be defined as per the XDG standard, and if it isn’t, this
+ * method prints a warning to the standard error stream and uses the
+ * value of Path::temp_dir() instead.
+ *
+ * On Windows, the return value of this method is equivalent to that
+ * of temp_dir() always.
+ */
+Path Path::runtime_dir()
+{
+#if defined(_PATHIE_UNIX)
+    // environment is encoded the same as paths
+    const std::string variable = utf8_to_filename("XDG_RUNTIME_DIR");
+    const char* value = getenv(variable.c_str());
+
+    if (!value) {
+        // The variable is mandatory as per XDG; warn and use the
+        // temporary directory as a substitute.
+        Path fallback = Path::temp_dir();
+        std::cerr << "(pathie XDG) WARNING: XDG_RUNTIME_DIR not defined in environment. Falling back to '" << fallback.str() << "'." << std::endl;
+
+        return fallback;
+    }
+
+    return Path(filename_to_utf8(value));
+#elif defined(_WIN32)
+    Path fallback = temp_dir();
+    return fallback;
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Returns the root directory for temporary directories, i.e.
+ * directories which are expected to vanish when the application
+ * closes. Do not assume that anything you created in this
+ * directory still exists after your application exited and is
+ * restarted.
+ *
+ * \returns Path instance for temporary directory.
+ *
+ * \remark On UNIX, this function honours the value of the
+ * environment variable $TMPDIR. If that is not defined, the standard
+ * "/tmp" location will be returned. On Windows, GetTempPath() is
+ * called to retrieve the path, which in turn honours the environment
+ * variables $TMP, $TEMP, and $USERPROFILE (in that order); if all
+ * of them are undefined, a Windows-predefined path is returned,
+ * which defaults to `C:/Users/username/AppData/Local/Temp`.
+ *
+ * \see mktmpdir(3), [GetTempPath()](http://msdn.microsoft.com/en-us/library/windows/desktop/aa364992%28v=vs.85%29.aspx)
+ */
+Path Path::temp_dir()
+{
+#if defined(_PATHIE_UNIX)
+    // environment is encoded the same as paths
+    const std::string variable = utf8_to_filename("TMPDIR");
+    const char* value = getenv(variable.c_str());
+
+    if (!value)
+        return Path("/tmp"); // As per the Filesystem Hierarchy Standard.
+
+    return Path(filename_to_utf8(value));
+#elif defined(_WIN32)
+    // See http://msdn.microsoft.com/en-us/library/windows/desktop/aa364992%28v=vs.85%29.aspx for the +1
+    wchar_t buffer[MAX_PATH + 1];
+    const DWORD length = GetTempPathW(MAX_PATH + 1, buffer);
+
+    // A length of zero signals failure.
+    if (length == 0) {
+        const DWORD error_code = GetLastError();
+        throw(Pathie::WindowsError(error_code));
+    }
+
+    const std::wstring wide_path(buffer, length);
+    return utf16_to_utf8(wide_path);
+#else
+#error Unsupported system.
+#endif
+}
+
+///@}
+
+/**
+ * Create a temporary directory (with permissions set to
+ * 0700 on UNIX). The directory is guaranteed to be empty, and
+ * it is your responsibility to recursively remove the
+ * directory on program exit (or earlier).
+ *
+ * \param[in] name (`"tmpd"`) This will be used as part of
+ * the name of the directory, _not_ as the entire name.
+ *
+ * \returns Path instance for the new temporary directory.
+ *
+ * \remark Parts of the random name are generated with the
+ * C rand() function, so you might want to call srand()
+ * before using this function in order to seed the random
+ * number generator with a useful value.
+ */
+Path Path::mktmpdir(const std::string& name /* = "tmpd" */)
+{
+    // The new directory lives directly below temp_dir().
+    Path created = Path::temp_dir() / Path(make_tempname(name));
+    created.mkdir();
+
+#ifdef _PATHIE_UNIX
+    // Restrict access to the owner.
+    const std::string native_path = created.native();
+    chmod(native_path.c_str(), S_IRWXU); // Silently ignore failure of setting file permissions
+#endif
+    // TODO: How to do that on Windows?
+
+    return created;
+}
+
+// Constructs a filename that tries to be unique.
+std::string Path::make_tempname(const std::string& namepart)
+{
+    // Time component: "YYYYMMDD-HHMMSS" is 15 characters plus NUL.
+    time_t now;
+    time(&now);
+    struct tm* p_nowinfo = localtime(&now);
+
+    char stamp[16];
+    memset(stamp, '\0', sizeof(stamp));
+    strftime(stamp, sizeof(stamp), "%Y%m%d-%H%M%S", p_nowinfo);
+    const std::string timepart(stamp, 15);
+
+    // Process component: the ID of the current process.
+    std::stringstream pid_stream;
+#if defined(_PATHIE_UNIX)
+    pid_stream << getpid();
+#elif defined(_WIN32)
+    pid_stream << GetCurrentProcessId();
+#else
+#error Unsupported system.
+#endif
+    const std::string pidpart = pid_stream.str();
+
+    // Random component: 16 letters are drawn, the first 15 are used.
+    char letters[16];
+    for (int k = 0; k < 16; k++)
+        letters[k] = 97 + rand() % 26; // Random char between a and z
+
+    const std::string randompart(letters, 15);
+
+    return namepart + "_" + timepart + pidpart + randompart;
+}
+
+#if defined(_PATHIE_UNIX)
+/**
+ * \note Only available on UNIX. Accesses the file system.
+ *
+ * Returns $XDG_DATA_DIRS as per the XDG specification.
+ * If that is not set, returns a vector of paths for
+ * /usr/local/share and /usr/share.
+ */
+std::vector<Path> Path::data_dirs()
+{
+    // The second argument is the list used when the variable is unset or empty.
+    std::vector<Path> dirs = get_xdg_dirlist("XDG_DATA_DIRS", "/usr/local/share/:/usr/share/");
+    return dirs;
+}
+
+/**
+ * \note Only available on UNIX. Accesses the file system.
+ *
+ * Returns $XDG_CONFIG_DIRS as per the XDG specification.
+ * If that is not set, returns a vector of paths for
+ * /etc/xdg (i.e. a one-element vector).
+ */
+std::vector<Path> Path::config_dirs()
+{
+    // The second argument is the list used when the variable is unset or empty.
+    std::vector<Path> dirs = get_xdg_dirlist("XDG_CONFIG_DIRS", "/etc/xdg");
+    return dirs;
+}
+#endif
+
+/** \name User data directories
+ *
+ * Directories that contain user data like music or text files
+ * the user works with.
+ */
+///@{
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the directory of the user’s desktop. Generally,
+ * any files placed in this directory will appear on the
+ * user’s desktop view (the area shown when no windows
+ * are open).
+ *
+ * On UNIX, this is $XDG_DESKTOP_DIR, defaulting to `~/Desktop`.
+ * Note you likely will receive a localised version (like “Schreibtisch”
+ * on a German Linux).
+ *
+ * On Windows, the default is `C:/Users/username/Desktop` or a localised
+ * version.
+ */
+Path Path::desktop_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("DESKTOP"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_DESKTOPDIRECTORY, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the directory for the user’s documents. This is
+ * not the place for your data files, savegames, or configuration
+ * files -- it is meant only for textual and other documents you can
+ * access with an office or similar program. See data_dir() for a directory
+ * you can store your data into.
+ *
+ * On UNIX, this is $XDG_DOCUMENTS_DIR, defaulting to `~/Documents`.
+ * Note you likely will receive a localised version (like "Dokumente"
+ * on a German Linux).
+ *
+ * On Windows, the default is `C:/Users/username/Documents` or a localised
+ * version.
+ */
+Path Path::documents_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("DOCUMENTS"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_PERSONAL, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s download directory. Unfortunately, this function
+ * is currently unsupported under Windows, because MinGW has not yet
+ * adapted the necessary win32api changes.
+ *
+ * On UNIX, this is $XDG_DOWNLOAD_DIR, defaulting to `~/Downloads`.
+ * Note you likely will receive a localised version.
+ */
+Path Path::download_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("DOWNLOAD"));
+#elif defined(_WIN32)
+  // The download folder has no CSIDL constant; it is only reachable through
+  // the newer KNOWNFOLDERID system, which MinGW does not support yet.
+  static const char* const message = "KNOWNFOLDERID is not supported by MinGW yet, can't retrieve this directory.";
+  throw std::runtime_error(message);
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s music directory.
+ *
+ * On UNIX, this is $XDG_MUSIC_DIR, defaulting to `~/Music`.
+ * Note you likely will receive a localised version (like "Musik"
+ * on a German Linux).
+ *
+ * On Windows, this defaults to `C:/users/username/Music` or a localised
+ * version.
+ */
+Path Path::music_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("MUSIC"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_MYMUSIC, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s pictures directory.
+ *
+ * On UNIX, this is $XDG_PICTURES_DIR, defaulting to `~/Pictures`.
+ * Note you likely will receive a localised version (like "Bilder"
+ * on a German Linux).
+ *
+ * On Windows, this defaults to `C:/users/username/Pictures` or a
+ * localised version.
+ */
+Path Path::pictures_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("PICTURES"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_MYPICTURES, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s publicshare directory. This directory may
+ * be exposed to network access on the local network, though this
+ * is not required.
+ *
+ * On UNIX, this is $XDG_PUBLICSHARE_DIR, defaulting to `~/Public`.
+ * Note you likely will receive a localised version (like "Öffentlich"
+ * on a German Linux).
+ *
+ * On Windows, this defaults to `C:/users/username/AppData/Roaming/Microsoft/Windows/Network Shortcuts`.
+ */
+Path Path::publicshare_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("PUBLICSHARE"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_NETHOOD, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s directory for document templates. The files
+ * in this directory can generally be accessed by right-clicking
+ * in the user’s favourite file manager and selecting "new" followed
+ * by the desired file. The file will then be copied from the templates
+ * directory into the directory the user works in at the moment.
+ *
+ * On UNIX, this is $XDG_TEMPLATES_DIR, defaulting to `~/Templates`.
+ * Note you likely will receive a localised version (like "Vorlagen"
+ * on a German Linux).
+ *
+ * On Windows, this defaults to `C:/users/username/AppData/Roaming/Microsoft/Windows/Templates`.
+ */
+Path Path::templates_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("TEMPLATES"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_TEMPLATES, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s directory for videos.
+ *
+ * On UNIX, this is $XDG_VIDEOS_DIR, defaulting to `~/Videos`
+ * or a localised version.
+ *
+ * On Windows, this defaults to `C:/users/username/Videos` or a
+ * localised version.
+ */
+Path Path::videos_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path(get_xdg_userdir_setting("VIDEOS"));
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_MYVIDEO, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the file system.
+ *
+ * Retrieves the user’s path for application starters. On UNIX,
+ * this will return a directory (typically `~/.local/share/applications`)
+ * where you can store XDG `.desktop` files in so they get picked up
+ * by the desktop environment’s application menu for that user. On Windows,
+ * the user’s startmenu folder is returned, and any files and directories
+ * you add there will show up in the user’s startmenu.
+ *
+ * \remark On Windows, this is not the global startmenu folder, but the
+ * user’s specific ones. Other users will not have the entries you put
+ * here in their startmenu.
+ */
+Path Path::appentries_dir()
+{
+#if defined(_PATHIE_UNIX)
+  // The "applications" subdirectory of the user's data directory.
+  return data_dir().join("applications");
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_STARTMENU, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+///@}
+
+/** \name Global data directories
+ *
+ * Directories that contain data either unrelated to users at all,
+ * or applicable to all users at once. Be careful to read the
+ * Windows notes in the documentation of these methods, as Windows
+ * only supplies a much smaller set of system directories than UNIX.
+ */
+///@{
+
+/**
+ * Retrieves the global directory for application starters. On UNIX,
+ * any XDG `.desktop` files you place there should show up in any user’s
+ * desktop environment’s menu, and on Windows, anything you place there
+ * should show up in any user’s startmenu.
+ *
+ * \param local (true) If true, this method returns the location
+ * under the `/usr/local` hierarchy, otherwise it returns the
+ * location under the `/usr` hierarchy. This parameter has no meaning
+ * on Windows and is ignored.
+ */
+Path Path::global_appentries_dir(localpathtype local)
+{
+#if defined(_PATHIE_UNIX)
+  // The configured default is only consulted when the caller asked for LOCALPATH_DEFAULT.
+  const bool prefer_local = (local == Path::LOCALPATH_LOCAL)
+    || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL);
+
+  return Path(prefer_local ? "/usr/local/share/applications" : "/usr/share/applications");
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_COMMON_STARTMENU, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Retrieves the directory for immutable application data that isn’t user-specific,
+ * i.e. which shall be available to all users using the system.
+ *
+ * On UNIX, this is `/usr/share`. On Windows, this is `C:/Windows/system32`.
+ * On Windows, beware conflicts with files of the same name in
+ * global_config_dir()!
+ *
+ * \param local (true) If true, this method returns the location
+ * under the `/usr/local` hierarchy, otherwise it returns the
+ * location under the `/usr` hierarchy. This parameter has no meaning
+ * under Windows and is ignored.
+ */
+Path Path::global_immutable_data_dir(localpathtype local)
+{
+#if defined(_PATHIE_UNIX)
+  // The configured default is only consulted when the caller asked for LOCALPATH_DEFAULT.
+  const bool prefer_local = (local == Path::LOCALPATH_LOCAL)
+    || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL);
+
+  return Path(prefer_local ? "/usr/local/share" : "/usr/share");
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_SYSTEM, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Retrieves the directory for mutable application data that isn’t user-specific,
+ * i.e. which shall be available to all users using the system.
+ *
+ * On UNIX, this is `/var/lib`. On Windows, this is the Application Data folder
+ * for the "All Users" account. On Windows, this is equivalent to global_cache_dir(),
+ * so beware file name conflicts on Windows!
+ *
+ * \param local (true) If true, this method returns the location
+ * under the `/var/local` hierarchy, otherwise it returns the
+ * location under the `/var` hierarchy. This parameter has no meaning
+ * under Windows and is ignored.
+ */
+Path Path::global_mutable_data_dir(localpathtype local)
+{
+#if defined(_PATHIE_UNIX)
+  // The configured default is only consulted when the caller asked for LOCALPATH_DEFAULT.
+  const bool prefer_local = (local == Path::LOCALPATH_LOCAL)
+    || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL);
+
+  return Path(prefer_local ? "/var/local/lib" : "/var/lib");
+#elif defined (_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_COMMON_APPDATA, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system
+#endif
+}
+
+/**
+ * Retrieves the directory for global cache data, i.e. data, which
+ * is not essential to the program and can be reconstructed if it
+ * gets lost.
+ *
+ * On UNIX, this returns `/var/cache`. Windows does not have a notion
+ * of such a directory, hence the value is equal to the return value
+ * of global_mutable_data_dir(). Therefore: On Windows, beware conflicts if you
+ * use files of the same name in global_mutable_data_dir() and
+ * global_cache_dir()!
+ *
+ * \param local (true) If true, returns the cache directory for locally installed
+ * programs, which is `/var/local/cache`. This parameter has no effect under
+ * systems other than UNIX.
+ */
+Path Path::global_cache_dir(localpathtype local)
+{
+#if defined(_PATHIE_UNIX)
+  // The configured default is only consulted when the caller asked for LOCALPATH_DEFAULT.
+  const bool prefer_local = (local == Path::LOCALPATH_LOCAL)
+    || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL);
+
+  return Path(prefer_local ? "/var/local/cache" : "/var/cache");
+#elif defined(_WIN32)
+  // No separate cache location here: reuse the mutable data directory.
+  return global_mutable_data_dir();
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note On UNIX, this method accesses the filesystem.
+ *
+ * Returns the directory for volatile information that will be deleted
+ * on system shutdown.
+ *
+ * On UNIX, this returns `/run` if it exists, otherwise `/var/run`.
+ * Windows does not have a notion of such a directory; as a replacement,
+ * `C:/Temp` is returned.
+ *
+ * \param local (true) If true, returns the equivalent directory for
+ * `/run` for locally installed programs, which is `/var/local/run`. This
+ * parameter has no effect on systems other than UNIX.
+ */
+Path Path::global_runtime_dir(localpathtype local)
+{
+#if defined(_PATHIE_UNIX)
+  // The configured default is only consulted when the caller asked for LOCALPATH_DEFAULT.
+  const bool prefer_local = (local == Path::LOCALPATH_LOCAL)
+    || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL);
+  if (prefer_local)
+    return Path("/var/local/run");
+
+  // Use /run when it exists, otherwise fall back to /var/run.
+  Path candidate("/run");
+  if (!candidate.exists())
+    return Path("/var/run");
+
+  return candidate;
+#elif defined(_WIN32)
+  return Path("C:/Temp");
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Returns the global directory for configuration files.
+ *
+ * On UNIX, this is `/etc`. Windows does not really have a notion
+ * for configuration directories. This method returns the Windows
+ * system folder for that purpose, typically `C:/Windows/system32`;
+ * this is equivalent to global_immutable_data_dir(), so be careful
+ * when you place files of the same name in global_config_dir()!
+ *
+ * \param local (true) If true, returns the global configuration
+ * directory for locally installed programs instead, which is
+ * `/usr/local/etc`.
+ */
+Path Path::global_config_dir(localpathtype local)
+{
+#if defined(_PATHIE_UNIX)
+  // The configured default is only consulted when the caller asked for LOCALPATH_DEFAULT.
+  const bool prefer_local = (local == Path::LOCALPATH_LOCAL)
+    || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL);
+
+  return Path(prefer_local ? "/usr/local/etc" : "/etc");
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_SYSTEM, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * Retrieves the global directory for self-contained applications, i.e.
+ * applications that require a directory structure different from the
+ * Filesystem Hierarchy Standard (FHS). Such programs are an exception
+ * under UNIX, but are the regular case on Windows. The programs placed
+ * in this directory are intended to be available to all users using the
+ * system.
+ *
+ * Under UNIX, this method returns the `/opt` directory. On Windows,
+ * it returns the Program Files directory (typically `C:\Program Files`).
+ *
+ * \note On UNIX, the FHS mandates that programs installed under
+ * `/opt` do not use the usual directories for variable information
+ * returned by global_mutable_data_dir() and global_cache_dir(), but
+ * instead use `/var/opt`.
+ */
+Path Path::global_programs_dir()
+{
+#if defined(_PATHIE_UNIX)
+  return Path("/opt");
+#elif defined(_WIN32)
+  wchar_t folder[MAX_PATH];
+  const HRESULT hr = SHGetFolderPathW(NULL, CSIDL_PROGRAM_FILES, NULL, SHGFP_TYPE_CURRENT, folder);
+
+  // Anything other than S_OK is reported to the caller as an exception.
+  if (hr == S_OK)
+    return Path(utf16_to_utf8(folder));
+
+  throw(Pathie::WindowsHresultError(hr));
+#else
+#error Unsupported system.
+#endif
+}
+
+///@}
+
+/** \name Miscellaneous static functions
+ *
+ * Other functions that didn’t fit somewhere else.
+ */
+///@{
+
+/// \note This method accesses the filesystem.
+///
+/// Uses a shell-like glob pattern on the current working directory.
+/// Typically available patterns include "*" for a string of
+/// arbitrary length and "?" for a string of length one.
+///
+/// Refer to glob(7) for glob patterns available on UNIX.
+/// Refer to [MSDN](http://msdn.microsoft.com/en-us/library/windows/desktop/aa364418%28v=vs.85%29.aspx)
+/// for glob patterns available on Windows.
+///
+/// Windows does not support recursive patterns like
+///
+/// \verbatim **/* \endverbatim
+///
+/// or
+///
+/// \verbatim foo/*/bar \endverbatim
+///
+/// . This will result in a Pathie::WindowsError exception
+/// with Windows error code 123 (“invalid filename”). For cross-platform
+/// recursive matching, you can try to combine find() and fnmatch().
+///
+/// \param[in] pattern Glob pattern.
+/// \param flags (`0`) Globbing flags. Refer to glob(3) for
+/// possible values; the parameter is ignored on Windows.
+///
+/// \returns A vector of Path instances that matched the glob
+/// pattern.
+///
+/// \remark Glob patterns on UNIX are generally much more powerful than
+/// those on Windows. Be careful when using anything apart from "*" and "?"
+/// patterns on Windows.
+///
+/// \see dglob() fnmatch()
+///
+std::vector<Path> Path::glob(const std::string& pattern, int flags /* = 0 */)
+{
+#if defined(_PATHIE_UNIX)
+  std::string native_pattern = utf8_to_filename(pattern);
+
+  // Zero the structure so that globfree() is safe on every exit path and
+  // flags such as GLOB_DOOFFS never read an uninitialised gl_offs.
+  glob_t globinfo;
+  memset(&globinfo, 0, sizeof(globinfo));
+
+  int status = ::glob(native_pattern.c_str(), flags, NULL, &globinfo);
+
+  if (status != 0) {
+    // glob() may have allocated partial results even though it failed.
+    globfree(&globinfo);
+
+    if (status == GLOB_NOMATCH)
+      return std::vector<Path>(); // Empty vector
+
+    throw(GlobError(status));
+  }
+
+  std::vector<Path> matches;
+  try {
+    matches.reserve(globinfo.gl_pathc);
+    for(size_t i=0; i < globinfo.gl_pathc; i++) {
+      matches.push_back(Path(filename_to_utf8(globinfo.gl_pathv[i])));
+    }
+  }
+  catch(...) {
+    // The encoding conversion or the vector may throw; don't leak the match list.
+    globfree(&globinfo);
+    throw;
+  }
+
+  globfree(&globinfo);
+  return matches;
+#elif defined(_WIN32)
+  std::vector<Path> results;
+  std::wstring utf16_pattern = utf8_to_utf16(pattern);
+
+  /* Windows' FindFirstFile()/FindNextFile() returns bare file names.
+   * However, to ensure output similar to the UNIX version, we prepend
+   * the pattern's stem if a slash / is found in the pattern; FindFirstFile()/
+   * FindNextFile() don't support recursive matching anyway, so this is safe. */
+  std::string stem;
+  size_t pos = pattern.rfind("/");
+  if (pos != std::string::npos)
+    stem = pattern.substr(0, pos + 1); // Trailing / included
+
+  // Prepare
+  WIN32_FIND_DATAW finddata;
+  memset(&finddata, '\0', sizeof(finddata));
+
+  // Try finding the first file
+  HANDLE filehandle = FindFirstFileW(utf16_pattern.c_str(), &finddata);
+
+  // Check if some error happened
+  if (filehandle == INVALID_HANDLE_VALUE) {
+    DWORD errval = GetLastError();
+    if (errval == ERROR_FILE_NOT_FOUND) // According to docs, this means no matching files were found. Return empty list.
+      return results;
+
+    // Never continue with an invalid handle, whatever the error code says.
+    throw Pathie::WindowsError(errval);
+  }
+
+  DWORD errval = ERROR_SUCCESS;
+  try {
+    // Save the first hit, then keep going until FindNextFileW() reports failure.
+    do {
+      results.push_back(Path(stem + utf16_to_utf8(finddata.cFileName)));
+    } while (FindNextFileW(filehandle, &finddata));
+
+    // Must be read before any other API call can overwrite it.
+    errval = GetLastError();
+  }
+  catch(...) {
+    // The conversion or the vector may throw; don't leak the find handle.
+    FindClose(filehandle);
+    throw;
+  }
+
+  FindClose(filehandle);
+
+  if (errval != ERROR_NO_MORE_FILES)
+    throw(Pathie::WindowsError(errval));
+
+  return results;
+#else
+#error Unsupported system.
+#endif
+}
+
+///@}
+
+/** \name Miscellaneous member functions
+ *
+ * Methods that didn’t fit anywhere else.
+ */
+
+///@{
+
+/**
+ * This method tests whether the referenced path matches the
+ * given pattern under the rules of the local glob-matching
+ * function. Note this method does _not_ access the filesystem,
+ * hence there is no guarantee that the referenced path exists.
+ *
+ * \param[in] pattern The pattern to match.
+ * \param flags Any flags. This parameter is ignored on Windows,
+ * for UNIX refer to the fnmatch(3) manpage.
+ *
+ * \returns Whether the path matches the pattern.
+ *
+ * \remark On Windows, this method uses the [PathMatchSpec()](http://msdn.microsoft.com/en-us/library/bb773727%28VS.85%29.aspx)
+ * function; on UNIX, it uses fnmatch(3).
+ *
+ * \remark Windows’s `PathMatchSpec()` function does not support
+ * recursive matching patterns, while the UNIX fnmatch(3), relying
+ * on glob(7), does.
+ *
+ * \remark Glob patterns on UNIX are generally much more powerful than
+ * those on Windows. Be careful when using anything apart from "*" and "?"
+ * patterns on Windows.
+ *
+ * \see glob() dglob()
+ */
+bool Path::fnmatch(const std::string& pattern, int flags /* = 0 */) const
+{
+#if defined(_PATHIE_UNIX)
+  // Both the path and the pattern are passed to fnmatch(3) in native encoding.
+  const std::string native_path = native();
+  const std::string native_pattern = utf8_to_filename(pattern);
+
+  const int status = ::fnmatch(native_pattern.c_str(), native_path.c_str(), flags);
+  return status == 0;
+#elif defined(_WIN32)
+  const std::wstring wide_path = utf8_to_utf16(m_path);
+  const std::wstring wide_pattern = utf8_to_utf16(pattern);
+
+  // PathMatchSpecW() yields a Win32 BOOL; turn it into a C++ bool explicitly.
+  return PathMatchSpecW(wide_path.c_str(), wide_pattern.c_str()) != FALSE;
+#else
+#error Unsupported system.
+#endif
+}
+
+/**
+ * \note This method accesses the filesystem.
+ *
+ * Like glob(), but prepends the referenced path to the glob
+ * pattern.
+ *
+ * \see glob() fnmatch()
+ */
+std::vector<Path> Path::dglob(const std::string& pattern, int flags /* = 0 */) const
+{
+  // Build "<this path>/<pattern>" and hand it to the static glob().
+  std::string full_pattern(m_path);
+  full_pattern += "/";
+  full_pattern += pattern;
+
+  return glob(full_pattern, flags);
+}
+
+/**
+ * Appends a /, then the new component, and
+ * finally returns a new Path instance.
+ *
+ * \param path New component.
+ *
+ * \returns New Path instance.
+ */
+Path Path::join(Path path) const
+{
+  // The receiver is left untouched; the joined path is a fresh instance.
+  return Path(m_path + "/" + path.str());
+}
+
+/**
+ * Appends a /, then the new component, and
+ * finally returns a new Path instance.
+ *
+ * \param str New component.
+ *
+ * \returns New Path instance.
+ */
+Path Path::join(std::string str) const
+{
+  // The receiver is left untouched; the joined path is a fresh instance.
+  return Path(m_path + "/" + str);
+}
+
+/**
+ * Replaces the current extension with the given new extension
+ * and returns the result. If the referenced path doesn’t have
+ * a file extension currently, the new extension is appended.
+ *
+ * \param new_extension The new extension. If the leading point
+ * is missing, it will automatically be prepended.
+ *
+ * \returns The new Path instance.
+ */
+Path Path::sub_ext(std::string new_extension) const
+{
+  // Only a *leading* point counts: "tar.gz" contains a point but still
+  // needs one prepended to become ".tar.gz".
+  if (new_extension.empty() || new_extension[0] != '.')
+    new_extension.insert(0, ".");
+
+  std::string old_extension = extension();
+  if (old_extension.empty())
+    return Path(m_path + new_extension);
+
+  // Search from the end: the extension text may also occur earlier in the
+  // path (e.g. "a.txt/b.txt"), and only the trailing occurrence is replaced.
+  size_t pos = m_path.rfind(old_extension);
+  return Path(m_path.substr(0, pos) + new_extension);
+}
+
+///@}
diff --git a/src/3rd_party/pathie-cpp/src/pathie.cpp b/src/3rd_party/pathie-cpp/src/pathie.cpp
new file mode 100644
index 00000000..9df1f733
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/pathie.cpp
@@ -0,0 +1,226 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../include/pathie.hpp"
+#include "../include/errors.hpp"
+
+#if defined(_WIN32)
+#include <windows.h>
+
+/**
+ * Converts a UTF-16LE string into UTF-8. Only available
+ * on Windows.
+ *
+ * \param str The UTF-16 string to convert. May be empty.
+ *
+ * \returns The UTF-8 representation of `str`.
+ *
+ * \throws Pathie::WindowsError if WideCharToMultiByte() fails.
+ */
+std::string Pathie::utf16_to_utf8(std::wstring str)
+{
+  // WideCharToMultiByte() treats a zero-length input as an error, so an
+  // empty string has to be answered without calling it.
+  if (str.empty())
+    return std::string();
+
+  const int length = static_cast<int>(str.length());
+
+  // First call: ask for the required buffer size in bytes.
+  int size = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), length, NULL, 0, NULL, NULL);
+  if (size == 0)
+    throw(Pathie::WindowsError(GetLastError()));
+
+  // A std::string buffer is released automatically, even if we throw below.
+  std::string utf8(size, '\0');
+
+  // Second call: do the actual conversion.
+  size = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), length, &utf8[0], size, NULL, NULL);
+  if (size == 0)
+    throw(Pathie::WindowsError(GetLastError()));
+
+  utf8.resize(size);
+  return utf8;
+}
+
+/**
+ * Converts a UTF-8 string into UTF-16LE. Only available
+ * on Windows.
+ *
+ * \param str The UTF-8 string to convert. May be empty.
+ *
+ * \returns The UTF-16 representation of `str`.
+ *
+ * \throws Pathie::WindowsError if MultiByteToWideChar() fails.
+ */
+std::wstring Pathie::utf8_to_utf16(std::string str)
+{
+  // MultiByteToWideChar() treats a zero-length input as an error, so an
+  // empty string has to be answered without calling it.
+  if (str.empty())
+    return std::wstring();
+
+  const int length = static_cast<int>(str.length());
+
+  // First call: ask for the required buffer size in wide characters.
+  int count = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0);
+  if (count == 0)
+    throw(Pathie::WindowsError(GetLastError()));
+
+  // A std::wstring buffer is released automatically, even if we throw below.
+  std::wstring utf16(count, L'\0');
+
+  // Second call: do the actual conversion.
+  count = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &utf16[0], count);
+  if (count == 0)
+    throw(Pathie::WindowsError(GetLastError()));
+
+  utf16.resize(count);
+  return utf16;
+}
+#endif
+
+#ifdef _PATHIE_UNIX
+#include <cstring>
+#include <cstdlib>
+#include <errno.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <sys/param.h> // defines "BSD" macro on BSD systems
+
+/* iconv() function family is available on every POSIX-conformant
+ * system. In POSIX.1-2008, it’s specified in the "System Interfaces"
+ * section.
+ *
+ * nl_langinfo() is also specified by POSIX, though I’ve found no evidence
+ * that iconv() is required to understand the encoding output by nl_langinfo(CODESET).
+ * From checking on Linux and FreeBSD, this however seems very likely, so we have
+ * to assume that this always is the case.
+ */
+
+/**
+ * This function converts the given string from the given source encoding
+ * to another given target encoding and returns the result as a std::string.
+ *
+ * \param[in] from_encoding Convert from this encoding.
+ * \param[in] to_encoding   Convert into this encoding.
+ * \param[in] string        The string to convert.
+ *
+ * \returns The converted string.
+ *
+ * \throws Pathie::ErrnoError if the converter cannot be opened or if
+ * iconv() fails for a reason other than lack of output space.
+ *
+ * \remark See the output of the `iconv --list` command for a list of
+ * supported encodings.
+ */
+std::string Pathie::convert_encodings(const char* from_encoding, const char* to_encoding, const std::string& string)
+{
+  // iconv() wants a mutable input pointer, so work on a copy. The extra
+  // trailing NUL guarantees that &input[0] is valid even for empty input.
+  // Copying via std::string keeps embedded NUL bytes intact and releases
+  // the memory on every exit path.
+  std::string input(string);
+  input.push_back('\0');
+
+  // There is no way to know in advance how much space iconv() will need,
+  // so start with a guess and grow whenever it reports E2BIG.
+  std::string output(string.length() + 16, '\0');
+
+  // Set up the encoding converter
+  iconv_t converter = iconv_open(to_encoding, from_encoding);
+  if (converter == (iconv_t) -1)
+    throw Pathie::ErrnoError(errno);
+
+  char* inbuf = &input[0];
+  size_t inbytes_left = string.length();
+  size_t written = 0; // Bytes of `output' filled so far
+  int errsav = 0;
+
+  try {
+    while(true) {
+      // Recompute the write position on each round: resizing may have
+      // moved the buffer, and conversion must resume after what has
+      // already been written rather than at the start.
+      char* outbuf = &output[0] + written;
+      size_t outbytes_left = output.size() - written;
+
+      errno = 0;
+#ifdef BSD
+      // What the heck. FreeBSD violates POSIX.1-2008: it declares iconv()
+      // differently than mandated by POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html
+      // (it declares a `const' where it must not be).
+      size_t rc = iconv(converter, const_cast<const char**>(&inbuf), &inbytes_left, &outbuf, &outbytes_left);
+#else
+      size_t rc = iconv(converter, &inbuf, &inbytes_left, &outbuf, &outbytes_left);
+#endif
+      // errno is only meaningful when iconv() signalled failure.
+      errsav = (rc == (size_t) -1) ? errno : 0;
+      written = output.size() - outbytes_left;
+
+      if (errsav != E2BIG) {
+        break;
+      }
+
+      output.resize(output.size() * 2);
+    }
+  }
+  catch(...) {
+    // Growing the buffer may throw; don't leak the converter.
+    iconv_close(converter);
+    throw;
+  }
+
+  iconv_close(converter);
+
+  if (errsav != 0) {
+    throw(Pathie::ErrnoError(errsav));
+  }
+
+  output.resize(written);
+  return output;
+}
+
+/**
+ * Converts the given UTF-8 string into the native filename encoding.
+ *
+ * When building for Apple systems or with PATHIE_ASSUME_UTF8_ON_UNIX
+ * defined, the native encoding is taken to be UTF-8 and the string is
+ * returned unchanged. Otherwise the encoding reported by
+ * nl_langinfo(CODESET) is used.
+ *
+ * \param[in] utf8 The UTF-8 string to convert.
+ *
+ * \returns The string in the native filename encoding.
+ */
+std::string Pathie::utf8_to_filename(const std::string& utf8)
+{
+#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
+  // Native encoding is UTF-8 by definition here; no encoding lookup is
+  // compiled into this branch at all.
+  return std::string(utf8);
+#else
+  const char* fsencoding = nl_langinfo(CODESET);
+
+  // Skip the expensive convert_encodings() call if the filesystem
+  // encoding already is UTF-8.
+  if (strcmp(fsencoding, "UTF-8") == 0) {
+    return std::string(utf8);
+  }
+
+  return convert_encodings("UTF-8", fsencoding, utf8);
+#endif
+}
+
+/**
+ * Converts the given string in native filesystem encoding to
+ * UTF-8.
+ *
+ * When building for Apple systems or with PATHIE_ASSUME_UTF8_ON_UNIX
+ * defined, the native encoding is taken to be UTF-8 and the string is
+ * returned unchanged. Otherwise the encoding reported by
+ * nl_langinfo(CODESET) is used.
+ *
+ * \param[in] native_filename The string in native filename encoding.
+ *
+ * \returns The string converted to UTF-8.
+ */
+std::string Pathie::filename_to_utf8(const std::string& native_filename)
+{
+#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
+  // Native encoding is UTF-8 by definition here; no encoding lookup is
+  // compiled into this branch at all.
+  return std::string(native_filename);
+#else
+  const char* fsencoding = nl_langinfo(CODESET);
+
+  // Skip the expensive convert_encodings() call if the filesystem
+  // encoding already is UTF-8.
+  if (strcmp(fsencoding, "UTF-8") == 0) {
+    return std::string(native_filename);
+  }
+
+  return convert_encodings(fsencoding, "UTF-8", native_filename);
+#endif
+}
+#endif
diff --git a/src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp b/src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp
new file mode 100644
index 00000000..06b80731
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp
@@ -0,0 +1,320 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../include/pathie_ifstream.hpp"
+
+#include <cstdlib>
+
+namespace Pathie {
+#if defined(_PATHIE_UNIX)
+ // All well and easy under UNIX. Just delegate to standard constructor.
+ Pathie::ifstream::ifstream(Pathie::Path path, std::ios_base::openmode mode)
+ : std::ifstream(path.native().c_str(), mode)
+ {
+ //
+ }
+
+ Pathie::ifstream::ifstream()
+ : std::ifstream()
+ {
+ //
+ }
+
+ Pathie::ifstream::ifstream(std::string path, std::ios_base::openmode mode)
+ : std::ifstream(utf8_to_filename(path).c_str(), mode)
+ {
+ //
+ }
+
+ Pathie::ifstream::ifstream(char* path, std::ios_base::openmode mode)
+ : std::ifstream(utf8_to_filename(path).c_str(), mode)
+ {
+ //
+ }
+
+ void Pathie::ifstream::open(const char* filename, ios_base::openmode mode)
+ {
+ std::string filename_nstr = utf8_to_filename(filename);
+ std::ifstream::open(filename_nstr.c_str(), mode);
+ }
+
+ void Pathie::ifstream::open(const std::string& filename, ios_base::openmode mode)
+ {
+ std::ifstream::open(utf8_to_filename(filename).c_str(), mode);
+ }
+
+ void Pathie::ifstream::open(const Pathie::Path& filename, ios_base::openmode mode)
+ {
+ std::ifstream::open(filename.native().c_str(), mode);
+ }
+
+
+#elif defined (_WIN32)
+# if defined(_MSC_VER)
+ // Easy again under MSVC under Windows; using Microsoft’s nonstandard constructor
+ // for Unicode filenames.
+ // It is documented here: http://msdn.microsoft.com/en-us/library/8et8s826.aspx
+ Pathie::ifstream::ifstream(Pathie::Path path, std::ios_base::openmode mode)
+ : std::ifstream(path.native(), mode)
+ {
+ //
+ }
+
+ Pathie::ifstream::ifstream()
+ : std::ifstream()
+ {
+ //
+ }
+
+ Pathie::ifstream::ifstream(std::string path, std::ios_base::openmode mode)
+ : std::ifstream(path, mode)
+ {
+ //
+ }
+
+ Pathie::ifstream::ifstream(char* path, std::ios_base::openmode mode)
+ : std::ifstream(path, mode)
+ {
+ //
+ }
+# elif defined(__GNUC__)
+ // This one is tough, but solvable. There’s a nonstandard C++ extension by the
+ // GCC team to create a C++ stream from a file descriptor and similar.
+ // It is documented here: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a00054.html
+
+ /**
+ * Default constructor for deferred initialisation via open().
+ * Beware that before you called open(), any methods other than
+ * is_open() may behave unexpectedly!
+ */
+ Pathie::ifstream::ifstream()
+ : std::basic_istream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ }
+
+ /**
+ * Construct a stream for the given UTF-8 file path.
+ *
+ * \param[in] filename The path to open the stream for. UTF-8.
+ * \param mode Mode to open the file in.
+ */
+ Pathie::ifstream::ifstream(const char* filename, ios_base::openmode mode)
+ : std::basic_istream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ this->open(filename, mode);
+ }
+
+ /**
+ * Construct a stream for the given UTF-8 file path.
+ *
+ * \param[in] filename The path to open the stream for. UTF-8.
+ * \param mode Mode to open the file in.
+ */
+ Pathie::ifstream::ifstream(const std::string& filename, ios_base::openmode mode)
+ : std::basic_istream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ this->open(filename, mode);
+ }
+
+ /**
+ * Construct a stream for the given Pathie::Path instance.
+ *
+ * \param[in] filename The path to open the stream for. A Pathie::Path instance.
+ * \param mode Mode to open the file in.
+ */
+ Pathie::ifstream::ifstream(const Pathie::Path& filename, ios_base::openmode mode)
+ : std::basic_istream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ this->open(filename, mode);
+ }
+
+ /**
+  * Destructor. Closes the file if it is still open, then releases the
+  * storage that the constructor malloc()ed for the file buffer.
+  */
+ Pathie::ifstream::~ifstream()
+ {
+   // m_buffer_allocated is set by a successful open() and cleared by
+   // close(), so it tells us whether a constructed filebuffer and an
+   // open FILE* are still around. Previously the FILE* obtained from
+   // _wfopen() was leaked when the stream was destroyed without an
+   // explicit close(). setstate() is deliberately not used here so
+   // the destructor cannot throw.
+   if (m_buffer_allocated) {
+     mp_filebuffer->close();
+     fclose(mp_file);
+   }
+
+   free(mp_filebuffer);
+ }
+
+ /**
+ * The underlying buffer.
+ */
+ __gnu_cxx::stdio_filebuf<char>* Pathie::ifstream::rdbuf() const
+ {
+ return mp_filebuffer;
+ }
+
+ /**
+ * Checks whether the stream has been open()ed already. This is the only
+ * method safe to use before you called open() on a stream constructed
+ * with the default constructor (apart from open() itself of course).
+ */
+ bool Pathie::ifstream::is_open() const
+ {
+ if (!m_buffer_allocated)
+ return false;
+
+ return mp_filebuffer->is_open();
+ }
+
+ /**
+ * Open the given UTF-8 file path in this stream. You can call this anytime
+ * after you constructed an instance with the default constructor; otherwise,
+ * you have to close() whatever was opened before you call this method.
+ *
+ * \param[in] filename UTF-8 filename to open
+ * \param mode Mode to open the stream in.
+ */
+ void Pathie::ifstream::open(const char* filename, ios_base::openmode mode)
+ {
+ std::wstring w_filename = Pathie::utf8_to_utf16(filename);
+
+ mp_file = _wfopen(w_filename.c_str(), L"r"); // Mode will be overridden
+ if (!mp_file) {
+ setstate(ios_base::failbit);
+ return;
+ }
+
+ /* The following construction uses a “placement new” as it appears
+ * to be the only "clean" solution applicable. The init() method,
+ * an internum of the GCC implementation of basic_istream that
+ * needs to be called in the stream’s constructor, requires a
+ * pointer to the filebuffer object. However, we do not have that
+ * filebuffer object at hand in the constructor, the
+ * __gnu_cxx::stdio_filebuf instance will be created later when
+ * open() is called. It is impossible to construct it earlier,
+ * because it does not support a delayed open() call, the file
+ * descriptor or FILE* pointer must be passed during its
+ * construction, but we don’t have it there; it is available only
+ * in open() -- remember that you can create the ifstream instance
+ * without being attached to a file and then call open() later
+ * with a filename. To be able to pass something meaningful to
+ * init(), we have to "foresee" where in memory the stdio_filebuf
+ * instance will be created. This only is possible with a
+ * placement new into a place we have allocated previously using
+ * malloc().
+ *
+ * An alternative would be to use internal GCC APIs by duplicating
+ * the sourcecode of the __gnu_cxx::stdio_filebuf constructor; however
+ * undocumented internal APIs are never good to use. For informational
+ * purposes therefore the sourcecode link:
+ *
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a01222_source.html
+ */
+
+ new (mp_filebuffer) __gnu_cxx::stdio_filebuf<char>(mp_file, mode);
+ m_buffer_allocated = true;
+
+ if (!mp_filebuffer->is_open())
+ setstate(ios_base::failbit);
+ else
+ clear();
+ }
+
+ /**
+ * Open the given UTF-8 file path in this stream. You can call this anytime
+ * after you constructed an instance with the default constructor; otherwise,
+ * you have to close() whatever was opened before you call this method.
+ *
+ * \param[in] filename UTF-8 filename to open
+ * \param mode Mode to open the stream in.
+ */
+ void Pathie::ifstream::open(const std::string& filename, ios_base::openmode mode)
+ {
+ open(filename.c_str(), mode);
+ }
+
+ /**
+ * Open the given Pathie::Path in this stream. You can call this anytime
+ * after you constructed an instance with the default constructor; otherwise,
+ * you have to close() whatever was opened before you call this method.
+ *
+ * \param[in] filename Pathie::Path to open the stream for.
+ * \param mode Mode to open the stream in.
+ */
+ void Pathie::ifstream::open(const Pathie::Path& filename, ios_base::openmode mode)
+ {
+ open(filename.str(), mode);
+ }
+
+ /**
+  * Close the underlying file. Has no effect if no file is opened,
+  * which includes calling it again after the file was already closed.
+  * Sets the failbit if closing the file buffer fails.
+  */
+ void Pathie::ifstream::close()
+ {
+   if (mp_file) {
+     if (!mp_filebuffer->close())
+       setstate(ios_base::failbit);
+
+     // Do not deallocate, we may need it later if an open() call follows.
+     memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+     m_buffer_allocated = false;
+     fclose(mp_file);
+
+     // Forget the handle. Without this, a second close() would pass
+     // the check above, call close() on the zeroed buffer and
+     // fclose() the same FILE* again.
+     mp_file = NULL;
+   }
+ }
+# else
+# error Unsupported compiler: do not know how to open C++ stream on Unicode file.
+# endif
+#else
+# error Unsupported system.
+#endif
+};
diff --git a/src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp b/src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp
new file mode 100644
index 00000000..f1085043
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp
@@ -0,0 +1,326 @@
+/* -*- coding: utf-8 -*-
+ * This file is part of Pathie.
+ *
+ * Copyright © 2015, 2017 Marvin Gülker
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../include/pathie_ofstream.hpp"
+
+#if defined(_WIN32) && defined(__GNUC__)
+#include <cstdio>
+#include <cstdlib>
+#endif
+
+namespace Pathie {
+#if defined(_PATHIE_UNIX)
+ // All well and easy under UNIX. Just delegate to standard constructor.
+ Pathie::ofstream::ofstream(Pathie::Path path, std::ios_base::openmode mode)
+ : std::ofstream(path.native().c_str(), mode)
+ {
+ //
+ }
+
+ Pathie::ofstream::ofstream()
+ : std::ofstream()
+ {
+ //
+ }
+
+ Pathie::ofstream::ofstream(std::string path, std::ios_base::openmode mode)
+ : std::ofstream(utf8_to_filename(path).c_str(), mode)
+ {
+ //
+ }
+
+ Pathie::ofstream::ofstream(char* path, std::ios_base::openmode mode)
+ : std::ofstream(utf8_to_filename(path).c_str(), mode)
+ {
+ //
+ }
+
+ void Pathie::ofstream::open(const char* filename, ios_base::openmode mode)
+ {
+ std::string filename_nstr = utf8_to_filename(filename);
+ std::ofstream::open(filename_nstr.c_str(), mode);
+ }
+
+ void Pathie::ofstream::open(const std::string& filename, ios_base::openmode mode)
+ {
+ std::ofstream::open(utf8_to_filename(filename).c_str(), mode);
+ }
+
+ void Pathie::ofstream::open(const Pathie::Path& filename, ios_base::openmode mode)
+ {
+ std::ofstream::open(filename.native().c_str(), mode);
+ }
+
+
+#elif defined (_WIN32)
+# if defined(_MSC_VER)
+ // Easy again under MSVC under Windows; using Microsoft’s nonstandard constructor
+ // for Unicode filenames.
+ // It is documented here: http://msdn.microsoft.com/en-us/library/8et8s826.aspx
+ Pathie::ofstream::ofstream(Pathie::Path path, std::ios_base::openmode mode)
+ : std::ofstream(path.native(), mode)
+ {
+ //
+ }
+
+ Pathie::ofstream::ofstream()
+ : std::ofstream()
+ {
+ //
+ }
+
+ Pathie::ofstream::ofstream(std::string path, std::ios_base::openmode mode)
+ : std::ofstream(path, mode)
+ {
+ //
+ }
+
+ Pathie::ofstream::ofstream(char* path, std::ios_base::openmode mode)
+ : std::ofstream(path, mode)
+ {
+ //
+ }
+# elif defined(__GNUC__)
+ // This one is tough, but solvable. There’s a nonstandard C++ extension by the
+ // GCC team to create a C++ stream from a file descriptor and similar.
+ // It is documented here: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a00054.html
+
+ /**
+ * Default constructor for deferred initialisation via open().
+ * Beware that before you called open(), any methods other than
+ * is_open() may behave unexpectedly!
+ */
+ Pathie::ofstream::ofstream()
+ : std::basic_ostream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ }
+
+ /**
+ * Construct a stream for the given UTF-8 file path.
+ *
+ * \param[in] filename The path to open the stream for. UTF-8.
+ * \param mode Mode to open the file in.
+ */
+ Pathie::ofstream::ofstream(const char* filename, ios_base::openmode mode)
+ : std::basic_ostream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ this->open(filename, mode);
+ }
+
+ /**
+ * Construct a stream for the given UTF-8 file path.
+ *
+ * \param[in] filename The path to open the stream for. UTF-8.
+ * \param mode Mode to open the file in.
+ */
+ Pathie::ofstream::ofstream(const std::string& filename, ios_base::openmode mode)
+ : std::basic_ostream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ this->open(filename, mode);
+ }
+
+ /**
+ * Construct a stream for the given Pathie::Path instance.
+ *
+ * \param[in] filename The path to open the stream for. A Pathie::Path instance.
+ * \param mode Mode to open the file in.
+ */
+ Pathie::ofstream::ofstream(const Pathie::Path& filename, ios_base::openmode mode)
+ : std::basic_ostream<char, std::char_traits<char> >()
+ {
+ mp_file = NULL;
+ mp_filebuffer = NULL;
+ m_buffer_allocated = false;
+
+ // See the lengthy explanation in open() for why we do this here.
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>));
+ memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+
+ this->init(mp_filebuffer);
+ this->open(filename, mode);
+ }
+
+ /**
+  * Destructor. Closes the file if it is still open, then releases the
+  * storage that the constructor malloc()ed for the file buffer.
+  */
+ Pathie::ofstream::~ofstream()
+ {
+   // m_buffer_allocated is set by a successful open() and cleared by
+   // close(), so it tells us whether a constructed filebuffer and an
+   // open FILE* are still around. Previously a stream destroyed
+   // without an explicit close() never closed the FILE* obtained from
+   // _wfopen(), so the handle leaked and pending output was not
+   // written out by this class. setstate() is deliberately not used
+   // here so the destructor cannot throw.
+   if (m_buffer_allocated) {
+     mp_filebuffer->close();
+     fclose(mp_file);
+   }
+
+   free(mp_filebuffer);
+ }
+
+ /**
+ * The underlying buffer.
+ */
+ __gnu_cxx::stdio_filebuf<char>* Pathie::ofstream::rdbuf() const
+ {
+ return mp_filebuffer;
+ }
+
+ /**
+ * Checks whether the stream has been open()ed already. This is the only
+ * method safe to use before you called open() on a stream constructed
+ * with the default constructor (apart from open() itself of course).
+ */
+ bool Pathie::ofstream::is_open() const
+ {
+ if (!m_buffer_allocated)
+ return false;
+
+ return mp_filebuffer->is_open();
+ }
+
+ /**
+ * Open the given UTF-8 file path in this stream. You can call this anytime
+ * after you constructed an instance with the default constructor; otherwise,
+ * you have to close() whatever was opened before you call this method.
+ *
+ * \param[in] filename UTF-8 filename to open
+ * \param mode Mode to open the stream in.
+ */
+ void Pathie::ofstream::open(const char* filename, ios_base::openmode mode)
+ {
+ std::wstring w_filename = Pathie::utf8_to_utf16(filename);
+
+ mp_file = _wfopen(w_filename.c_str(),
+ (mode & ios_base::trunc) ? L"w" : L"a");
+
+ if (!mp_file) {
+ setstate(ios_base::failbit);
+ return;
+ }
+
+ /* The following construction uses a “placement new” as it appears
+ * to be the only "clean" solution applicable. The init() method,
+ * an internum of the GCC implementation of basic_ostream that
+ * needs to be called in the stream’s constructor, requires a
+ * pointer to the filebuffer object. However, we do not have that
+ * filebuffer object at hand in the constructor, the
+ * __gnu_cxx::stdio_filebuf instance will be created later when
+ * open() is called. It is impossible to construct it earlier,
+ * because it does not support a delayed open() call, the file
+ * descriptor or FILE* pointer must be passed during its
+ * construction, but we don’t have it there; it is available only
+ * in open() -- remember that you can create the ofstream instance
+ * without being attached to a file and then call open() later
+ * with a filename. To be able to pass something meaningful to
+ * init(), we have to "foresee" where in memory the stdio_filebuf
+ * instance will be created. This only is possible with a
+ * placement new into a place we have allocated previously using
+ * malloc().
+ *
+ * An alternative would be to use internal GCC APIs by duplicating
+ * the sourcecode of the __gnu_cxx::stdio_filebuf constructor; however
+ * undocumented internal APIs are never good to use. For informational
+ * purposes therefore the sourcecode link:
+ *
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a01222_source.html
+ */
+
+ new (mp_filebuffer) __gnu_cxx::stdio_filebuf<char>(mp_file, mode);
+ m_buffer_allocated = true;
+
+ if (!mp_filebuffer->is_open())
+ setstate(ios_base::failbit);
+ else
+ clear();
+ }
+
+ /**
+ * Open the given UTF-8 file path in this stream. You can call this anytime
+ * after you constructed an instance with the default constructor; otherwise,
+ * you have to close() whatever was opened before you call this method.
+ *
+ * \param[in] filename UTF-8 filename to open
+ * \param mode Mode to open the stream in.
+ */
+ void Pathie::ofstream::open(const std::string& filename, ios_base::openmode mode)
+ {
+ open(filename.c_str(), mode);
+ }
+
+ /**
+ * Open the given Pathie::Path in this stream. You can call this anytime
+ * after you constructed an instance with the default constructor; otherwise,
+ * you have to close() whatever was opened before you call this method.
+ *
+ * \param[in] filename Pathie::Path to open the stream for.
+ * \param mode Mode to open the stream in.
+ */
+ void Pathie::ofstream::open(const Pathie::Path& filename, ios_base::openmode mode)
+ {
+ open(filename.str(), mode);
+ }
+
+ /**
+  * Close the underlying file. Has no effect if no file is opened,
+  * which includes calling it again after the file was already closed.
+  * Sets the failbit if closing the file buffer fails.
+  */
+ void Pathie::ofstream::close()
+ {
+   if (mp_file) {
+     if (!mp_filebuffer->close())
+       setstate(ios_base::failbit);
+
+     // Do not deallocate, we may need it later if an open() call follows.
+     memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>));
+     m_buffer_allocated = false;
+     fclose(mp_file);
+
+     // Forget the handle. Without this, a second close() would pass
+     // the check above, call close() on the zeroed buffer and
+     // fclose() the same FILE* again.
+     mp_file = NULL;
+   }
+ }
+
+# else
+# error Unsupported compiler: do not know how to open C++ stream on Unicode file.
+# endif
+#else
+# error Unsupported system.
+#endif
+};
diff --git a/src/3rd_party/pathie-cpp/src/temp.cpp b/src/3rd_party/pathie-cpp/src/temp.cpp
new file mode 100644
index 00000000..ae51bf87
--- /dev/null
+++ b/src/3rd_party/pathie-cpp/src/temp.cpp
@@ -0,0 +1,197 @@
+#include "../include/temp.hpp"
+#include <sstream>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+
+#if defined(_PATHIE_UNIX)
+#include <sys/types.h>
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <Windows.h>
+#else
+#error Unsupported system
+#endif
+
+using namespace Pathie;
+
+static std::string generate_random_filename(const std::string& namepart)
+{
+ std::stringstream name;
+ name << namepart << "-" << time(NULL) << rand();
+
+#if defined(_PATHIE_UNIX)
+ name << getpid();
+#elif defined(_WIN32)
+ name << GetCurrentProcessId();
+#else
+#error Unsupported system
+#endif
+
+ return name.str();
+}
+
+/**
+ * TempEntry is not meant to be instantiated on its own. This
+ * constructor does the common work between the Tempdir and Tempfile
+ * classes, namely it generates a temporary filename that is not
+ * currently in use.
+ *
+ * \param namepart
+ * A string that will be included verbatim into the basename
+ * of the created directory.
+ *
+ * \remark The generated path name is of form
+ * `<namepart>-<currenttime><random><pid>`. However, future releases
+ * may change this format, so do not rely on it.
+ */
+TempEntry::TempEntry(std::string namepart)
+ : m_keep(false)
+{
+ do {
+ m_path = Path::temp_dir() / generate_random_filename(namepart);
+ } while (m_path.exists());
+}
+
+/**
+ * Destructor.
+ */
+TempEntry::~TempEntry()
+{
+ //
+}
+
+/**
+ * Returns the absolute path to the temporary entry
+ * that was created by the constructor.
+ */
+Path TempEntry::path() const
+{
+ return m_path;
+}
+
+/**
+ * Call this function if you do not want the destructor to delete
+ * the created temporary entry. You can still expressly delete
+ * the temporary entry by calling remove().
+ *
+ * \param k
+ * If true (default), the destructor will not delete the temporary entry.
+ * If false, the destructor will delete the temporary entry.
+ */
+void TempEntry::keep(bool k)
+{
+ m_keep = k;
+}
+
+/**
+ * Returns the keep status; see keep().
+ */
+bool TempEntry::is_kept() const
+{
+ return m_keep;
+}
+
+/**
+ * Constructs an instance of this class. A temporary directory
+ * is created that will be recursively removed when the object
+ * is deleted.
+ *
+ * \param namepart
+ * A string that will be included verbatim into the basename
+ * of the created directory.
+ *
+ * \returns The newly created instance.
+ *
+ * \remark There is a small timespan between the generation of the
+ * temporary path name and the creation of the directory in which it
+ * is theoretically possible for another process to create an entry
+ * that conflicts with the generated name. However, since the
+ * generated name includes a random number, the process identifier,
+ * and the number of seconds since epoch as well as the given
+ * `namepart`, the chance of an accidental collision is very low.
+ * Even a malicious attacker would have to guess the random number, so
+ * if your `srand()` seed is chosen properly and your C standard
+ * library is properly implemented, this risk is again very low.
+ */
+Tempdir::Tempdir(std::string namepart)
+ : TempEntry(namepart)
+{
+ m_path.mktree();
+}
+
+/**
+ * Destructor, removes the temporary entry unless keep() has been called.
+ * Does nothing if the temporary directory does not exist anymore for whatever
+ * reason.
+ */
+Tempdir::~Tempdir()
+{
+ if (!m_keep)
+ remove();
+}
+
+/**
+ * Recursively removes the temporary directory. This method
+ * ignores what was set with keep(), i.e., it *always* deletes
+ * the temporary directory if you call it. This method does
+ * nothing if the directory does not exist anymore for whatever
+ * reason.
+ */
+void Tempdir::remove() const
+{
+ if (m_path.exists())
+ m_path.rmtree();
+}
+
+/**
+ * Constructs an instance of this class. A temporary file
+ * is created that will be removed when the object
+ * is deleted.
+ *
+ * \param namepart
+ * A string that will be included verbatim into the basename
+ * of the created filename.
+ *
+ * \returns The newly created instance.
+ *
+ * \remark There is a small timespan between the generation of the
+ * temporary path name and the creation of the file in which it
+ * is theoretically possible for another process to create an entry
+ * that conflicts with the generated name. However, since the
+ * generated name includes a random number, the process identifier,
+ * and the number of seconds since epoch as well as the given
+ * `namepart`, the chance of an accidental collision is very low.
+ * Even a malicious attacker would have to guess the random number, so
+ * if your `srand()` seed is chosen properly and your C standard
+ * library is properly implemented, this risk is again very low.
+ */
+Tempfile::Tempfile(std::string namepart)
+ : TempEntry(namepart)
+{
+ m_path.touch();
+}
+
+/**
+ * Destructor, removes the temporary file unless keep() has been called.
+ * Does nothing if the temporary file does not exist anymore for whatever
+ * reason.
+ */
+Tempfile::~Tempfile()
+{
+ if (!m_keep)
+ remove();
+}
+
+/**
+ * Removes the temporary file. This method
+ * ignores what was set with keep(), i.e., it *always* deletes
+ * the temporary file if you call it. This method does nothing
+ * if the file does not exist anymore for whatever
+ * reason.
+ */
+void Tempfile::remove() const
+{
+ if (m_path.exists())
+ m_path.unlink();
+}
diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
new file mode 160000
+Subproject 1a38d26a13cc67b1aae641d4983b624bef6d530
diff --git a/src/3rd_party/zstr/LICENSE b/src/3rd_party/zstr/LICENSE
new file mode 100644
index 00000000..841c7214
--- /dev/null
+++ b/src/3rd_party/zstr/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/3rd_party/zstr/README.org b/src/3rd_party/zstr/README.org
new file mode 100644
index 00000000..bc0dd3e5
--- /dev/null
+++ b/src/3rd_party/zstr/README.org
@@ -0,0 +1,54 @@
+# -*- mode:org; mode:visual-line; coding:utf-8; -*-
+
+** A C++ ZLib wrapper
+
+[[http://travis-ci.org/mateidavid/zstr][http://travis-ci.org/mateidavid/zstr.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]]
+
+This C++ header-only library enables the use of C++ standard iostreams to access ZLib-compressed streams.
+
+For input access (decompression), the compression format is auto-detected, and multiple concatenated compressed streams are decompressed seamlessly.
+
+For output access (compression), the only parameter exposed by this API is the compression level.
+
+Alternatives to this library include:
+
+- The original [[http://www.zlib.net/][ZLib]], through its [[http://www.zlib.net/manual.html][C API]]. This does not interact nicely with C++ iostreams.
+
+- The [[http://www.cs.unc.edu/Research/compgeom/gzstream/][GZStream]] library. This library does not auto-detect input compression, and it cannot wrap streams (only files).
+
+- The [[http://www.boost.org/doc/libs/release/libs/iostreams/][Boost IOStreams]] library. The library does not auto-detect input compression (by default, though that can be easily implemented with filters), and more importantly, it is not a header-only Boost library.
+
+For an example usage, see [[examples/ztxtpipe.cpp]] and [[examples/zc.cpp]].
+
+**** Input Auto-detection
+
+For input access, the library seamlessly auto-detects whether the source stream is compressed or not. The following compressed streams are detected:
+
+- GZip header, when stream starts with =1F 8B=. See [[http://en.wikipedia.org/wiki/Gzip][GZip format]].
+
+- ZLib header, when stream starts with =78 01=, =78 9C=, and =78 DA=. See [[http://stackoverflow.com/a/17176881][answer here]].
+
+If none of these formats are detected, the library assumes the input is not compressed, and it produces a plain copy of the source stream.
+
+**** Classes
+
+The package provides 6 classes for accessing ZLib streams:
+
+- =zstr::istreambuf= is the core decompression class. This is constructed from an existing =std::streambuf= that contains source data. The =zstr::istreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the auto-detection option (default: on). ZLib errors cause exceptions to be thrown.
+
+- =zstr::ostreambuf= is the core compression class. This is constructed from an existing =std::streambuf= that contains sink data. The =zstr::ostreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the compression option (default: ZLib default). ZLib errors cause exceptions to be thrown.
+
+- =zstr::istream= is a wrapper for a =zstr::istreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::istream= (such as =std::cin=) or =std::streambuf=.
+
+- =zstr::ostream= is a wrapper for a =zstr::ostreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::ostream= (such as =std::cout=) or =std::streambuf=.
+
+- =zstr::ifstream= is a wrapper for a =zstr::istreambuf= that accesses an /internal/ =std::ifstream=. This can be used to open a file and read decompressed data from it.
+
+- =zstr::ofstream= is a wrapper for a =zstr::ostreambuf= that accesses an /internal/ =std::ofstream=. This can be used to open a file and write compressed data to it.
+
+For all stream objects, the =badbit= of their exception mask is turned on in order to propagate exceptions.
+
+**** License
+
+Released under the [[file:LICENSE][MIT license]].
+
diff --git a/src/3rd_party/zstr/strict_fstream.hpp b/src/3rd_party/zstr/strict_fstream.hpp
new file mode 100644
index 00000000..21173c73
--- /dev/null
+++ b/src/3rd_party/zstr/strict_fstream.hpp
@@ -0,0 +1,202 @@
+#ifndef __STRICT_FSTREAM_HPP
+#define __STRICT_FSTREAM_HPP
+
+#include <cassert>
+#include <fstream>
+#include <cstring>
+#include <string>
+
+/**
+ * This namespace defines wrappers for std::ifstream, std::ofstream, and
+ * std::fstream objects. The wrappers perform the following steps:
+ * - check the open modes make sense
+ * - check that the call to open() is successful
+ * - (for input streams) check that the opened file is peek-able
+ * - turn on the badbit in the exception mask
+ */
+namespace strict_fstream
+{
+
+/// Return the message associated with the current value of `errno`.
+///
+/// Overload of the error-reporting function, to enable use with VS: depending on
+/// the platform macros it calls strerror_s, the XSI strerror_r or the GNU strerror_r.
+/// Ref: http://stackoverflow.com/a/901316/717706
+///
+/// @return the error text cut at the first NUL character; "Unknown error" when
+///         the strerror_s / XSI strerror_r call reports a failure.
+static std::string strerror()
+{
+    std::string buff(80, '\0');
+#ifdef _WIN32
+    if (strerror_s(&buff[0], buff.size(), errno) != 0)
+    {
+        buff = "Unknown error";
+    }
+#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
+// XSI-compliant strerror_r()
+    if (strerror_r(errno, &buff[0], buff.size()) != 0)
+    {
+        buff = "Unknown error";
+    }
+#else
+// GNU-specific strerror_r()
+    auto p = strerror_r(errno, &buff[0], buff.size());
+    std::string tmp(p, std::strlen(p));
+    std::swap(buff, tmp);
+#endif
+    // Trim at the first NUL. After the GNU branch or the "Unknown error" fallback
+    // buff holds no embedded NUL, so find('\0') would return npos and
+    // resize(npos) would throw std::length_error. Measuring through c_str()
+    // yields the full length in that case instead.
+    buff.resize(std::strlen(buff.c_str()));
+    return buff;
+}
+
+/// Exception type thrown when one of the strict_fstream operations fails.
+/// The text passed at construction is what what() returns.
+class Exception
+    : public std::exception
+{
+private:
+    std::string _msg;
+public:
+    Exception(const std::string& msg)
+        : _msg(msg)
+    {}
+    const char * what() const noexcept
+    {
+        return _msg.c_str();
+    }
+}; // class Exception
+
+namespace detail
+{
+
+/// Stateless helpers used by the strict stream wrappers to validate open modes
+/// and to verify the outcome of open() and peek().
+struct static_method_holder
+{
+    /// Render an openmode mask as text by joining the names of the set flags
+    /// with '|' (e.g. "in|binary"); returns "none" if no known flag is set.
+    static std::string mode_to_string(std::ios_base::openmode mode)
+    {
+        static const int n_modes = 6;
+        static const std::ios_base::openmode mode_val_v[n_modes] =
+        {
+            std::ios_base::in,
+            std::ios_base::out,
+            std::ios_base::app,
+            std::ios_base::ate,
+            std::ios_base::trunc,
+            std::ios_base::binary
+        };
+
+        static const char * mode_name_v[n_modes] =
+        {
+            "in",
+            "out",
+            "app",
+            "ate",
+            "trunc",
+            "binary"
+        };
+        std::string res;
+        for (int i = 0; i < n_modes; ++i)
+        {
+            if (mode & mode_val_v[i])
+            {
+                res += (! res.empty()? "|" : "");
+                res += mode_name_v[i];
+            }
+        }
+        if (res.empty()) res = "none";
+        return res;
+    }
+    /// Reject contradictory open modes.
+    /// @throws Exception for trunc without out, app without out, or trunc together with app.
+    static void check_mode(const std::string& filename, std::ios_base::openmode mode)
+    {
+        if ((mode & std::ios_base::trunc) && ! (mode & std::ios_base::out))
+        {
+            throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and not out");
+        }
+        else if ((mode & std::ios_base::app) && ! (mode & std::ios_base::out))
+        {
+            throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: app and not out");
+        }
+        else if ((mode & std::ios_base::trunc) && (mode & std::ios_base::app))
+        {
+            throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and app");
+        }
+    }
+    /// Verify that the preceding open() succeeded.
+    /// @throws Exception (message includes the mode and the strerror() text) if s_p->fail().
+    static void check_open(std::ios * s_p, const std::string& filename, std::ios_base::openmode mode)
+    {
+        if (s_p->fail())
+        {
+            throw Exception(std::string("strict_fstream: open('")
+                            + filename + "'," + mode_to_string(mode) + "): open failed: "
+                            + strerror());
+        }
+    }
+    /// Verify that the freshly opened input stream can be peeked, then clear its state flags.
+    /// @throws Exception if peek() leaves the stream failed or raises std::ios_base::failure.
+    static void check_peek(std::istream * is_p, const std::string& filename, std::ios_base::openmode mode)
+    {
+        // stays true if peek() throws before the flag can be updated
+        bool peek_failed = true;
+        try
+        {
+            is_p->peek();
+            peek_failed = is_p->fail();
+        }
+        // caught by const reference: avoids copying the exception object
+        catch (const std::ios_base::failure &) {}
+        if (peek_failed)
+        {
+            throw Exception(std::string("strict_fstream: open('")
+                            + filename + "'," + mode_to_string(mode) + "): peek failed: "
+                            + strerror());
+        }
+        is_p->clear();
+    }
+}; // struct static_method_holder
+
+} // namespace detail
+
+/// std::ifstream wrapper whose open() validates the mode, the result of the
+/// underlying open() and that the file can be peeked; the checks throw Exception.
+class ifstream
+ : public std::ifstream
+{
+public:
+ ifstream() = default;
+ /// Construct and immediately open `filename`; see open().
+ ifstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+ {
+ open(filename, mode);
+ }
+ /// Open `filename` for reading. std::ios_base::in is always added to `mode`
+ /// and badbit is turned on in the exception mask before the file is opened.
+ void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+ {
+ mode |= std::ios_base::in;
+ exceptions(std::ios_base::badbit);
+ detail::static_method_holder::check_mode(filename, mode);
+ std::ifstream::open(filename, mode);
+ detail::static_method_holder::check_open(this, filename, mode);
+ detail::static_method_holder::check_peek(this, filename, mode);
+ }
+}; // class ifstream
+
+/// std::ofstream wrapper whose open() validates the mode and the result of the
+/// underlying open(); the checks throw Exception.
+class ofstream
+ : public std::ofstream
+{
+public:
+ ofstream() = default;
+ /// Construct and immediately open `filename`; see open().
+ ofstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out)
+ {
+ open(filename, mode);
+ }
+ /// Open `filename` for writing. std::ios_base::out is always added to `mode`
+ /// and badbit is turned on in the exception mask before the file is opened.
+ void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out)
+ {
+ mode |= std::ios_base::out;
+ exceptions(std::ios_base::badbit);
+ detail::static_method_holder::check_mode(filename, mode);
+ std::ofstream::open(filename, mode);
+ detail::static_method_holder::check_open(this, filename, mode);
+ }
+}; // class ofstream
+
+/// std::fstream wrapper whose open() validates the mode, the result of the
+/// underlying open() and that the file can be peeked; the checks throw Exception.
+class fstream
+ : public std::fstream
+{
+public:
+ fstream() = default;
+ /// Construct and immediately open `filename`; see open().
+ fstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+ {
+ open(filename, mode);
+ }
+ /// Open `filename`. std::ios_base::in is added only when `mode` does not
+ /// request output; badbit is turned on in the exception mask before opening.
+ void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+ {
+ if (! (mode & std::ios_base::out)) mode |= std::ios_base::in;
+ exceptions(std::ios_base::badbit);
+ detail::static_method_holder::check_mode(filename, mode);
+ std::fstream::open(filename, mode);
+ detail::static_method_holder::check_open(this, filename, mode);
+ // NOTE(review): the peek check also runs when only `out` was requested - confirm this is intended
+ detail::static_method_holder::check_peek(this, filename, mode);
+ }
+}; // class fstream
+
+} // namespace strict_fstream
+
+#endif
diff --git a/src/3rd_party/zstr/zstr.hpp b/src/3rd_party/zstr/zstr.hpp
new file mode 100644
index 00000000..6b633728
--- /dev/null
+++ b/src/3rd_party/zstr/zstr.hpp
@@ -0,0 +1,411 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Matei David (matei@cs.toronto.edu)
+//---------------------------------------------------------
+
+// Reference:
+// http://stackoverflow.com/questions/14086417/how-to-write-custom-input-stream-in-c
+
+#ifndef __ZSTR_HPP
+#define __ZSTR_HPP
+
+#include <cassert>
+#include <fstream>
+#include <sstream>
+#include <zlib.h>
+#include "strict_fstream.hpp"
+
+namespace zstr
+{
+
+/// Exception class thrown by failed zlib operations.
+class Exception
+    : public std::exception
+{
+public:
+    /// Build a message of the form "zlib: <CODE>: <zlib text>".
+    /// @param zstrm_p  stream whose `msg` field supplies the zlib error text (may be null)
+    /// @param ret      zlib return code; codes without a named case are printed as "[<ret>]"
+    Exception(z_stream * zstrm_p, int ret)
+        : _msg("zlib: ")
+    {
+        switch (ret)
+        {
+        case Z_STREAM_ERROR:
+            _msg += "Z_STREAM_ERROR: ";
+            break;
+        case Z_DATA_ERROR:
+            _msg += "Z_DATA_ERROR: ";
+            break;
+        case Z_MEM_ERROR:
+            _msg += "Z_MEM_ERROR: ";
+            break;
+        case Z_VERSION_ERROR:
+            _msg += "Z_VERSION_ERROR: ";
+            break;
+        case Z_BUF_ERROR:
+            _msg += "Z_BUF_ERROR: ";
+            break;
+        default:
+            std::ostringstream oss;
+            oss << ret;
+            _msg += "[" + oss.str() + "]: ";
+            break;
+        }
+        // msg can be a null pointer when zlib has no text to report; appending a
+        // null const char* to std::string is undefined behaviour, so guard it.
+        if (zstrm_p->msg) _msg += zstrm_p->msg;
+    }
+    /// Build an exception carrying an arbitrary message.
+    Exception(const std::string & msg) : _msg(msg) {}
+    const char * what() const noexcept { return _msg.c_str(); }
+private:
+    std::string _msg;
+}; // class Exception
+
+namespace detail
+{
+
+/// z_stream whose constructor runs inflateInit2 (input) or deflateInit2 (output)
+/// and whose destructor runs the matching inflateEnd / deflateEnd.
+/// A non-Z_OK result from initialisation is reported by throwing Exception.
+class z_stream_wrapper
+    : public z_stream
+{
+public:
+    z_stream_wrapper(bool _is_input = true, int _level = Z_DEFAULT_COMPRESSION)
+        : is_input(_is_input)
+    {
+        this->zalloc = Z_NULL;
+        this->zfree = Z_NULL;
+        this->opaque = Z_NULL;
+        int status;
+        if (! is_input)
+        {
+            // compression side
+            status = deflateInit2(this, _level, Z_DEFLATED, 15+16, 8, Z_DEFAULT_STRATEGY);
+        }
+        else
+        {
+            // decompression side: no input is available yet
+            this->avail_in = 0;
+            this->next_in = Z_NULL;
+            status = inflateInit2(this, 15+32);
+        }
+        if (status != Z_OK) throw Exception(this, status);
+    }
+    ~z_stream_wrapper()
+    {
+        // release with the call that pairs with the init used above
+        if (is_input) inflateEnd(this);
+        else deflateEnd(this);
+    }
+private:
+    bool is_input;
+}; // class z_stream_wrapper
+
+} // namespace detail
+
+/// Stream buffer that reads raw bytes from another std::streambuf and exposes
+/// them through its get area, either inflated with zlib or, when auto-detection
+/// finds no gzip/zlib header, copied through unchanged.
+class istreambuf
+ : public std::streambuf
+{
+public:
+ /// @param _sbuf_p      source buffer to read from (asserted non-null)
+ /// @param _buff_size   size of each of the two internal buffers
+ /// @param _auto_detect when true, inspect the first bytes to decide between
+ ///                     compressed and plain input
+ istreambuf(std::streambuf * _sbuf_p,
+ std::size_t _buff_size = default_buff_size, bool _auto_detect = true)
+ : sbuf_p(_sbuf_p),
+ zstrm_p(nullptr),
+ buff_size(_buff_size),
+ auto_detect(_auto_detect),
+ auto_detect_run(false),
+ is_text(false)
+ {
+ assert(sbuf_p);
+ in_buff = new char [buff_size];
+ in_buff_start = in_buff;
+ in_buff_end = in_buff;
+ out_buff = new char [buff_size];
+ // empty get area: the first read triggers underflow()
+ setg(out_buff, out_buff, out_buff);
+ }
+
+ // NOTE(review): the buffers are raw pointers deleted in the destructor; a
+ // defaulted move would leave source and target holding the same pointers -
+ // confirm these objects are never actually moved.
+ istreambuf(const istreambuf &) = delete;
+ istreambuf(istreambuf &&) = default;
+ istreambuf & operator = (const istreambuf &) = delete;
+ istreambuf & operator = (istreambuf &&) = default;
+
+ virtual ~istreambuf()
+ {
+ delete [] in_buff;
+ delete [] out_buff;
+ if (zstrm_p) delete zstrm_p;
+ }
+
+ /// Refill the get area when it is exhausted.
+ /// @return the next available character, or traits_type::eof() when the
+ ///         source is exhausted and no output was produced.
+ /// Throws Exception if inflate() returns anything other than Z_OK or Z_STREAM_END.
+ virtual std::streambuf::int_type underflow()
+ {
+ if (this->gptr() == this->egptr())
+ {
+ // pointers for free region in output buffer
+ char * out_buff_free_start = out_buff;
+ do
+ {
+ // read more input if none available
+ if (in_buff_start == in_buff_end)
+ {
+ // empty input buffer: refill from the start
+ in_buff_start = in_buff;
+ std::streamsize sz = sbuf_p->sgetn(in_buff, buff_size);
+ in_buff_end = in_buff + sz;
+ if (in_buff_end == in_buff_start) break; // end of input
+ }
+ // auto detect if the stream contains text or deflate data
+ if (auto_detect && ! auto_detect_run)
+ {
+ auto_detect_run = true;
+ // both bytes are read before the length check below; b1 only
+ // influences the result when at least 2 bytes are available
+ unsigned char b0 = *reinterpret_cast< unsigned char * >(in_buff_start);
+ unsigned char b1 = *reinterpret_cast< unsigned char * >(in_buff_start + 1);
+ // Ref:
+ // http://en.wikipedia.org/wiki/Gzip
+ // http://stackoverflow.com/questions/9050260/what-does-a-zlib-header-look-like
+ is_text = ! (in_buff_start + 2 <= in_buff_end
+ && ((b0 == 0x1F && b1 == 0x8B) // gzip header
+ || (b0 == 0x78 && (b1 == 0x01 // zlib header
+ || b1 == 0x9C
+ || b1 == 0xDA))));
+ }
+ if (is_text)
+ {
+ // simply swap in_buff and out_buff, and adjust pointers
+ assert(in_buff_start == in_buff);
+ std::swap(in_buff, out_buff);
+ // after the swap out_buff holds the bytes just read; in_buff_end
+ // still points one past them
+ out_buff_free_start = in_buff_end;
+ in_buff_start = in_buff;
+ in_buff_end = in_buff;
+ }
+ else
+ {
+ // run inflate() on input
+ if (! zstrm_p) zstrm_p = new detail::z_stream_wrapper(true);
+ zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(in_buff_start);
+ zstrm_p->avail_in = in_buff_end - in_buff_start;
+ zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff_free_start);
+ zstrm_p->avail_out = (out_buff + buff_size) - out_buff_free_start;
+ int ret = inflate(zstrm_p, Z_NO_FLUSH);
+ // process return code
+ if (ret != Z_OK && ret != Z_STREAM_END) throw Exception(zstrm_p, ret);
+ // update in&out pointers following inflate()
+ in_buff_start = reinterpret_cast< decltype(in_buff_start) >(zstrm_p->next_in);
+ in_buff_end = in_buff_start + zstrm_p->avail_in;
+ out_buff_free_start = reinterpret_cast< decltype(out_buff_free_start) >(zstrm_p->next_out);
+ assert(out_buff_free_start + zstrm_p->avail_out == out_buff + buff_size);
+ // if stream ended, deallocate inflator
+ // (a new one is created on the next pass if more input follows)
+ if (ret == Z_STREAM_END)
+ {
+ delete zstrm_p;
+ zstrm_p = nullptr;
+ }
+ }
+ } while (out_buff_free_start == out_buff);
+ // 2 exit conditions:
+ // - end of input: there might or might not be output available
+ // - out_buff_free_start != out_buff: output available
+ this->setg(out_buff, out_buff, out_buff_free_start);
+ }
+ return this->gptr() == this->egptr()
+ ? traits_type::eof()
+ : traits_type::to_int_type(*this->gptr());
+ }
+private:
+ std::streambuf * sbuf_p; // source of raw bytes; not deleted by this class
+ char * in_buff; // raw input buffer (owned)
+ char * in_buff_start; // start of unconsumed input inside in_buff
+ char * in_buff_end; // one past the unconsumed input inside in_buff
+ char * out_buff; // buffer backing the get area (owned)
+ detail::z_stream_wrapper * zstrm_p; // inflater, created on demand (owned)
+ std::size_t buff_size; // size of in_buff and of out_buff
+ bool auto_detect; // whether header detection is enabled
+ bool auto_detect_run; // whether header detection has already run
+ bool is_text; // detection result: input is passed through unchanged
+
+ static const std::size_t default_buff_size = (std::size_t)1 << 20;
+}; // class istreambuf
+
+/// Stream buffer that collects written bytes in its put area, compresses them
+/// with zlib deflate and forwards the compressed bytes to another std::streambuf.
+class ostreambuf
+ : public std::streambuf
+{
+public:
+ /// @param _sbuf_p    sink buffer receiving compressed data (asserted non-null)
+ /// @param _buff_size size of each of the two internal buffers
+ /// @param _level     compression level handed to the deflater
+ ostreambuf(std::streambuf * _sbuf_p,
+ std::size_t _buff_size = default_buff_size, int _level = Z_DEFAULT_COMPRESSION)
+ : sbuf_p(_sbuf_p),
+ zstrm_p(new detail::z_stream_wrapper(false, _level)),
+ buff_size(_buff_size)
+ {
+ assert(sbuf_p);
+ in_buff = new char [buff_size];
+ out_buff = new char [buff_size];
+ setp(in_buff, in_buff + buff_size);
+ }
+
+ // NOTE(review): the buffers are raw pointers deleted in the destructor; a
+ // defaulted move would leave source and target holding the same pointers -
+ // confirm these objects are never actually moved.
+ ostreambuf(const ostreambuf &) = delete;
+ ostreambuf(ostreambuf &&) = default;
+ ostreambuf & operator = (const ostreambuf &) = delete;
+ ostreambuf & operator = (ostreambuf &&) = default;
+
+ /// Call deflate() with the given flush mode repeatedly, writing each chunk
+ /// of compressed output to the sink.
+ /// @return 0 on success, -1 if the sink accepted fewer bytes than were offered.
+ /// Throws Exception for zlib results other than Z_OK, Z_STREAM_END, Z_BUF_ERROR.
+ int deflate_loop(int flush)
+ {
+ while (true)
+ {
+ zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff);
+ zstrm_p->avail_out = buff_size;
+ int ret = deflate(zstrm_p, flush);
+ if (ret != Z_OK && ret != Z_STREAM_END && ret != Z_BUF_ERROR) throw Exception(zstrm_p, ret);
+ std::streamsize sz = sbuf_p->sputn(out_buff, reinterpret_cast< decltype(out_buff) >(zstrm_p->next_out) - out_buff);
+ if (sz != reinterpret_cast< decltype(out_buff) >(zstrm_p->next_out) - out_buff)
+ {
+ // there was an error in the sink stream
+ return -1;
+ }
+ // stop once the stream is finished, deflate could make no progress,
+ // or this pass produced no output
+ if (ret == Z_STREAM_END || ret == Z_BUF_ERROR || sz == 0)
+ {
+ break;
+ }
+ }
+ return 0;
+ }
+
+ virtual ~ostreambuf()
+ {
+ // flush the zlib stream
+ //
+ // NOTE: Errors here (sync() return value not 0) are ignored, because we
+ // cannot throw in a destructor. This mirrors the behaviour of
+ // std::basic_filebuf::~basic_filebuf(). To see an exception on error,
+ // close the ofstream with an explicit call to close(), and do not rely
+ // on the implicit call in the destructor.
+ //
+ // NOTE(review): deflate_loop() can still throw Exception from inside
+ // sync(), which would escape this destructor - confirm this is acceptable.
+ sync();
+ delete [] in_buff;
+ delete [] out_buff;
+ delete zstrm_p;
+ }
+ /// Compress everything currently in the put area, reset the put area and
+ /// then store `c` unless it is eof.
+ /// @return eof if the sink failed (the put area is then set to null) or if
+ ///         `c` is eof; otherwise the result of sputc(c).
+ virtual std::streambuf::int_type overflow(std::streambuf::int_type c = traits_type::eof())
+ {
+ zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(pbase());
+ zstrm_p->avail_in = pptr() - pbase();
+ while (zstrm_p->avail_in > 0)
+ {
+ int r = deflate_loop(Z_NO_FLUSH);
+ if (r != 0)
+ {
+ // a null put area marks the failed state; sync() tests pptr() for it
+ setp(nullptr, nullptr);
+ return traits_type::eof();
+ }
+ }
+ setp(in_buff, in_buff + buff_size);
+ return traits_type::eq_int_type(c, traits_type::eof()) ? traits_type::eof() : sputc(c);
+ }
+ /// Compress pending input, finish the current zlib stream and reset the
+ /// deflater so later writes start a new stream.
+ /// @return 0 on success, -1 if the sink failed.
+ virtual int sync()
+ {
+ // first, call overflow to clear in_buff
+ overflow();
+ if (! pptr()) return -1;
+ // then, call deflate asking to finish the zlib stream
+ zstrm_p->next_in = nullptr;
+ zstrm_p->avail_in = 0;
+ if (deflate_loop(Z_FINISH) != 0) return -1;
+ deflateReset(zstrm_p);
+ return 0;
+ }
+private:
+ std::streambuf * sbuf_p; // sink for compressed bytes; not deleted by this class
+ char * in_buff; // buffer backing the put area (owned)
+ char * out_buff; // scratch buffer for deflate output (owned)
+ detail::z_stream_wrapper * zstrm_p; // deflater (owned)
+ std::size_t buff_size; // size of in_buff and of out_buff
+
+ static const std::size_t default_buff_size = (std::size_t)1 << 20;
+}; // class ostreambuf
+
+/// std::istream reading through a heap-allocated istreambuf layered over an
+/// external stream buffer. The istreambuf is created in the constructor and
+/// deleted in the destructor; badbit is turned on in the exception mask.
+class istream
+    : public std::istream
+{
+public:
+    istream(std::istream & is)
+        : std::istream(new istreambuf(is.rdbuf()))
+    {
+        this->exceptions(std::ios_base::badbit);
+    }
+    explicit istream(std::streambuf * sbuf_p)
+        : std::istream(new istreambuf(sbuf_p))
+    {
+        this->exceptions(std::ios_base::badbit);
+    }
+    virtual ~istream()
+    {
+        // the stream buffer was allocated by one of the constructors above
+        std::streambuf * owned_buf = this->rdbuf();
+        delete owned_buf;
+    }
+}; // class istream
+
+/// std::ostream writing through a heap-allocated ostreambuf layered over an
+/// external stream buffer. The ostreambuf is created in the constructor and
+/// deleted in the destructor; badbit is turned on in the exception mask.
+class ostream
+    : public std::ostream
+{
+public:
+    ostream(std::ostream & os)
+        : std::ostream(new ostreambuf(os.rdbuf()))
+    {
+        this->exceptions(std::ios_base::badbit);
+    }
+    explicit ostream(std::streambuf * sbuf_p)
+        : std::ostream(new ostreambuf(sbuf_p))
+    {
+        this->exceptions(std::ios_base::badbit);
+    }
+    virtual ~ostream()
+    {
+        // the stream buffer was allocated by one of the constructors above
+        std::streambuf * owned_buf = this->rdbuf();
+        delete owned_buf;
+    }
+}; // class ostream
+
+namespace detail
+{
+
+/// Holds a file stream of type FStream_Type as a member. The zstr file stream
+/// classes inherit from it ahead of their std stream base, so that _fs exists
+/// before its rdbuf() is handed to that base.
+template < typename FStream_Type >
+struct strict_fstream_holder
+{
+ /// Construct the held stream from `filename` and `mode`.
+ strict_fstream_holder(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+ : _fs(filename, mode)
+ {}
+ FStream_Type _fs;
+}; // class strict_fstream_holder
+
+} // namespace detail
+
+/// Input file stream that opens `filename` with strict_fstream::ifstream and
+/// reads it through a heap-allocated istreambuf, deleted in the destructor.
+class ifstream
+ : private detail::strict_fstream_holder< strict_fstream::ifstream >,
+ public std::istream
+{
+public:
+ /// The holder base is initialised first, so _fs is already open when its
+ /// rdbuf() is passed to the new istreambuf.
+ explicit ifstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+ : detail::strict_fstream_holder< strict_fstream::ifstream >(filename, mode),
+ std::istream(new istreambuf(_fs.rdbuf()))
+ {
+ exceptions(std::ios_base::badbit);
+ }
+ virtual ~ifstream()
+ {
+ // frees the istreambuf allocated in the constructor
+ if (rdbuf()) delete rdbuf();
+ }
+}; // class ifstream
+
+/// Output file stream that opens `filename` with strict_fstream::ofstream and
+/// writes to it through a heap-allocated ostreambuf, deleted in the destructor.
+class ofstream
+ : private detail::strict_fstream_holder< strict_fstream::ofstream >,
+ public std::ostream
+{
+public:
+ /// The holder base is initialised first, so _fs is already open when its
+ /// rdbuf() is passed to the new ostreambuf. std::ios_base::binary is always
+ /// added to `mode`.
+ explicit ofstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out)
+ : detail::strict_fstream_holder< strict_fstream::ofstream >(filename, mode | std::ios_base::binary),
+ std::ostream(new ostreambuf(_fs.rdbuf()))
+ {
+ exceptions(std::ios_base::badbit);
+ }
+ virtual ~ofstream()
+ {
+ // frees the ostreambuf allocated in the constructor; its destructor calls sync()
+ if (rdbuf()) delete rdbuf();
+ }
+}; // class ofstream
+
+} // namespace zstr
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 913ab17d..09864161 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -83,7 +83,9 @@ add_library(marian STATIC
$<TARGET_OBJECTS:libyaml-cpp>
$<TARGET_OBJECTS:SQLiteCpp>
+ $<TARGET_OBJECTS:pathie-cpp>
)
+target_compile_options(marian PUBLIC ${ALL_WARNINGS})
# Generate git_revision.h to reflect current git revision information
# [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake]
@@ -110,6 +112,8 @@ cuda_add_library(marian_cuda
training/gradient_dropping/gpu/dropper.cu
training/gradient_dropping/gpu/sparse_algorithm.cu
STATIC)
+
+ target_compile_options(marian_cuda PUBLIC ${ALL_WARNINGS})
endif(CUDA_FOUND)
set_target_properties(marian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
@@ -117,18 +121,23 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY
add_executable(marian_train command/marian_main.cpp)
set_target_properties(marian_train PROPERTIES OUTPUT_NAME marian)
+target_compile_options(marian_train PUBLIC ${ALL_WARNINGS})
add_executable(marian_decoder command/marian_decoder.cpp)
set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder)
+target_compile_options(marian_decoder PUBLIC ${ALL_WARNINGS})
add_executable(marian_scorer command/marian_scorer.cpp)
set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer)
+target_compile_options(marian_scorer PUBLIC ${ALL_WARNINGS})
add_executable(marian_vocab command/marian_vocab.cpp)
set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab)
+target_compile_options(marian_vocab PUBLIC ${ALL_WARNINGS})
add_executable(marian_conv command/marian_conv.cpp)
set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv)
+target_compile_options(marian_conv PUBLIC ${ALL_WARNINGS})
set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv)
@@ -165,6 +174,7 @@ endif()
if(COMPILE_SERVER)
add_executable(marian_server command/marian_server.cpp)
set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server)
+ target_compile_options(marian_server PUBLIC ${ALL_WARNINGS})
set(EXECUTABLES ${EXECUTABLES} marian_server)
endif(COMPILE_SERVER)
diff --git a/src/command/marian_vocab.cpp b/src/command/marian_vocab.cpp
index d53dc5f0..de8ef3c7 100755
--- a/src/command/marian_vocab.cpp
+++ b/src/command/marian_vocab.cpp
@@ -25,9 +25,7 @@ int main(int argc, char** argv) {
LOG(info, "Creating vocabulary...");
auto vocab = New<Vocab>(options, 0);
- io::InputFileStream corpusStrm(std::cin);
- io::OutputFileStream vocabStrm(std::cout);
- vocab->create(corpusStrm, vocabStrm, options->get<size_t>("max-size"));
+ vocab->create("stdout", "stdin", options->get<size_t>("max-size"));
LOG(info, "Finished");
diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp
index 0e230c04..28826bb2 100755
--- a/src/common/cli_wrapper.cpp
+++ b/src/common/cli_wrapper.cpp
@@ -1,6 +1,8 @@
#include "common/cli_wrapper.h"
+#include "common/cli_helper.h"
#include "common/logging.h"
#include "common/options.h"
+#include "common/timer.h"
#include "common/version.h"
namespace marian {
@@ -85,8 +87,7 @@ CLIWrapper::CLIWrapper(YAML::Node &config,
app_->formatter(fmt);
// add --version option
- optVersion_
- = app_->add_flag("--version", "Print the version number and exit");
+ optVersion_ = app_->add_flag("--version", "Print the version number and exit");
optVersion_->group(defaultGroup_);
}
@@ -96,20 +97,12 @@ CLIWrapper::CLIWrapper(Ptr<marian::Options> options,
const std::string &footer,
size_t columnWidth,
size_t screenWidth)
- : CLIWrapper(options->getYaml(),
- description,
- header,
- footer,
- columnWidth,
- screenWidth) {}
+ : CLIWrapper(options->getYaml(), description, header, footer, columnWidth, screenWidth) {}
CLIWrapper::~CLIWrapper() {}
void CLIWrapper::switchGroup(const std::string &name) {
- if(name.empty())
- currentGroup_ = defaultGroup_;
- else
- currentGroup_ = name;
+ currentGroup_ = name.empty() ? defaultGroup_ : name;
}
void CLIWrapper::parse(int argc, char **argv) {
@@ -126,25 +119,77 @@ void CLIWrapper::parse(int argc, char **argv) {
}
}
-std::string CLIWrapper::failureMessage(const CLI::App *app,
- const CLI::Error &e) {
+std::string CLIWrapper::failureMessage(const CLI::App *app, const CLI::Error &e) {
std::string header = "Error: " + std::string(e.what()) + "\n";
if(app->get_help_ptr() != nullptr)
- header += "Run with " + app->get_help_ptr()->get_name()
- + " for more information.\n";
+ header += "Run with " + app->get_help_ptr()->get_name() + " for more information.\n";
return header;
}
-void CLIWrapper::overwriteDefault(const YAML::Node &node) {
- // iterate requested default values
- for(auto it : node) {
+bool CLIWrapper::updateConfig(const YAML::Node &config) {
+ bool success = true;
+ auto cmdOptions = getParsedOptionNames();
+ for(auto it : config) {
auto key = it.first.as<std::string>();
- ABORT_IF(!allVars_.count(key), "The following option was not expected: '{}'", key);
- // if we have an option but it was not specified on command-line
- if(allVars_.count(key) > 0 && opts_.at(key)->empty()) {
+ // skip options specified via command-line to allow overwriting them
+ if(cmdOptions.count(key))
+ continue;
+ if(options_.count(key)) {
config_[key] = YAML::Clone(it.second);
+ options_[key].modified = true;
+ } else {
+ success = false;
}
}
+ return success;
+}
+
+// Serialise the options held in config_ as YAML text, in option creation order,
+// with a header comment and one comment line per option group.
+// If skipDefault is true, options whose `modified` flag is not set are omitted.
+std::string CLIWrapper::dumpConfig(bool skipDefault /*= false*/) const {
+ YAML::Emitter out;
+ out << YAML::Comment("Marian configuration file generated at " + timer::currentDate()
+ + " with version " + buildVersion());
+ out << YAML::BeginMap;
+ // group name of the most recently emitted option
+ std::string comment;
+ for(const auto &key : getOrderedOptionNames()) {
+ // do not process keys that are removed from config_
+ if(!config_[key])
+ continue;
+ if(skipDefault && !options_.at(key).modified)
+ continue;
+ auto group = options_.at(key).opt->get_group();
+ // a change of group starts a new commented section, separated by a newline
+ if(comment != group) {
+ if(!comment.empty())
+ out << YAML::Newline;
+ comment = group;
+ out << YAML::Comment(group);
+ }
+ out << YAML::Key;
+ out << key;
+ out << YAML::Value;
+ cli::OutputYaml(config_[key], out);
+ }
+ out << YAML::EndMap;
+ return out.c_str();
+}
+
+// Collect the names of all registered options whose CLI option object is not empty.
+std::unordered_set<std::string> CLIWrapper::getParsedOptionNames() const {
+  std::unordered_set<std::string> parsed;
+  for(const auto &entry : options_) {
+    const bool hasResults = !entry.second.opt->empty();
+    if(hasResults)
+      parsed.emplace(entry.first);
+  }
+  return parsed;
+}
+
+// Return the names of all registered options, ordered by their creation index.
+std::vector<std::string> CLIWrapper::getOrderedOptionNames() const {
+  // gather every option name
+  std::vector<std::string> names;
+  for(const auto &entry : options_)
+    names.push_back(entry.first);
+  // compare two names by the idx stored with each option
+  auto byCreationIndex = [this](const std::string &lhs, const std::string &rhs) {
+    return options_.at(lhs).idx < options_.at(rhs).idx;
+  };
+  sort(names.begin(), names.end(), byCreationIndex);
+  return names;
+}
} // namespace cli
diff --git a/src/common/cli_wrapper.h b/src/common/cli_wrapper.h
index 67f2dff4..cf47a310 100755
--- a/src/common/cli_wrapper.h
+++ b/src/common/cli_wrapper.h
@@ -8,6 +8,7 @@
#include <iostream>
#include <map>
#include <string>
+#include <unordered_set>
namespace marian {
@@ -46,6 +47,19 @@ private:
size_t screenWidth_{0};
};
+// @TODO: in this file review the use of naked pointers. We use Ptr<Type> anywhere else,
+// what's up with that?
+
+/**
+ * The helper structure storing an option object, the associated variable, the creation index
+ * and a flag recording whether the option value has been changed.
+ */
+struct CLIOptionTuple {
+ // option object returned by the command-line parser (non-owning raw pointer)
+ CLI::Option *opt;
+ // variable that receives the parsed value of the option
+ Ptr<any_type> var;
+ // creation index of the option, used to order options
+ size_t idx{0};
+ // true once the value was set by the parse callback or by updateConfig()
+ bool modified{false};
+};
+
/**
* @brief The class used to define and parse command-line arguments.
*
@@ -63,10 +77,10 @@ private:
*/
class CLIWrapper {
private:
- // [option name] -> option value
- std::map<std::string, Ptr<any_type>> allVars_;
- // Map with option names and objects
- std::map<std::string, CLI::Option *> opts_;
+ // Map with option names and option tuples
+ std::unordered_map<std::string, CLIOptionTuple> options_;
+ // Counter for created options
+ size_t counter_{0};
// Command-line argument parser
Ptr<CLI::App> app_;
@@ -75,23 +89,22 @@ private:
// Name of the current option group
std::string currentGroup_{""};
- // If this is a wrapper then this should just be a reference,
- // then we do not have the added level of containment.
+ // Reference to the main config object
YAML::Node &config_;
// Option for --version flag. This is a special flag and similarly to --help,
// the key "version" will be not added into the YAML config
- CLI::Option* optVersion_;
+ CLI::Option *optVersion_;
static std::string failureMessage(const CLI::App *app, const CLI::Error &e);
- // Extract an option name from comma-separated list of command-line arguments,
- // e.g. 'help' from '--help,-h'
+ // Extract option name from a comma-separated list of long and short options, e.g. 'help' from
+ // '--help,-h'
std::string keyName(const std::string &args) const {
// re-use existing functions from CLI11 to keep option names consistent
- return std::get<1>(CLI::detail::get_names(CLI::detail::split_names(
- args))) // get long names only
- .front(); // get first long name
+ return std::get<1>(
+ CLI::detail::get_names(CLI::detail::split_names(args))) // get long names only
+ .front(); // get first long name
}
public:
@@ -112,23 +125,14 @@ public:
const std::string &description = "",
const std::string &header = "General options",
const std::string &footer = "",
- size_t columnWidth = 35,
+ size_t columnWidth = 40,
size_t screenWidth = 0);
/**
* @brief Create an instance of the command-line argument parser,
* short-cut for Options object.
*
- * Option --help, -h is automatically added.
- *
- * @param options A smart pointer to the Options object containing the
- * to-be-wrapped yaml tree
- * @param description Program description
- * @param header Header text for the main option group
- * @param footer Text displayed after the list of options
- * @param columnWidth Width of the column with option names
- * @param screenWidth Maximum allowed width for help messages, 0 means no
- * limit
+ * @see Other constructor
*/
CLIWrapper(Ptr<Options> options,
const std::string &description = "",
@@ -201,8 +205,7 @@ public:
* have a default value or be non-defaulted
*/
template <typename T>
- CLI::Option *add_nondefault(const std::string &args,
- const std::string &help) {
+ CLI::Option *add_nondefault(const std::string &args, const std::string &help) {
return add_option<T>(keyName(args),
args,
help,
@@ -212,8 +215,7 @@ public:
}
/**
- * Switch to different option group or to the default group if
- * argument is empty.
+ * Switch to different option group or to the default group if argument is empty.
*
* @param name Header of the option group
*/
@@ -222,23 +224,31 @@ public:
// Parse command-line arguments. Handles --help and --version options
void parse(int argc, char **argv);
- /**
+ /*
* @brief Overwrite values for unparsed options
*
- * Default values are overwritten with the options found in the config
- * provided as the argument, while parsed command-line options remain
- * unchanged
+ * Default values are overwritten with the options from the config provided, while parsed
+ * command-line options remain unchanged.
+ * This should be the preferred way of updating config options, as the class keeps track of
+ * the options whose values have changed.
*
* @param node YAML config with new default values for options
*/
- void overwriteDefault(const YAML::Node &node);
+ bool updateConfig(const YAML::Node &config);
+
+ // Get textual YAML representation of the config
+ std::string dumpConfig(bool skipDefault = false) const;
private:
- template <
- typename T,
- // options with numeric and string-like values
- CLI::enable_if_t<!CLI::is_bool<T>::value && !CLI::is_vector<T>::value,
- CLI::detail::enabler> = CLI::detail::dummy>
+ // Get names of options passed via command-line
+ std::unordered_set<std::string> getParsedOptionNames() const;
+ // Get option names in the same order as they are created
+ std::vector<std::string> getOrderedOptionNames() const;
+
+ template <typename T,
+ // options with numeric and string-like values
+ CLI::enable_if_t<!CLI::is_bool<T>::value && !CLI::is_vector<T>::value,
+ CLI::detail::enabler> = CLI::detail::dummy>
CLI::Option *add_option(const std::string &key,
const std::string &args,
const std::string &help,
@@ -248,13 +258,17 @@ private:
// define YAML entry if requested
if(addToConfig)
config_[key] = val;
- // create variable for the option
- allVars_.insert(std::make_pair(key, std::make_shared<any_type>(val)));
+
+ // create option tuple
+ CLIOptionTuple option;
+ option.idx = counter_++;
+ option.var = std::make_shared<any_type>(val);
// callback function collecting a command-line argument
CLI::callback_t fun = [this, key](CLI::results_t res) {
+ options_[key].modified = true;
// get variable associated with the option
- auto &var = allVars_[key]->as<T>();
+ auto &var = options_[key].var->as<T>();
// store parser result in var
auto ret = CLI::detail::lexical_cast(res[0], var);
// update YAML entry
@@ -275,15 +289,15 @@ private:
opt->default_str(ss.str());
}
- // store option object
- opts_.insert(std::make_pair(key, opt));
- return opts_[key];
+ // store option tuple
+ option.opt = opt;
+ options_.insert(std::make_pair(key, option));
+ return options_[key].opt;
}
template <typename T,
// options with vector values
- CLI::enable_if_t<CLI::is_vector<T>::value,
- CLI::detail::enabler> = CLI::detail::dummy>
+ CLI::enable_if_t<CLI::is_vector<T>::value, CLI::detail::enabler> = CLI::detail::dummy>
CLI::Option *add_option(const std::string &key,
const std::string &args,
const std::string &help,
@@ -293,13 +307,17 @@ private:
// define YAML entry if requested
if(addToConfig)
config_[key] = val;
- // create variable for the option
- allVars_.insert(std::make_pair(key, std::make_shared<any_type>(val)));
+
+ // create option tuple
+ CLIOptionTuple option;
+ option.idx = counter_++;
+ option.var = std::make_shared<any_type>(val);
// callback function collecting command-line arguments
CLI::callback_t fun = [this, key](CLI::results_t res) {
+ options_[key].modified = true;
// get vector variable associated with the option
- auto &vec = allVars_[key]->as<T>();
+ auto &vec = options_[key].var->as<T>();
vec.clear();
bool ret = true;
// handle '[]' as an empty vector
@@ -330,15 +348,15 @@ private:
if(defaulted)
opt->default_str(CLI::detail::join(val));
- // store option object
- opts_.insert(std::make_pair(key, opt));
- return opts_[key];
+ // store option tuple
+ option.opt = opt;
+ options_.insert(std::make_pair(key, option));
+ return options_[key].opt;
}
template <typename T,
// options with boolean values, called flags in CLI11
- CLI::enable_if_t<CLI::is_bool<T>::value,
- CLI::detail::enabler> = CLI::detail::dummy>
+ CLI::enable_if_t<CLI::is_bool<T>::value, CLI::detail::enabler> = CLI::detail::dummy>
CLI::Option *add_option(const std::string &key,
const std::string &args,
const std::string &help,
@@ -348,19 +366,23 @@ private:
// define YAML entry if requested
if(addToConfig)
config_[key] = val;
- // create variable for the option
- allVars_.insert(std::make_pair(key, std::make_shared<any_type>(val)));
+
+ // create option tuple
+ CLIOptionTuple option;
+ option.idx = counter_++;
+ option.var = std::make_shared<any_type>(val);
// callback function setting the flag
CLI::callback_t fun = [this, key](CLI::results_t res) {
+ options_[key].modified = true;
// get parser result, it is safe as boolean options have an implicit value
auto val = res[0];
auto ret = true;
if(val == "true" || val == "on" || val == "yes" || val == "1") {
- allVars_[key]->as<T>() = true;
+ options_[key].var->as<T>() = true;
config_[key] = true;
} else if(val == "false" || val == "off" || val == "no" || val == "0") {
- allVars_[key]->as<T>() = false;
+ options_[key].var->as<T>() = false;
config_[key] = false;
} else {
ret = false;
@@ -378,9 +400,10 @@ private:
// allow to use the flag without any argument
opt->implicit_val("true");
- // store option object
- opts_.insert(std::make_pair(key, opt));
- return opts_[key];
+ // store option tuple
+ option.opt = opt;
+ options_.insert(std::make_pair(key, option));
+ return options_[key].opt;
}
};
diff --git a/src/common/config.cpp b/src/common/config.cpp
index c5209008..e5208b0d 100755
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@@ -38,7 +38,7 @@ void Config::initialize(int argc, char** argv, cli::mode mode, bool validate) {
std::string quote; // attempt to quote special chars
if (arg.empty() || arg.find_first_of(" #`\"'\\${}|&^?*!()%><") != std::string::npos)
quote = "'";
- arg = regex::regex_replace(arg, std::regex("'"), "'\\''");
+ arg = regex::regex_replace(arg, regex::regex("'"), "'\\''");
if (!cmdLine.empty())
cmdLine.push_back(' ');
cmdLine += quote + arg + quote;
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index e7676b01..539579a1 100755
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -77,8 +77,9 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
"allow the use of environment variables in paths, of the form ${VAR_NAME}");
cli.add<bool>("--relative-paths",
"All paths are relative to the config file location");
- cli.add<bool>("--dump-config",
- "Dump current (modified) configuration to stdout and exit");
+ cli.add_nondefault<std::string>("--dump-config",
+ "Dump current (modified) configuration to stdout and exit. Possible values: full, minimal")
+ ->implicit_val("full");
// clang-format on
}
@@ -255,8 +256,13 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"If these files do not exist they are created");
#ifdef USE_SENTENCEPIECE
cli.add<std::vector<float>>("--sentencepiece-alphas",
- "Sampling factors for SentencePieceVocab;"
- "i-th factor corresponds to i-th vocabulary");
+ "Sampling factors for SentencePiece vocabulary; i-th factor corresponds to i-th vocabulary");
+ cli.add<std::string>("--sentencepiece-options",
+ "Pass-through command-line options to SentencePiece trainer");
+ cli.add<size_t>("--sentencepiece-max-lines",
+ "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. "
+ "When set to 0 all lines are going to be used.",
+ 10000000);
#endif
// scheduling options
cli.add<size_t>("--after-epochs,-e",
@@ -356,10 +362,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"none");
cli.add<std::string>("--guided-alignment-cost",
"Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)",
- "ce");
+ "mse");
cli.add<double>("--guided-alignment-weight",
"Weight for guided alignment cost",
- 1);
+ 0.1);
cli.add_nondefault<std::string>("--data-weighting",
"Path to a file with sentence or word weights");
cli.add<std::string>("--data-weighting-type",
@@ -396,8 +402,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Validate model every arg updates (append 't' for every arg target labels)",
"10000u");
cli.add<std::vector<std::string>>("--valid-metrics",
- "Metric to use during validation: cross-entropy, perplexity, valid-script, translation."
- " Multiple metrics can be specified",
+ "Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, "
+ " translation, bleu, bleu-detok. Multiple metrics can be specified",
std::vector<std::string>({"cross-entropy"}));
cli.add<size_t>("--early-stopping",
"Stop if the first validation metric does not improve for arg consecutive validation steps",
@@ -452,7 +458,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Paths to input file(s), stdin by default",
std::vector<std::string>({"stdin"}));
cli.add<std::string>("--output,-o",
- "Paths to output file(s), stdout by default",
+ "Path to output file, stdout by default",
"stdout");
cli.add<std::vector<std::string>>("--vocabs,-v",
"Paths to vocabulary files have to correspond to --input");
@@ -511,6 +517,9 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
// TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice
cli.add<std::vector<std::string>>("--train-sets,-t",
"Paths to corpora to be scored: source target");
+ cli.add<std::string>("--output,-o",
+ "Path to output file, stdout by default",
+ "stdout");
cli.add<std::vector<std::string>>("--vocabs,-v",
"Paths to vocabulary files have to correspond to --train-sets."
" If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}."
@@ -519,6 +528,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
"Score n-best list instead of plain text corpus");
cli.add<std::string>("--n-best-feature",
"Feature name to be inserted into n-best list", "Score");
+ cli.add<bool>("--normalize,-n",
+ "Divide translation score by translation length");
cli.add_nondefault<std::string>("--summary",
"Only print total cost, possible values: cross-entropy (ce-mean), ce-mean-words, ce-sum, perplexity")
->implicit_val("cross-entropy");
@@ -607,8 +618,40 @@ void ConfigParser::addSuboptionsInputLength(cli::CLIWrapper& cli) {
// clang-format on
}
+void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) {
+ // clang-format off
+ // support for universal encoder ULR https://arxiv.org/pdf/1802.05368.pdf
+ cli.add<bool>("--ulr",
+ "Enable ULR (Universal Language Representation)",
+ false);
+ // reading pre-trained universal embeddings for multi-sources.
+ // Note that source and target here is relative to ULR not the translation langs
+ // queries: EQ in Fig2 : is the unified embeddings projected to one space.
+ cli.add<std::string>("--ulr-query-vectors",
+ "Path to file with universal sources embeddings from projection into universal space",
+ "");
+ // keys: EK in Fig2 : is the keys of the target embeddings projected to unified space (i.e. ENU in
+ // multi-lingual case)
+ cli.add<std::string>("--ulr-keys-vectors",
+ "Path to file with universal sources embeddings of traget keys from projection into universal space",
+ "");
+ cli.add<bool>("--ulr-trainable-transformation",
+ "Make Query Transformation Matrix A trainable",
+ false);
+ cli.add<int>("--ulr-dim-emb",
+ "ULR monolingual embeddings dimension");
+ cli.add<float>("--ulr-dropout",
+ "ULR dropout on embeddings attentions. Default is no dropout",
+ 0.0f);
+ cli.add<float>("--ulr-softmax-temperature",
+ "ULR softmax temperature to control randomness of predictions. Deafult is 1.0: no temperature",
+ 1.0f);
+ // clang-format on
+}
+
void ConfigParser::expandAliases(cli::CLIWrapper& cli) {
YAML::Node config;
+ // The order of aliases does matter as later options overwrite earlier
if(config_["best-deep"].as<bool>()) {
config["layer-normalization"] = true;
@@ -622,11 +665,10 @@ void ConfigParser::expandAliases(cli::CLIWrapper& cli) {
config["skip"] = true;
}
- // @TODO: Quite sure CLIWrapper should not do that;
- // that's semantics that seem to belong into the current class
- // and has not really anything to do with CLI proper.
- if(config)
- cli.overwriteDefault(config);
+ if(config) {
+ auto success = cli.updateConfig(config);
+ ABORT_IF(!success, "Unknown option(s) in aliases, check if aliases consist of correct options");
+ }
}
void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
@@ -661,8 +703,8 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
auto configPaths = findConfigPaths();
if(!configPaths.empty()) {
auto config = loadConfigFiles(configPaths);
- // combine loaded options with the main config object
- cli.overwriteDefault(config);
+ auto success = cli.updateConfig(config);
+ ABORT_IF(!success, "There are option(s) in a config file that are not expected");
}
if(get<bool>("interpolate-env-vars")) {
@@ -677,11 +719,10 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
// remove extra config files from the config to avoid redundancy
config_.remove("config");
- if(get<bool>("dump-config")) {
+ if(has("dump-config")) {
+ bool skipDefault = get<std::string>("dump-config") == "minimal";
config_.remove("dump-config");
- YAML::Emitter emit;
- cli::OutputYaml(config_, emit);
- std::cout << emit.c_str() << std::endl;
+ std::cout << cli.dumpConfig(skipDefault) << std::endl;
exit(0);
}
@@ -714,8 +755,7 @@ std::vector<std::string> ConfigParser::findConfigPaths() {
return paths;
}
-YAML::Node ConfigParser::loadConfigFiles(
- const std::vector<std::string>& paths) {
+YAML::Node ConfigParser::loadConfigFiles(const std::vector<std::string>& paths) {
YAML::Node configAll;
for(auto& path : paths) {
@@ -750,33 +790,4 @@ YAML::Node ConfigParser::loadConfigFiles(
YAML::Node ConfigParser::getConfig() const {
return config_;
}
-
-void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) {
- // support for universal encoder ULR https://arxiv.org/pdf/1802.05368.pdf
- cli.add<bool>("--ulr",
- "Is ULR (Universal Language Representation) enabled?",
- false);
- // reading pre-trained universal embedings for multi-sources
- // note that source and target here is relative to ULR not the translation langs
- //queries: EQ in Fig2 : is the unified embbedins projected to one space.
- //"Path to file with universal sources embeddings from projection into universal space")
- cli.add<std::string>("--ulr-query-vectors",
- "Path to file with universal sources embeddings from projection into universal space",
- "");
- //keys: EK in Fig2 : is the keys of the target embbedins projected to unified space (i.e. ENU in multi-lingual case)
- cli.add<std::string>("--ulr-keys-vectors",
- "Path to file with universal sources embeddings of traget keys from projection into universal space",
- "");
- cli.add<bool>("--ulr-trainable-transformation",
- "Is Query Transformation Matrix A trainable ?",
- false);
- cli.add<int>("--ulr-dim-emb",
- "ULR mono embed dim");
- cli.add<float>("--ulr-dropout",
- "ULR dropout on embeddings attentions: default is no dropuout",
- 0.0f);
- cli.add<float>("--ulr-softmax-temperature",
- "ULR softmax temperature to control randomness of predictions- deafult is 1.0: no temperature ",
- 1.0f);
-}
} // namespace marian
diff --git a/src/common/config_parser.h b/src/common/config_parser.h
index 80f7e81c..de1cb70e 100755
--- a/src/common/config_parser.h
+++ b/src/common/config_parser.h
@@ -63,7 +63,7 @@ private:
// Abort if not set.
template <typename T>
T get(const std::string& key) const {
- ABORT_IF(!has(key), "CLI object has no key {}", key);
+ ABORT_IF(!has(key), "CLI object has no key '{}'", key);
return config_[key].as<T>();
}
diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp
index 625748e9..5086c726 100755
--- a/src/common/config_validator.cpp
+++ b/src/common/config_validator.cpp
@@ -84,9 +84,6 @@ void ConfigValidator::validateOptionsTraining() const {
ABORT_IF(!modelDir.empty() && !filesystem::isDirectory(modelDir),
"Model directory does not exist");
- ABORT_IF(!modelDir.empty() && !filesystem::canWrite(modelDir),
- "No write permission in model directory");
-
ABORT_IF(
has("valid-sets") && get<std::vector<std::string>>("valid-sets").size() != trainSets.size(),
"There should be as many validation sets as training sets");
diff --git a/src/common/file_stream.h b/src/common/file_stream.h
index caa12a6c..87cb7f9a 100755
--- a/src/common/file_stream.h
+++ b/src/common/file_stream.h
@@ -1,38 +1,24 @@
#pragma once
-// @TODO: this file still contains lots of stuff from boost::filesystem and boost::iostreams,
-// this has to be figured out.
-
#include "common/filesystem.h"
#include "common/logging.h"
#include "common/definitions.h"
-#include <boost/filesystem/fstream.hpp>
-#include <boost/iostreams/device/file_descriptor.hpp>
-#pragma warning(push)
-#pragma warning(disable: 4458) // declaration of 'traits_type' hides class member
-#pragma warning(disable: 4456) // declaration of 'c' hides previous local declaration
-#pragma warning(disable: 4244) // conversion from 'int' to 'char', possible loss of data
-#pragma warning(disable: 4706) // assignment within conditional expression
-#include <boost/iostreams/filter/gzip.hpp>
-#pragma warning(pop)
-#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
-#endif
-#include <boost/iostreams/filtering_stream.hpp>
-#ifdef __GNUC__
+#include "3rd_party/zstr/zstr.hpp"
#pragma GCC diagnostic pop
-#endif
+
+#include <boost/iostreams/device/file_descriptor.hpp>
+#include <boost/iostreams/stream_buffer.hpp>
+
#include <iostream>
#include <memory>
#ifdef _MSC_VER
-
#include <fcntl.h>
#include <io.h>
#include <stdlib.h>
-
#endif
namespace marian {
@@ -40,7 +26,7 @@ namespace io {
class TemporaryFile {
private:
- int fd_;
+ int fd_{-1};
bool unlink_;
std::string name_;
@@ -145,79 +131,78 @@ public:
class InputFileStream {
public:
- InputFileStream(const std::string& file) : file_(file), ifstream_(file_.getBoost()) {
- ABORT_IF(
- !marian::filesystem::exists(file_), "File '{}' does not exist", file);
-
- if(file_.extension() == marian::filesystem::Path(std::string(".gz")))
- istream_.push(boost::iostreams::gzip_decompressor());
- istream_.push(ifstream_);
+ InputFileStream(const std::string& file)
+ : file_(file) {
+ ABORT_IF(!marian::filesystem::exists(file_), "File '{}' does not exist", file);
+
+ if(file_.extension() == marian::filesystem::Path(".gz"))
+ // @TODO: consider make_unique for next refactoring
+ istream_.reset(new zstr::ifstream(file_.string()));
+ else
+ istream_.reset(new std::ifstream(file_.string()));
}
InputFileStream(TemporaryFile& tempfile)
: fds_(tempfile.getFileDescriptor(), boost::iostreams::never_close_handle) {
lseek(tempfile.getFileDescriptor(), 0, SEEK_SET);
- istream_.push(fds_, 1024);
+
+ namespace bio = boost::iostreams;
+ fdsBuffer_.reset(new bio::stream_buffer<bio::file_descriptor_source>(fds_));
+ istream_.reset(new std::istream(fdsBuffer_.get()));
}
- InputFileStream(std::istream& strm) { istream_.push(strm, 0); }
+ InputFileStream(std::istream& strm)
+ : istream_(new std::istream(strm.rdbuf())) {}
- operator std::istream&() { return istream_; }
+ operator std::istream&() { return *istream_; }
- operator bool() { return (bool)istream_; }
+ operator bool() { return (bool)*istream_; }
bool bad() const {
- return istream_.bad();
+ return istream_->bad();
}
bool fail() const {
- return istream_.fail();
+ return istream_->fail();
}
char widen(char c) {
- return istream_.widen(c);
- }
-
- bool isOpen() const {
- return ifstream_.is_open();
+ return istream_->widen(c);
}
std::string path() { return file_.string(); }
- bool empty() { return ifstream_.peek() == std::ifstream::traits_type::eof(); }
+ bool empty() { return istream_->peek() == std::ifstream::traits_type::eof(); }
void setbufsize(size_t size) const {
- ifstream_.rdbuf()->pubsetbuf(0, 0);
- //readBuf_ = std::make_unique<char[]>(size);
+ istream_->rdbuf()->pubsetbuf(0, 0);
readBuf_.reset(new char[size]);
- ifstream_.rdbuf()->pubsetbuf(readBuf_.get(), 0);
+ istream_->rdbuf()->pubsetbuf(readBuf_.get(), 0);
}
template <typename T>
friend InputFileStream& operator>>(InputFileStream& stream, T& t) {
- stream.istream_ >> t;
+ *stream.istream_ >> t;
// bad() seems to be correct here. Should not abort on EOF.
- ABORT_IF(stream.bad(),
- "Error reading from file '{}'",
- stream.path());
+ ABORT_IF(stream.bad(), "Error reading from file '{}'", stream.path());
return stream;
}
template <typename T>
size_t read(T* ptr, size_t num = 1) {
- istream_.read((char*)ptr, num * sizeof(T));
+ istream_->read((char*)ptr, num * sizeof(T));
// fail() seems to be correct here. Failure to read should abort.
- ABORT_IF(fail(),
- "Error reading from file '{}'",
- path());
+ ABORT_IF(fail(), "Error reading from file '{}'", path());
return num * sizeof(T);
}
private:
marian::filesystem::Path file_;
- boost::filesystem::ifstream ifstream_;
+ std::unique_ptr<std::istream> istream_;
+
boost::iostreams::file_descriptor_source fds_;
- boost::iostreams::filtering_istream istream_;
+ std::unique_ptr<boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_source>> fdsBuffer_;
+
mutable UPtr<char[]> readBuf_; // for setbuf()
};
@@ -226,9 +211,7 @@ private:
static inline InputFileStream& getline(InputFileStream& in, std::string& line) {
std::getline((std::istream&)in, line);
// bad() seems to be correct here. Should not abort on EOF.
- ABORT_IF(in.bad(),
- "Error reading from file '{}'",
- in.path());
+ ABORT_IF(in.bad(), "Error reading from file '{}'", in.path());
// strip terminal CR if present
if(in && !line.empty() && line.back() == in.widen('\r'))
line.pop_back();
@@ -240,9 +223,7 @@ static inline InputFileStream& getline(InputFileStream& in, std::string& line) {
static inline InputFileStream& getline(InputFileStream& in, std::string& line, char delim) {
std::getline((std::istream&)in, line, delim);
// bad() seems to be correct here. Should not abort on EOF.
- ABORT_IF(in.bad(),
- "Error reading from file '{}'",
- in.path());
+ ABORT_IF(in.bad(), "Error reading from file '{}'", in.path());
// strip terminal CR if present
if(in && !line.empty() && line.back() == in.widen('\r'))
line.pop_back();
@@ -251,62 +232,61 @@ static inline InputFileStream& getline(InputFileStream& in, std::string& line, c
class OutputFileStream {
public:
- OutputFileStream(const std::string& file) : file_(file), ofstream_(file_.getBoost()) {
- ABORT_IF(
- !marian::filesystem::exists(file_), "File '{}' does not exist", file);
+ OutputFileStream(const std::string& file) : file_(file) {
+ if(file_.extension() == marian::filesystem::Path(".gz"))
+ ostream_.reset(new zstr::ofstream(file_.string()));
+ else
+ ostream_.reset(new std::ofstream(file_.string()));
- if(file_.extension() == marian::filesystem::Path(std::string(".gz")))
- ostream_.push(boost::iostreams::gzip_compressor());
- ostream_.push(ofstream_);
+ ABORT_IF(!marian::filesystem::exists(file_), "File '{}' could not be opened", file);
}
OutputFileStream(TemporaryFile& tempfile)
: fds_(tempfile.getFileDescriptor(), boost::iostreams::never_close_handle) {
lseek(tempfile.getFileDescriptor(), 0, SEEK_SET);
- ostream_.push(fds_, 1024);
+
+ namespace bio = boost::iostreams;
+ fdsBuffer_.reset(new bio::stream_buffer<bio::file_descriptor_sink>(fds_));
+ ostream_.reset(new std::ostream(fdsBuffer_.get()));
}
- OutputFileStream(std::ostream& strm) { ostream_.push(strm, 0); }
+ OutputFileStream(std::ostream& strm) {
+ ostream_.reset(new std::ostream(strm.rdbuf()));
+ }
- operator std::ostream&() { return ostream_; }
+ operator std::ostream&() { return *ostream_; }
- operator bool() { return (bool)ostream_; }
+ operator bool() { return (bool)*ostream_; }
bool bad() const {
- return ostream_.bad();
+ return ostream_->bad();
}
bool fail() const {
- return ostream_.fail();
+ return ostream_->fail();
}
template <typename T>
friend OutputFileStream& operator<<(OutputFileStream& stream, const T& t) {
- stream.ostream_ << t;
+ *stream.ostream_ << t;
// fail() seems to be correct here. Failure to write should abort.
- ABORT_IF(stream.fail(),
- "Error writing to file '{}'",
- stream.path());
+ ABORT_IF(stream.fail(), "Error writing to file '{}'", stream.path());
return stream;
}
// handle things like std::endl which is actually a function not a value
friend OutputFileStream& operator<<(OutputFileStream& stream, std::ostream& (*var)(std::ostream&)) {
- stream.ostream_ << var;
+ *stream.ostream_ << var;
// fail() seems to be correct here. Failure to write should abort.
- ABORT_IF(stream.fail(),
- "Error writing to file '{}'",
- stream.path());
+ ABORT_IF(stream.fail(), "Error writing to file '{}'", stream.path());
return stream;
}
template <typename T>
size_t write(const T* ptr, size_t num = 1) {
- ostream_.write((char*)ptr, num * sizeof(T));
+ ostream_->write((char*)ptr, num * sizeof(T));
// fail() seems to be correct here. Failure to write should abort.
- ABORT_IF(fail(),
- "Error writing to file '{}'",
- path());
+ ABORT_IF(fail(), "Error writing to file '{}'", path());
return num * sizeof(T);
}
@@ -314,9 +294,10 @@ public:
private:
marian::filesystem::Path file_;
- boost::filesystem::ofstream ofstream_;
+ std::unique_ptr<std::ostream> ostream_;
+
boost::iostreams::file_descriptor_sink fds_;
- boost::iostreams::filtering_ostream ostream_;
+ std::unique_ptr<boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_sink>> fdsBuffer_;
};
}
diff --git a/src/common/filesystem.h b/src/common/filesystem.h
index 66927313..9dd0ae55 100755
--- a/src/common/filesystem.h
+++ b/src/common/filesystem.h
@@ -1,66 +1,57 @@
#pragma once
-// @TODO: This is a temporary file to move every function from boost::filesystem used in Marian
-// into one place. Marian should call functions only from this file. boost::filesystem will
-// be removed. This needs to be portable to Windows too.
+// This is a shallow wrapper around a filesystem path library.
+// We used this to wrap boost::filesystem, now we are wrapping
+// Pathie, a small open source lib.
+// @TODO: go back to canonical names for functions and objects
+// as specified in C++17 so it becomes easy to move in the future
-#ifdef __GNUC__
#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wsuggest-override"
-#endif
-#include <boost/filesystem.hpp>
-#ifdef __GNUC__
-// "ignored -Wunused-variable" above ignores 'static const' declarations (where 'static'
-// is not needed). We work around by referencing the offending variables in dummy code.
-static inline void boost_dummy_filesystem() { boost::system::posix_category; boost::system::errno_ecat; boost::system::native_ecat; }
+#include "3rd_party/pathie-cpp/include/path.hpp"
+#include "3rd_party/pathie-cpp/include/errors.hpp"
#pragma GCC diagnostic pop
-#endif
namespace marian {
namespace filesystem {
- struct Path {
+ class Path {
private:
- boost::filesystem::path path;
+ Pathie::Path path;
public:
Path() {}
Path(const Path& p) : path{p.path} {}
Path(const std::string& s) : path{s} {}
- Path(const boost::filesystem::path& p) : path{p} {}
+ Path(const Pathie::Path& p) : path{p} {}
Path parentPath() const {
- return Path{path.parent_path()};
+ return Path(path.parent());
}
Path filename() const {
- return Path{path.filename()};
+ return Path(path.basename());
}
Path extension() const {
- return Path{path.extension()};
+ return Path(path.extension());
}
bool empty() const {
- return path.empty();
+ return path.str().empty();
}
- const boost::filesystem::path& getBoost() const {
+ const Pathie::Path& getImpl() const {
return path;
}
- operator std::string&() {
- return (std::string&)path;
- }
-
operator std::string() const {
- return path.string();
+ return path.str();
}
std::string string() const {
- return path.string();
+ return path.str();
}
bool operator==(const Path& p) const {
@@ -73,35 +64,31 @@ namespace filesystem {
};
static inline Path currentPath() {
- return Path{boost::filesystem::current_path()};
+ return Path(Pathie::Path::pwd());
}
- static inline Path canonical(const Path& p, const Path& dir) {
- return Path{ boost::filesystem::canonical(p.getBoost(), dir.getBoost()) };
+ static inline Path canonical(const Path& p, const Path& base) {
+ // create absolute base path
+ return p.getImpl().absolute(base.getImpl()).expand();
}
static inline bool exists(const Path& p) {
- return boost::filesystem::exists(p.getBoost());
+ return p.getImpl().exists();
}
static inline size_t fileSize(const Path& p) {
- return boost::filesystem::file_size(p.getBoost());
+ return p.getImpl().size();
}
static inline bool isDirectory(const Path& p) {
- return boost::filesystem::is_directory(p.getBoost());
- }
-
- static inline bool canWrite(const Path& p) {
- return (boost::filesystem::status(p.getBoost()).permissions() & boost::filesystem::owner_write) != 0;
+ return p.getImpl().is_directory();
}
- // concatenation?
static inline Path operator/ (const Path& lhs, const Path& rhs) {
- return lhs.getBoost() / rhs.getBoost();
+ return Path(lhs.getImpl() / rhs.getImpl());
}
- using FilesystemError = boost::filesystem::filesystem_error;
+ using FilesystemError = Pathie::PathieError;
}
} \ No newline at end of file
diff --git a/src/common/logging.cpp b/src/common/logging.cpp
index fdb999ca..0170d633 100755
--- a/src/common/logging.cpp
+++ b/src/common/logging.cpp
@@ -150,9 +150,11 @@ void switchtoMultinodeLogging(std::string nodeIdStr) {
namespace marian {
- void noinline logCallStack(size_t skipLevels)
- {
- auto callStack = ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true);
- checkedLog("general", "critical", "Call stack:{}", callStack);
+ std::string noinline getCallStack(size_t skipLevels) {
+ return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true);
+ }
+
+ void noinline logCallStack(size_t skipLevels) {
+ checkedLog("general", "critical", getCallStack(skipLevels));
}
}
diff --git a/src/common/logging.h b/src/common/logging.h
index 091044ea..cdaa806c 100755
--- a/src/common/logging.h
+++ b/src/common/logging.h
@@ -6,6 +6,7 @@
namespace marian {
void logCallStack(size_t skipLevels);
+ std::string getCallStack(size_t skipLevels);
}
/**
@@ -46,13 +47,19 @@ namespace marian {
*
* @param ... Message text and variables
*/
-#define ABORT(...) \
- do { \
- checkedLog("general", "critical", __VA_ARGS__); \
- ::marian::logCallStack(/*skipLevels=*/0); \
- std::cerr << "Aborted from " << FUNCTION_NAME << " in " << __FILE__ \
- << ": " << __LINE__ << std::endl; \
- std::abort(); \
+#define ABORT(...) \
+ do { \
+ auto logger = spdlog::get("general"); \
+ if(logger == nullptr) \
+ logger = createStderrLogger("general", "[%Y-%m-%d %T] Error: %v"); \
+ else \
+ logger->set_pattern("[%Y-%m-%d %T] Error: %v"); \
+ checkedLog("general", "critical", __VA_ARGS__); \
+ checkedLog("general", "critical", "Aborted from {} in {}:{}", \
+ FUNCTION_NAME, __FILE__, __LINE__); \
+ logger->set_pattern("%v"); \
+ checkedLog("general", "critical", marian::getCallStack(/*skipLevels=*/0)); \
+ std::abort(); \
} while(0)
/**
@@ -85,18 +92,6 @@ template <class... Args>
void checkedLog(std::string logger, std::string level, Args... args) {
Logger log = spdlog::get(logger);
if(!log) {
- if(level == "critical") {
- // log and errlog are not the same, hence we need to check
- // if an error logger exists first and not try to create a
- // second one. Otherwise this will throw an exception.
- Logger errlog = spdlog::get("error");
- if(!errlog)
- errlog = createStderrLogger("error", "Error: %v - aborting");
- errlog->critical(args...);
- }
- // @TODO: should other loggers do something? This seems to be
- // a sink state when logs are not intialized. Critical errors
- // should log nevertheless, non-critical go unreported.
return;
}
diff --git a/src/common/timer.h b/src/common/timer.h
index 6f86b54f..4172cfc7 100755
--- a/src/common/timer.h
+++ b/src/common/timer.h
@@ -12,6 +12,14 @@
namespace marian {
namespace timer {
+// Helper function to get the current date and time
+static std::string currentDate() {
+ std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+ char date[100] = {0};
+ std::strftime(date, sizeof(date), "%F %X %z", std::localtime(&now));
+ return date;
+}
+
// Timer measures elapsed time.
// This is a wrapper around std::chrono providing wall time only
class Timer {
diff --git a/src/common/version.cpp b/src/common/version.cpp
index a31c7df7..75814d92 100755
--- a/src/common/version.cpp
+++ b/src/common/version.cpp
@@ -1,12 +1,10 @@
#include "common/version.h"
-#include "common/project_version.h" // cmake-generated file, major/minor/tweak versions
-#include "common/git_revision.h" // make-generated file, contains git commit info
+#include "common/git_revision.h" // make-generated file, contains git commit info
+#include "common/project_version.h" // cmake-generated file, major/minor/tweak versions
namespace marian {
-std::string buildVersion()
-{
+std::string buildVersion() {
return std::string(PROJECT_VERSION) + " " + GIT_REVISION;
}
-
}
diff --git a/src/common/version.h b/src/common/version.h
index a0c8ab22..a425af93 100755
--- a/src/common/version.h
+++ b/src/common/version.h
@@ -3,5 +3,5 @@
#include <string>
namespace marian {
- std::string buildVersion();
+ std::string buildVersion();
}
diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp
index 70660467..7a7a846e 100755
--- a/src/data/corpus.cpp
+++ b/src/data/corpus.cpp
@@ -106,9 +106,12 @@ void Corpus::shuffleData(const std::vector<std::string>& paths) {
size_t numStreams = paths.size();
+ size_t numSentences;
std::vector<std::vector<std::string>> corpus(numStreams); // [stream][id]
- if (!corpusInRAM_.empty()) // when caching, we use what we have instead
+ if (!corpusInRAM_.empty()) { // when caching, we use what we have instead
corpus = std::move(corpusInRAM_); // temporarily move ownership here, will be moved back
+ numSentences = corpus[0].size();
+ }
else {
files_.resize(numStreams);
for(size_t i = 0; i < numStreams; ++i) {
@@ -132,10 +135,9 @@ void Corpus::shuffleData(const std::vector<std::string>& paths) {
ABORT_IF(eofsHit != 0, "Not all input files have the same number of lines");
}
files_.clear();
- LOG(info, "[data] Done reading {} sentences.", corpus[0].size());
+ numSentences = corpus[0].size();
+ LOG(info, "[data] Done reading {} sentences", numSentences);
}
- size_t numSentences = corpus[0].size();
- LOG(info, "[data] Done reading {} sentences", numSentences);
// randomize sequence ids, and remember them
ids_.resize(numSentences);
diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp
index 880a6946..c9704313 100755
--- a/src/data/corpus_base.cpp
+++ b/src/data/corpus_base.cpp
@@ -75,10 +75,14 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
if(maxVocabs.size() < paths_.size())
maxVocabs.resize(paths_.size(), 0);
+ LOG(info, "No vocabulary files given, trying to find or build based on training data. "
+ "Vocabularies will be built separately for each file.");
+
// Create vocabs if not provided
for(size_t i = 0; i < paths_.size(); ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
- int vocSize = vocab->loadOrCreate("", paths_[i], maxVocabs[i]);
+ std::vector<std::string> trainPaths = { paths_[i] };
+ int vocSize = vocab->loadOrCreate("", trainPaths, maxVocabs[i]);
// TODO: this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
@@ -92,9 +96,31 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
if(maxVocabs.size() < vocabPaths.size())
maxVocabs.resize(paths_.size(), 0);
+ // Helper object for grouping training data based on vocabulary file name
+ struct PathsAndSize {
+ std::set<std::string> paths; // contains all paths that are used for training the vocabulary
+ size_t size; // contains the maximum vocabulary size
+ };
+
+ // Group training files based on vocabulary path. If the same
+ // vocab path corresponds to different training files, this means
+ // that a single vocab should combine tokens from all files.
+ std::map<std::string, PathsAndSize> groupVocab;
+ for(size_t i = 0; i < vocabPaths.size(); ++i) {
+ groupVocab[vocabPaths[i]].paths.insert(paths_[i]);
+ if(groupVocab[vocabPaths[i]].size < maxVocabs[i])
+ groupVocab[vocabPaths[i]].size = maxVocabs[i];
+ }
+
for(size_t i = 0; i < vocabPaths.size(); ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
- int vocSize = vocab->loadOrCreate(vocabPaths[i], paths_[i], maxVocabs[i]);
+
+ // Get the set of files that corresponds to the vocab. If the next file is the same vocab,
+ // it will not be created again, but just correctly loaded.
+ auto pathsAndSize = groupVocab[vocabPaths[i]];
+ std::vector<std::string> groupedPaths(pathsAndSize.paths.begin(), pathsAndSize.paths.end());
+ int vocSize = vocab->loadOrCreate(vocabPaths[i], groupedPaths, pathsAndSize.size);
+
// TODO: this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
diff --git a/src/data/default_vocab.cpp b/src/data/default_vocab.cpp
index 98bf5d8f..1ce055db 100755
--- a/src/data/default_vocab.cpp
+++ b/src/data/default_vocab.cpp
@@ -30,14 +30,16 @@ private:
class VocabFreqOrderer {
private:
- std::unordered_map<std::string, size_t>& counter_;
+ const std::unordered_map<std::string, size_t>& counter_;
public:
- VocabFreqOrderer(std::unordered_map<std::string, size_t>& counter)
- : counter_(counter) {}
+ VocabFreqOrderer(const std::unordered_map<std::string, size_t>& counter)
+ : counter_(counter) {}
+ // order first by decreasing frequency,
+ // if frequencies are the same order lexicographically by vocabulary string
bool operator()(const std::string& a, const std::string& b) const {
- return counter_[a] > counter_[b] || (counter_[a] == counter_[b] && a < b);
+ return counter_.at(a) > counter_.at(b) || (counter_.at(a) == counter_.at(b) && a < b);
}
};
@@ -117,10 +119,6 @@ public:
auto str = pair.first;
auto id = pair.second;
- if(SPEC2SYM.count(str)) {
- seenSpecial.insert(id);
- }
-
// note: this requires ids to be sorted by frequency
if(!max || id < (Word)max) {
insertWord(id, str);
@@ -174,8 +172,6 @@ public:
};
// @TODO: the hard-att code has not yet been updated to accept EOS at any id
requireWord(DEFAULT_EOS_ID, DEFAULT_EOS_STR);
- for(auto id : seenSpecial)
- requireWord(id, SYM2SPEC.at(id));
}
return std::max((int)id2str_.size(), max);
@@ -187,52 +183,50 @@ public:
unkId_ = insertWord(DEFAULT_UNK_ID, DEFAULT_UNK_STR);
}
- void create(const std::string& vocabPath, const std::string& trainPath) override {
- LOG(info, "[data] Creating vocabulary {} from {}", vocabPath, trainPath);
-
- filesystem::Path path(vocabPath);
- auto dir = path.parentPath();
- if(dir.empty())
- dir = filesystem::currentPath();
+ virtual void create(const std::string& vocabPath,
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize = 0) override {
- ABORT_IF(!dir.empty() && !filesystem::isDirectory(dir),
- "Specified vocab directory {} does not exist",
- dir.string());
+ LOG(info, "[data] Creating vocabulary {} from {}",
+ vocabPath,
+ utils::join(trainPaths, ", "));
- ABORT_IF(!dir.empty() && !filesystem::canWrite(dir),
- "No write permission in vocab directory {}",
- dir.string());
+ if(vocabPath != "stdout") {
+ filesystem::Path path(vocabPath);
+ auto dir = path.parentPath();
+ if(dir.empty())
+ dir = filesystem::currentPath();
- ABORT_IF(filesystem::exists(vocabPath),
- "DefaultVocab file '{}' exists. Not overwriting",
- path.string());
+ ABORT_IF(!dir.empty() && !filesystem::isDirectory(dir),
+ "Specified vocab directory {} does not exist",
+ dir.string());
- io::InputFileStream trainStrm(trainPath);
- io::OutputFileStream vocabStrm(vocabPath);
- create(trainStrm, vocabStrm);
+ ABORT_IF(filesystem::exists(vocabPath),
+ "Vocabulary file '{}' exists. Not overwriting",
+ path.string());
+ }
+
+ std::unordered_map<std::string, size_t> counter;
+ for(const auto& trainPath : trainPaths)
+ addCounts(counter, trainPath);
+ create(vocabPath, counter, maxSize);
}
- void create(io::InputFileStream& trainStrm,
- io::OutputFileStream& vocabStrm,
- size_t maxSize = 0) override {
- std::string line;
- std::unordered_map<std::string, size_t> counter;
+private:
- std::unordered_set<Word> seenSpecial;
+ void addCounts(std::unordered_map<std::string, size_t>& counter,
+ const std::string& trainPath) {
+ std::unique_ptr<io::InputFileStream> trainStrm(
+ trainPath == "stdin" ? new io::InputFileStream(std::cin)
+ : new io::InputFileStream(trainPath)
+ );
- while(getline((std::istream&)trainStrm, line)) {
+ std::string line;
+ while(getline(*trainStrm, line)) {
std::vector<std::string> toks;
-
- // we do not want any unexpected behavior during creation
- // e.g. sampling, hence use inference mode
utils::split(line, toks, " ");
for(const std::string& tok : toks) {
- if(SPEC2SYM.count(tok)) {
- seenSpecial.insert(SPEC2SYM.at(tok));
- continue;
- }
-
auto iter = counter.find(tok);
if(iter == counter.end())
counter[tok] = 1;
@@ -240,6 +234,11 @@ public:
iter->second++;
}
}
+ }
+
+ void create(const std::string& vocabPath,
+ const std::unordered_map<std::string, size_t>& counter,
+ size_t maxSize = 0) {
std::vector<std::string> vocabVec;
for(auto& p : counter)
@@ -251,14 +250,7 @@ public:
vocabYaml.force_insert(DEFAULT_EOS_STR, DEFAULT_EOS_ID);
vocabYaml.force_insert(DEFAULT_UNK_STR, DEFAULT_UNK_ID);
- for(auto word : seenSpecial)
- vocabYaml.force_insert(SYM2SPEC.at(word), word);
-
Word maxSpec = 1;
- for(auto i : seenSpecial)
- if(i > maxSpec)
- maxSpec = i;
-
auto vocabSize = vocabVec.size();
if(maxSize > maxSpec)
vocabSize = std::min(maxSize - maxSpec - 1, vocabVec.size());
@@ -266,10 +258,13 @@ public:
for(size_t i = 0; i < vocabSize; ++i)
vocabYaml.force_insert(vocabVec[i], i + maxSpec + 1);
- vocabStrm << vocabYaml;
+ std::unique_ptr<io::OutputFileStream> vocabStrm(
+ vocabPath == "stdout" ? new io::OutputFileStream(std::cout)
+ : new io::OutputFileStream(vocabPath)
+ );
+ *vocabStrm << vocabYaml;
}
-private:
Words operator()(const std::vector<std::string>& lineTokens,
bool addEOS) const {
Words words(lineTokens.size());
diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp
index d9c24415..ed476ec4 100755
--- a/src/data/sentencepiece_vocab.cpp
+++ b/src/data/sentencepiece_vocab.cpp
@@ -2,13 +2,18 @@
#ifdef USE_SENTENCEPIECE
#include "sentencepiece/src/sentencepiece_processor.h"
-#endif
+#include "sentencepiece/src/sentencepiece_trainer.h"
+#endif
+#include "common/config.h"
#include "common/options.h"
#include "common/logging.h"
#include "common/filesystem.h"
#include "common/regex.h"
+#include <sstream>
+#include <random>
+
namespace marian {
#ifdef USE_SENTENCEPIECE
@@ -28,9 +33,85 @@ private:
Ptr<Options> options_;
size_t batchIndex_{0};
+ std::mt19937 generator_;
+ std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX
+
+ // Sample from one file, based on first algorithm from:
+ // https://en.wikipedia.org/wiki/Reservoir_sampling
+ void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines,
+ const std::string& trainPath, size_t maxLines, size_t maxBytes) {
+
+ ABORT_IF(maxLines == 0, "Sample needs to be larger 0");
+
+ std::unique_ptr<io::InputFileStream> trainStrm(
+ trainPath == "stdin" ? new io::InputFileStream(std::cin)
+ : new io::InputFileStream(trainPath)
+ );
+
+ std::string line;
+ while(getline(*trainStrm, line)) {
+ if(line.size() > 0 && line.size() < maxBytes) {
+ if(sample.size() < maxLines) {
+ sample.push_back(line);
+ }
+ else {
+ size_t i = randInt_(generator_) % (seenLines + 1);
+ if(i < maxLines)
+ sample[i] = line;
+ }
+ seenLines++;
+ }
+ }
+ }
+
+ // Iterate over all input files and collect a representative sample via reservoir sampling.
+ // The sample will first grow to the desired size and next keep sampling with decreasing
+ // probability in the hope to get a uniform sample from the union of all files.
+ size_t reservoirSamplingAll(io::TemporaryFile& temp,
+ const std::vector<std::string>& trainPaths,
+ size_t maxLines, size_t maxBytes) {
+ LOG(info, "[SentencePiece] Sampling at most {} lines from {}", maxLines, utils::join(trainPaths, ", "));
+
+ std::vector<std::string> sample;
+ size_t seenLines = 0;
+ for(const auto& trainPath : trainPaths)
+ reservoirSampling(sample, seenLines, trainPath, maxLines, maxBytes);
+ std::shuffle(sample.begin(), sample.end(), generator_);
+
+ io::OutputFileStream out(temp);
+ for(const auto& line : sample)
+ out << line << std::endl;
+
+ LOG(info, "[SentencePiece] Selected {} lines", sample.size());
+ return sample.size();
+ }
+
+ // Just concatenate all files to a temporary file so SentencePiece can consume it.
+ size_t dumpAll(io::TemporaryFile& temp,
+ const std::vector<std::string>& trainPaths,
+ size_t maxBytes) {
+ LOG(info, "[SentencePiece] Selecting all lines from {}", utils::join(trainPaths, ", "));
+
+ size_t seenLines = 0;
+ std::string line;
+ io::OutputFileStream out(temp);
+ for(const auto& trainPath : trainPaths) {
+ io::InputFileStream in(trainPath);
+ while(getline(in, line)) {
+ if(line.size() > 0 && line.size() < maxBytes) {
+ out << line << std::endl;
+ seenLines++;
+ }
+ }
+ }
+
+ LOG(info, "[SentencePiece] Selected {} lines", seenLines);
+ return seenLines;
+ }
+
public:
SentencePieceVocab(Ptr<Options> options, size_t batchIndex)
- : options_(options), batchIndex_(batchIndex) {
+ : options_(options), batchIndex_(batchIndex), generator_(Config::seed) {
if(options_->has("sentencepiece-alphas")) {
auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
@@ -41,47 +122,91 @@ public:
if(alpha_ > 0)
LOG(debug,
- "Setting SentencePieceVocab sampling factor to {} for input {}",
+ "Setting SentencePiece vocabulary sampling factor to {} for input {}",
alpha_,
batchIndex_);
}
}
- virtual const std::string& canonicalExtension() const { return suffixes_[0]; }
- virtual const std::vector<std::string>& suffixes() const { return suffixes_; }
+ virtual const std::string& canonicalExtension() const override { return suffixes_[0]; }
+ virtual const std::vector<std::string>& suffixes() const override { return suffixes_; }
virtual std::string suffix() { return suffixes_[0]; };
- virtual std::string type() const { return "SentencePieceVocab"; }
+ virtual std::string type() const override { return "SentencePieceVocab"; }
virtual Word getEosId() const override { return (Word)spm_->eos_id(); }
virtual Word getUnkId() const override { return (Word)spm_->unk_id(); }
- void create(const std::string& /*vocabPath*/, const std::string& /*trainPath*/) {
- ABORT("[data] Training of SentencePieceVocab not yet supported");
- }
+ void create(const std::string& vocabPath,
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize) override {
+
+ size_t defaultMaxSize = 32000;
+ size_t maxLines = options_->get<size_t>("sentencepiece-max-lines");
+ size_t maxBytes = 2048;
+
+ LOG(info, "[SentencePiece] Training SentencePiece vocabulary {}", vocabPath);
- void create(io::InputFileStream& /*trainStrm*/,
- io::OutputFileStream& /*vocabStrm*/,
- size_t /*maxSize*/) {
- ABORT("[data] Training of SentencePieceVocab not yet supported");
+ if(maxSize == 0) {
+ LOG(info, "[SentencePiece] Vocabulary size is undefined (set with --dim-vocabs ...) - setting to {}", defaultMaxSize);
+ maxSize = defaultMaxSize;
+ }
+
+ // Create temporary file to hold the sample for the SentencePiece trainer
+ io::TemporaryFile temp(options_->get<std::string>("tempdir"), false);
+ std::string tempFileName = temp.getFileName();
+ LOG(info, "[SentencePiece] Creating temporary file {}", tempFileName);
+
+ size_t seenLines = 0;
+ if(maxLines == 0)
+ seenLines = dumpAll(temp, trainPaths, maxBytes);
+ else
+ seenLines = reservoirSamplingAll(temp, trainPaths, maxLines, maxBytes);
+
+ // Compose the SentencePiece training command from filenames and parameters
+ std::stringstream command;
+ command
+ << " --bos_id=-1 --eos_id=0 --unk_id=1" // these should not be changed as they match Marian defaults
+ << " --input=" << tempFileName
+ << " --model_prefix=" << vocabPath
+ << " --vocab_size=" << maxSize
+ << " --max_sentence_length=" << maxBytes
+ << " --input_sentence_size=" << seenLines
+ << " " << options_->get<std::string>("sentencepiece-options"); // these are SentencePiece command line options
+
+ // Train the SentencePiece model
+ const auto status = sentencepiece::SentencePieceTrainer::Train(command.str());
+ ABORT_IF(!status.ok(),
+ "SentencePiece vocabulary error: {}",
+ status.ToString());
+
+ LOG(info, "[SentencePiece] Removing {}", vocabPath + ".vocab");
+ ABORT_IF(remove((vocabPath + ".vocab").c_str()) != 0,
+ "Could not remove {}",
+ vocabPath + ".vocab");
+
+ LOG(info, "[SentencePiece] Renaming {} to {}", vocabPath + ".model", vocabPath);
+ ABORT_IF(rename((vocabPath + ".model").c_str(), vocabPath.c_str()) != 0,
+ "Could not rename {} to {}",
+ vocabPath + ".model", vocabPath);
}
- void createFake() {
- ABORT("[data] Fake SentencePieceVocab not supported");
+ void createFake() override {
+ ABORT("[SentencePiece] Fake SentencePiece vocabulary not supported");
}
- Word operator[](const std::string& token) const {
+ Word operator[](const std::string& token) const override {
return (Word)spm_->PieceToId(token);
}
- const std::string& operator[](Word id) const {
+ const std::string& operator[](Word id) const override {
ABORT_IF(id >= size(), "Unknown word id: ", id);
return spm_->IdToPiece(id);
}
- Words encode(const std::string& line, bool addEOS, bool inference) const {
+ Words encode(const std::string& line, bool addEOS, bool inference) const override {
std::vector<int> spmIds;
if(inference || alpha_ == 0)
spm_->Encode(line, &spmIds);
@@ -95,7 +220,7 @@ public:
return words;
}
- std::string decode(const Words& sentence, bool ignoreEOS) const {
+ std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
std::string line;
// convert vector of Word to vector of int
std::vector<int> spmSentence(sentence.begin(), sentence.end());
@@ -103,29 +228,29 @@ public:
return line;
}
- size_t size() const {
+ size_t size() const override {
return spm_->GetPieceSize();
}
- int load(const std::string& vocabPath, int /*max*/) {
- LOG(info, "[data] Loading SentencePieceVocab from file {}", vocabPath);
+ int load(const std::string& vocabPath, int /*max*/) override {
+ LOG(info, "[data] Loading SentencePiece vocabulary from file {}", vocabPath);
ABORT_IF(!filesystem::exists(vocabPath),
- "SentencePieceVocab file {} does not exits",
- vocabPath);
+ "SentencePiece vocabulary file {} does not exits",
+ vocabPath);
spm_.reset(new sentencepiece::SentencePieceProcessor());
const auto status = spm_->Load(vocabPath);
ABORT_IF(!status.ok(),
- "SentencePieceVocab error: {}",
- status.ToString());
+ "SentencePiece vocabulary error: {}",
+ status.ToString());
return spm_->GetPieceSize();
}
};
-#endif
+#endif // USE_SENTENCEPIECE
Ptr<VocabBase> createSentencePieceVocab(const std::string& vocabPath, Ptr<Options> options, size_t batchIndex) {
bool isSentencePiece = regex::regex_search(vocabPath, regex::regex("\\.(spm)$"));
diff --git a/src/data/types.h b/src/data/types.h
index 62566a74..2bda6ece 100644
--- a/src/data/types.h
+++ b/src/data/types.h
@@ -28,27 +28,4 @@ const std::string DEFAULT_UNK_STR = "<unk>";
const std::string NEMATUS_EOS_STR = "eos";
const std::string NEMATUS_UNK_STR = "UNK";
-const Word STP_ID = 2;
-const Word CPY_ID = 3;
-const Word DEL_ID = 4;
-const Word RPL_ID = 5;
-
-const std::string STP_STR = "<step>";
-const std::string CPY_STR = "<c>";
-const std::string DEL_STR = "<d>";
-const std::string RPL_STR = "<r>";
-
-const std::unordered_map<std::string, Word> SPEC2SYM = {
- {STP_STR, STP_ID},
- {CPY_STR, CPY_ID},
- {DEL_STR, DEL_ID},
- {RPL_STR, RPL_ID},
-};
-
-const std::unordered_map<Word, std::string> SYM2SPEC = {
- {STP_ID, STP_STR},
- {CPY_ID, CPY_STR},
- {DEL_ID, DEL_STR},
- {RPL_ID, RPL_STR},
-};
} // namespace marian
diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp
index 09849b2e..e95ea721 100755
--- a/src/data/vocab.cpp
+++ b/src/data/vocab.cpp
@@ -13,62 +13,63 @@ Ptr<VocabBase> createVocab(const std::string& vocabPath, Ptr<Options> options, s
}
int Vocab::loadOrCreate(const std::string& vocabPath,
- const std::string& trainPath,
- int max) {
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize) {
size_t size = 0;
if(vocabPath.empty()) {
// No vocabulary path was given, attempt to first find a vocabulary
- // for trainPath + possible suffixes. If not found attempt to create
- // as trainPath + canonical suffix.
+ // for trainPaths[0] + possible suffixes. If not found attempt to create
+ // as trainPaths[0] + canonical suffix.
+ // Only search based on the first path; maybe disable this altogether?
LOG(info,
"No vocabulary path given; "
"trying to find default vocabulary based on data path {}",
- trainPath);
+ trainPaths[0]);
vImpl_ = createDefaultVocab();
- size = vImpl_->findAndLoad(trainPath, max);
+ size = vImpl_->findAndLoad(trainPaths[0], maxSize);
if(size == 0) {
- auto path = trainPath + vImpl_->canonicalExtension();
+ auto newVocabPath = trainPaths[0] + vImpl_->canonicalExtension();
LOG(info,
"No vocabulary path given; "
- "trying to find vocabulary based on data path {}",
- trainPath);
- vImpl_->create(path, trainPath);
- size = vImpl_->load(path, max);
+ "trying to create vocabulary based on data paths {}",
+ utils::join(trainPaths, ", "));
+ create(newVocabPath, trainPaths, maxSize);
+ size = load(newVocabPath, maxSize);
}
} else {
if(!filesystem::exists(vocabPath)) {
// Vocabulary path was given, but no vocabulary present,
// attempt to create in specified location.
- create(vocabPath, trainPath);
+ create(vocabPath, trainPaths, maxSize);
}
// Vocabulary path exists, attempting to load
- size = load(vocabPath, max);
+ size = load(vocabPath, maxSize);
}
LOG(info, "[data] Setting vocabulary size for input {} to {}", batchIndex_, size);
return (int)size;
}
-int Vocab::load(const std::string& vocabPath, int max) {
+int Vocab::load(const std::string& vocabPath, size_t maxSize) {
if(!vImpl_)
vImpl_ = createVocab(vocabPath, options_, batchIndex_);
- return vImpl_->load(vocabPath, max);
+ return vImpl_->load(vocabPath, maxSize);
}
-void Vocab::create(const std::string& vocabPath, const std::string& trainPath) {
+void Vocab::create(const std::string& vocabPath,
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize) {
if(!vImpl_)
vImpl_ = createVocab(vocabPath, options_, batchIndex_);
- vImpl_->create(vocabPath, trainPath);
+ vImpl_->create(vocabPath, trainPaths, maxSize);
}
-void Vocab::create(io::InputFileStream& trainStrm,
- io::OutputFileStream& vocabStrm,
+void Vocab::create(const std::string& vocabPath,
+ const std::string& trainPath,
size_t maxSize) {
- if(!vImpl_)
- vImpl_ = createDefaultVocab(); // Only DefaultVocab can be built from streams
- vImpl_->create(trainStrm, vocabStrm, maxSize);
+ create(vocabPath, std::vector<std::string>({trainPath}), maxSize);
}
void Vocab::createFake() {
diff --git a/src/data/vocab.h b/src/data/vocab.h
index 1551f746..4bad1795 100755
--- a/src/data/vocab.h
+++ b/src/data/vocab.h
@@ -26,15 +26,18 @@ public:
: options_(options), batchIndex_(batchIndex) {}
int loadOrCreate(const std::string& vocabPath,
- const std::string& textPath,
- int max = 0);
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize = 0);
- int load(const std::string& vocabPath, int max = 0);
- void create(const std::string& vocabPath, const std::string& trainPath);
+ int load(const std::string& vocabPath, size_t maxSize = 0);
- void create(io::InputFileStream& trainStrm,
- io::OutputFileStream& vocabStrm,
- size_t maxSize = 0);
+ void create(const std::string& vocabPath,
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize);
+
+ void create(const std::string& vocabPath,
+ const std::string& trainPath,
+ size_t maxSize);
// string token to token id
Word operator[](const std::string& word) const;
diff --git a/src/data/vocab_base.h b/src/data/vocab_base.h
index 23e1520c..d3078d9a 100644
--- a/src/data/vocab_base.h
+++ b/src/data/vocab_base.h
@@ -1,19 +1,19 @@
#pragma once
+#include "data/types.h"
#include "common/definitions.h"
+#include "common/utils.h"
#include "common/file_stream.h"
-#include "data/types.h"
namespace marian {
class VocabBase {
public:
virtual int load(const std::string& vocabPath, int max = 0) = 0;
- virtual void create(const std::string& vocabPath, const std::string& trainPath) = 0;
- virtual void create(io::InputFileStream& trainStrm,
- io::OutputFileStream& vocabStrm,
- size_t maxSize = 0) = 0;
+ virtual void create(const std::string& vocabPath,
+ const std::vector<std::string>& trainPaths,
+ size_t maxSize) = 0;
// return canonical suffix for given type of vocabulary
virtual const std::string& canonicalExtension() const = 0;
diff --git a/src/examples/mnist/model_lenet.h b/src/examples/mnist/model_lenet.h
index ac0298e3..c2a39977 100644
--- a/src/examples/mnist/model_lenet.h
+++ b/src/examples/mnist/model_lenet.h
@@ -12,12 +12,12 @@ public:
MnistLeNet(Ptr<Options> options, Args... args)
: MnistFeedForwardNet(options, args...) {}
- virtual void clear(Ptr<ExpressionGraph> graph) { graph->clear(); };
+ virtual void clear(Ptr<ExpressionGraph> graph) override { graph->clear(); };
protected:
virtual Expr construct(Ptr<ExpressionGraph> g,
Ptr<data::Batch> batch,
- bool inference = false) {
+ bool inference = false) override {
const std::vector<int> dims = {784, 128, 10};
// Start with an empty expression graph
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index 01546c1e..7da85443 100755
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -490,20 +490,20 @@ struct RowsNodeOp : public NaryNodeOp {
// This operation indexes a tensor along an axis.
// This is similar to the common gather() operation in other toolkits.
// For example, this can be used for:
-// - Same index applied to all batch items (today's select()):
-// 'index' has 1 in the axes that match batch axes in the input, and axis set to the one axis that gets selected over.
-// Example: Selecting Transformer head 0, i.e. return a[:,1,:,:]
-// axis = -3
-// a : (B, H , S, T) B=batch dim, H=#heads, S=src length, T=trg length
-// idx: ( #1#, 1, 1) #1# denotes 'axis'. All values are zero.
-// out: (B, 1 , S, T) out[b, 0, s, t] == a[b, idx[/*0,*/ 0, s, t], s, t]
-// - Same data with batched indices (today's rows()):
-// 'data' has 1 in the batch axes.
-// Example: Embedding lookup as done today using rows():
-// axis = -2
-// e : ( V , E) V=vocab size, E=embedding dimension
-// idx: (#(B*S)#, 1) B=batch size, S=source length, idx values are in range 0..V-1
-// out: ( (B*S) , E) out[b, s, e] == e[/*0,*/ idx[b, s, 0], e]
+// - Same index applied to all batch items (today's select()):
+// 'index' has 1 in the axes that match batch axes in the input, and axis set to the one axis that gets selected over.
+// Example: Selecting Transformer head 0, i.e. return a[:,1,:,:]
+// axis = -3
+// a : (B, H , S, T) B=batch dim, H=#heads, S=src length, T=trg length
+// idx: ( #1#, 1, 1) #1# denotes 'axis'. All values are zero.
+// out: (B, 1 , S, T) out[b, 0, s, t] == a[b, idx[/*0,*/ 0, s, t], s, t]
+// - Same data with batched indices (today's rows()):
+// 'data' has 1 in the batch axes.
+// Example: Embedding lookup as done today using rows():
+// axis = -2
+// e : ( V , E) V=vocab size, E=embedding dimension
+// idx: (#(B*S)#, 1) B=batch size, S=source length, idx values are in range 0..V-1
+// out: ( (B*S) , E) out[b, s, e] == e[/*0,*/ idx[b, s, 0], e]
// - Batched selection (x-ent scenario): Both 'index' and 'data' have matching batch axes.
// Example: Cross-entropy loss as -select(logSoftmax(logits), groundTruth, axis=-1):
// axis = -1
@@ -511,14 +511,14 @@ struct RowsNodeOp : public NaryNodeOp {
// idx: (B, T, #1#) idx values are in range 0..V-1
// out: (B, T, 1 ) out[b,t,0] == lp[b, t, idx[b, t, 0]]
// Example for 2D tensor with axis=0:
-// | t[index[0, 0] 0] t[index[0, 1] 1] |
-// | t[index[1, 0] 0] t[index[1, 1] 1] |
-// And for axis 1:
-// | t[0 index[0, 0]] t[0 index[0, 1]] |
-// | t[1 index[1, 0]] t[1 index[1, 1]] |
-// For a 3-D tensor the output is specified by:
-// out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0
-// out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1
+// | t[index[0, 0] 0] t[index[0, 1] 1] |
+// | t[index[1, 0] 0] t[index[1, 1] 1] |
+// And for axis 1:
+// | t[0 index[0, 0]] t[0 index[0, 1]] |
+// | t[1 index[1, 0]] t[1 index[1, 1]] |
+// For a 3-D tensor the output is specified by:
+// out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0
+// out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1
// out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2
// If 'a' and 'indices' do not have the same rank, then negative 'axis' is
// interpreted relative to 'a', and 'indices' must have the resulting axis.
@@ -953,6 +953,7 @@ struct HighwayNodeOp : public NaryNodeOp {
};
#ifdef CUDNN
+
class ConvolutionOp : public NaryNodeOp {
public:
ConvolutionOp(const std::vector<Expr>& nodes,
@@ -970,12 +971,12 @@ public:
conv_.getOutputShape(nodes[0]->shape(), shape_);
}
- NodeOps forwardOps() {
+ NodeOps forwardOps() override {
return {NodeOp(conv_.forward(
child(0)->val(), child(1)->val(), child(2)->val(), val_))};
}
- NodeOps backwardOps() {
+ NodeOps backwardOps() override {
return {NodeOp(conv_.backward(child(0)->val(),
child(0)->grad(),
child(1)->val(),
@@ -984,7 +985,7 @@ public:
adj_))};
}
- const std::string type() { return "layer_convolution"; }
+ const std::string type() override { return "layer_convolution"; }
protected:
ConvolutionWrapper conv_;
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index 9a752786..b8b19208 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -881,16 +881,16 @@ public:
strideWidth,
mode) {}
- NodeOps forwardOps() {
+ NodeOps forwardOps() override {
return {NodeOp(pooling_.forward(child(0)->val(), val_))};
}
- NodeOps backwardOps() {
+ NodeOps backwardOps() override {
return {NodeOp(
pooling_.backward(child(0)->val(), child(0)->grad(), val_, adj_))};
}
- const std::string type() { return "layer_pooling"; }
+ const std::string type() override { return "layer_pooling"; }
protected:
PoolingWrapper pooling_;
diff --git a/src/layers/loss.cpp b/src/layers/loss.cpp
index 87a2a1fd..03b79682 100755
--- a/src/layers/loss.cpp
+++ b/src/layers/loss.cpp
@@ -15,6 +15,8 @@ Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference) {
return New<PerplexityLoss>(smoothing);
} else if(costType == "ce-rescore") {
return New<CrossEntropyRescoreLoss>(smoothing);
+ } else if(costType == "ce-rescore-mean") {
+ return New<CrossEntropyRescoreMeanLoss>(smoothing);
} else { // same as ce-mean
return New<CrossEntropyMeanLoss>(smoothing);
}
@@ -108,4 +110,14 @@ Expr CrossEntropyRescoreLoss::getCost(Expr logits,
auto ce = getCrossEntropy(logits, indices, mask, weights);
return -sum(ce, /*axis =*/ -3);
}
+
+Expr CrossEntropyRescoreMeanLoss::getCost(Expr logits,
+ Expr indices,
+ Expr mask,
+ Expr weights) {
+ auto ce = getCrossEntropy(logits, indices, mask, weights);
+ // divide by number of words in sentence
+ return -sum(ce, /*axis =*/ -3) / sum(mask, /*axis =*/ -3);
+}
+
} // namespace marian
diff --git a/src/layers/loss.h b/src/layers/loss.h
index 89d20819..ebf71147 100644
--- a/src/layers/loss.h
+++ b/src/layers/loss.h
@@ -66,5 +66,11 @@ public:
Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights) override;
};
+class CrossEntropyRescoreMeanLoss : public LossBase {
+public:
+ explicit CrossEntropyRescoreMeanLoss(float smoothing = 0) : LossBase(smoothing){};
+ Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights) override;
+};
+
Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference);
} // namespace marian
diff --git a/src/layers/word2vec_reader.h b/src/layers/word2vec_reader.h
index f18fd439..a7e85592 100755
--- a/src/layers/word2vec_reader.h
+++ b/src/layers/word2vec_reader.h
@@ -18,8 +18,6 @@ public:
LOG(info, "[data] Loading embedding vectors from {}", fileName);
io::InputFileStream embFile(fileName);
- ABORT_IF(!embFile.isOpen(),
- "Unable to open file with embeddings: " + fileName);
std::string line;
std::vector<std::string> values;
@@ -75,19 +73,19 @@ private:
values.reserve(dimEmb);
// Glorot numal distribution
float scale = sqrtf(2.0f / (dimVoc + dimEmb));
-
+
// @TODO: switch to new random generator back-end.
- // This is rarly used however.
+ // This is rarely used however.
std::random_device rd;
std::mt19937 engine(rd());
-
+
std::normal_distribution<float> d(0, scale);
auto gen = [&d, &engine] () {
return d(engine);
};
std::generate(values.begin(), values.end(), gen);
-
+
return values;
}
};
diff --git a/src/models/char_s2s.h b/src/models/char_s2s.h
index c4dce6f5..6d5d1db1 100644
--- a/src/models/char_s2s.h
+++ b/src/models/char_s2s.h
@@ -12,7 +12,7 @@ public:
CharS2SEncoder(Ptr<Options> options) : EncoderS2S(options) {}
virtual Ptr<EncoderState> build(Ptr<ExpressionGraph> graph,
- Ptr<data::CorpusBatch> batch) {
+ Ptr<data::CorpusBatch> batch) override {
auto embeddings = buildSourceEmbeddings(graph);
// select embeddings that occur in the batch
diff --git a/src/models/hardatt.h b/src/models/hardatt.h
deleted file mode 100755
index 77ba7f44..00000000
--- a/src/models/hardatt.h
+++ /dev/null
@@ -1,303 +0,0 @@
-#pragma once
-
-#include "marian.h"
-
-#include "layers/generic.h"
-#include "rnn/attention_constructors.h"
-#include "rnn/types.h"
-
-#include <numeric>
-
-namespace marian {
-
-class DecoderStateHardAtt : public DecoderState {
-protected:
- std::vector<IndexType> attentionIndices_;
-
-public:
- DecoderStateHardAtt(const rnn::States& states,
- Expr logProbs,
- const std::vector<Ptr<EncoderState>>& encStates,
- Ptr<data::CorpusBatch> batch)
- : DecoderState(states, logProbs, encStates, batch) {}
-
- virtual Ptr<DecoderState> select(const std::vector<IndexType>& selIdx,
- int beamSize) const override {
- std::vector<IndexType> selectedAttentionIndices;
- for(auto i : selIdx)
- selectedAttentionIndices.push_back(attentionIndices_[i]);
-
- auto selectedState = New<DecoderStateHardAtt>(states_.select(selIdx, beamSize, /*isBatchMajor=*/false),
- logProbs_,
- encStates_,
- batch_);
- selectedState->attentionIndices_ = selectedAttentionIndices;
-
- // Set positon of new state based on the target token position of current
- // state
- // @TODO: I copied this to make this consistent with the other instances. Needed?
- selectedState->setPosition(getPosition());
- return selectedState;
- }
-
- // @TODO: why are these virtual?
- virtual void setAttentionIndices(
- const std::vector<IndexType>& attentionIndices) {
- attentionIndices_ = attentionIndices;
- }
-
- virtual std::vector<IndexType>& getAttentionIndices() {
- ABORT_IF(attentionIndices_.empty(), "Empty attention indices");
- return attentionIndices_;
- }
-
- virtual void blacklist(Expr totalCosts, Ptr<data::CorpusBatch> batch) override {
- auto attentionIdx = getAttentionIndices();
- int dimVoc = totalCosts->shape()[-1];
- for(size_t i = 0; i < attentionIdx.size(); i++) {
- if(batch->front()->data()[attentionIdx[i]] != 0) {
- totalCosts->val()->set(
- i * dimVoc + DEFAULT_EOS_ID, // this is checked at vocab-load time
- // if the special tokens are present
- std::numeric_limits<float>::lowest());
- } else {
- totalCosts->val()->set(i * dimVoc + STP_ID,
- std::numeric_limits<float>::lowest());
- }
- }
- }
-};
-
-class DecoderHardAtt : public DecoderBase {
-protected:
- Ptr<rnn::RNN> rnn_;
- std::unordered_set<Word> specialSymbols_;
-
-public:
- DecoderHardAtt(Ptr<Options> options) : DecoderBase(options) {
- if(options->has("special-vocab")) {
- auto spec = options->get<std::vector<Word>>("special-vocab");
- specialSymbols_.insert(spec.begin(), spec.end());
- }
- }
-
- virtual Ptr<DecoderState> startState(
- Ptr<ExpressionGraph> graph,
- Ptr<data::CorpusBatch> batch,
- std::vector<Ptr<EncoderState>>& encStates) override {
-
- std::vector<Expr> meanContexts;
- for(auto& encState : encStates) {
- // average the source context weighted by the batch mask
- // this will remove padded zeros from the average
- meanContexts.push_back(weighted_average(
- encState->getContext(), encState->getMask(), /*axis =*/ -3));
- }
-
- Expr start;
- if(!meanContexts.empty()) {
- // apply single layer network to mean to map into decoder space
- auto mlp = mlp::mlp(graph) //
- .push_back(mlp::dense(graph) //
- ("prefix", prefix_ + "_ff_state") //
- ("dim", opt<int>("dim-rnn")) //
- ("activation", (int)mlp::act::tanh) //
- ("layer-normalization",
- opt<bool>("layer-normalization")));
- start = mlp->apply(meanContexts);
- }
-
- rnn::States startStates(opt<size_t>("dec-depth"), {start, start});
- auto startState = New<DecoderStateHardAtt>(startStates, nullptr, encStates, batch);
- startState->setAttentionIndices(std::vector<IndexType>({ 0 }));
- return startState;
- }
-
- virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph,
- Ptr<DecoderState> state) override {
-
- auto type = options_->get<std::string>("type");
-
- int dimTrgVoc = options_->get<std::vector<int>>("dim-vocabs").back();
-
- int dimTrgEmb = options_->get<int>("dim-emb");
-
- int dimDecState = options_->get<int>("dim-rnn");
- bool layerNorm = options_->get<bool>("layer-normalization");
- bool skipDepth = options_->get<bool>("skip");
-
- size_t decoderLayers = options_->get<size_t>("dec-depth");
- auto cellType = options_->get<std::string>("dec-cell");
-
- float dropoutRnn = inference_ ? 0 : options_->get<float>("dropout-rnn");
- float dropoutTrg = inference_ ? 0 : options_->get<float>("dropout-trg");
-
- auto stateHardAtt = std::dynamic_pointer_cast<DecoderStateHardAtt>(state);
-
- auto trgEmbeddings = stateHardAtt->getTargetEmbeddings();
-
- auto context = stateHardAtt->getEncoderStates()[0]->getContext();
- int dimContext = context->shape()[-1];
- int dimSrcWords = context->shape()[-3];
-
- int dimBatch = context->shape()[-2];
- int dimTrgWords = trgEmbeddings->shape()[-3];
- int dimBeam = trgEmbeddings->shape()[-4];
-
- if(dropoutTrg) {
- trgEmbeddings
- = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
- }
-
- auto flatContext = reshape(context, {dimBatch * dimSrcWords, dimContext});
- auto attendedContext
- = rows(flatContext, stateHardAtt->getAttentionIndices());
- attendedContext = reshape(attendedContext,
- {dimBeam, dimTrgWords, dimBatch, dimContext});
-
- auto rnnInputs = concatenate({trgEmbeddings, attendedContext}, /*axis =*/ -1);
- int dimInput = rnnInputs->shape()[-1];
-
- if(!rnn_) {
- auto rnn = rnn::rnn(graph) //
- ("type", cellType) //
- ("dimInput", dimInput) //
- ("dimState", dimDecState) //
- ("dropout", dropoutRnn) //
- ("layer-normalization", layerNorm) //
- ("skip", skipDepth);
-
- if(type == "hard-soft-att") {
- auto attCell = rnn::stacked_cell(graph) //
- .push_back(rnn::cell(graph) //
- ("prefix", prefix_ + "_cell1"));
- for(size_t i = 0; i < state->getEncoderStates().size(); ++i) {
- std::string prefix = prefix_;
- if(state->getEncoderStates().size() > 1)
- prefix += "_att" + std::to_string(i + 1);
-
- attCell.push_back(rnn::attention(graph) //
- ("prefix", prefix) //
- .set_state(state->getEncoderStates()[i]));
- }
-
- attCell.push_back(rnn::cell(graph) //
- ("prefix", prefix_ + "_cell2") //
- ("final", true));
- rnn.push_back(attCell);
- } else {
- rnn.push_back(rnn::cell(graph)("prefix", prefix_));
- }
-
- for(size_t i = 0; i < decoderLayers - 1; ++i)
- rnn.push_back(rnn::cell(graph) //
- ("prefix", prefix_ + "_l" + std::to_string(i)));
-
- rnn_ = rnn.construct();
- }
-
- auto decContext = rnn_->transduce(rnnInputs, stateHardAtt->getStates());
- rnn::States decStates = rnn_->lastCellStates();
-
- //// 2-layer feedforward network for outputs and cost
- auto out = mlp::mlp(graph)
- .push_back(mlp::dense(graph) //
- ("prefix", prefix_ + "_ff_logit_l1") //
- ("dim", dimTrgEmb) //
- ("activation", (int)mlp::act::tanh) //
- ("layer-normalization", layerNorm)) //
- .push_back(mlp::dense(graph) //
- ("prefix", prefix_ + "_ff_logit_l2") //
- ("dim", dimTrgVoc));
-
- Expr logits;
- if(type == "hard-soft-att") {
- std::vector<Expr> alignedContexts;
- for(int k = 0; k < state->getEncoderStates().size(); ++k) {
- // retrieve all the aligned contexts computed by the attention mechanism
- auto att = rnn_->at(0)
- ->as<rnn::StackedCell>()
- ->at(k + 1)
- ->as<rnn::Attention>();
- alignedContexts.push_back(att->getContext());
- }
-
- Expr alignedContext;
- if(alignedContexts.size() > 1)
- alignedContext = concatenate(alignedContexts, /*axis =*/ -1);
- else if(alignedContexts.size() == 1)
- alignedContext = alignedContexts[0];
-
- logits = out->apply(rnnInputs, decContext, alignedContext);
- } else {
- logits = out->apply(rnnInputs, decContext);
- }
-
- auto nextState = New<DecoderStateHardAtt>(decStates,
- logits,
- stateHardAtt->getEncoderStates(),
- stateHardAtt->getBatch());
- nextState->setAttentionIndices(std::vector<IndexType>(stateHardAtt->getAttentionIndices()));
- nextState->setPosition(state->getPosition() + 1); // @TODO: I added this for consistency. Correct?
- return nextState;
- }
-
- const std::vector<Expr> getAlignments() {
- auto att = rnn_->at(0)->as<rnn::StackedCell>()->at(1)->as<rnn::Attention>();
- return att->getAlignments();
- }
-
- void embeddingsFromBatch(Ptr<ExpressionGraph> graph,
- Ptr<DecoderState> state,
- Ptr<data::CorpusBatch> batch) override {
- DecoderBase::embeddingsFromBatch(graph, state, batch);
-
- auto subBatch = (*batch)[batchIndex_];
- int dimBatch = (int)subBatch->batchSize();
- int dimWords = (int)subBatch->batchWidth();
-
- std::vector<IndexType> attentionIndices(dimBatch, 0);
- std::vector<IndexType> currentPos(dimBatch, 0);
- std::iota(currentPos.begin(), currentPos.end(), 0);
-
- for(int i = 0; i < dimWords - 1; ++i) {
- for(int j = 0; j < dimBatch; ++j) {
- Word word = subBatch->data()[i * dimBatch + j];
- if(specialSymbols_.count(word))
- currentPos[j] += dimBatch;
- attentionIndices.push_back(currentPos[j]);
- }
- }
-
- std::dynamic_pointer_cast<DecoderStateHardAtt>(state)->setAttentionIndices(
- attentionIndices);
- }
-
- virtual void embeddingsFromPrediction(Ptr<ExpressionGraph> graph,
- Ptr<DecoderState> state,
- const std::vector<IndexType>& embIdx,
- int dimBatch,
- int beamSize) override {
- DecoderBase::embeddingsFromPrediction(
- graph, state, embIdx, dimBatch, beamSize);
-
- auto stateHardAtt = std::dynamic_pointer_cast<DecoderStateHardAtt>(state);
-
- size_t dimSrcWords
- = state->getEncoderStates()[0]->getContext()->shape()[-3];
-
- if(embIdx.empty()) {
- stateHardAtt->setAttentionIndices({0});
- } else {
- for(size_t i = 0; i < embIdx.size(); ++i)
- if(specialSymbols_.count(embIdx[i])) {
- stateHardAtt->getAttentionIndices()[i]++;
- if(stateHardAtt->getAttentionIndices()[i] >= dimSrcWords)
- stateHardAtt->getAttentionIndices()[i] = (IndexType)dimSrcWords - 1;
- }
- }
- }
-
- void clear() override { rnn_ = nullptr; }
-};
-} // namespace marian
diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp
index 65629176..d42f07c8 100644
--- a/src/models/model_factory.cpp
+++ b/src/models/model_factory.cpp
@@ -6,7 +6,6 @@
#include "models/costs.h"
#include "models/amun.h"
-#include "models/hardatt.h"
#include "models/nematus.h"
#include "models/s2s.h"
#include "models/transformer_factory.h"
@@ -47,11 +46,6 @@ Ptr<DecoderBase> DecoderFactory::construct() {
if(options_->get<std::string>("type") == "transformer")
// return New<DecoderTransformer>(options_);
return NewDecoderTransformer(options_);
- if(options_->get<std::string>("type") == "hard-att")
- return New<DecoderHardAtt>(options_);
- if(options_->get<std::string>("type") == "hard-soft-att")
- return New<DecoderHardAtt>(options_);
-
ABORT("Unknown decoder type");
}
@@ -120,24 +114,6 @@ Ptr<ModelBase> by_type(std::string type, usage use, Ptr<Options> options) {
.construct();
}
- if(type == "hard-att") {
- return models::encoder_decoder()(options)
- ("usage", use)
- ("original-type", type)
- .push_back(models::encoder()("type", "s2s"))
- .push_back(models::decoder()("type", "hard-att"))
- .construct();
- }
-
- if(type == "hard-soft-att") {
- return models::encoder_decoder()(options)
- ("usage", use)
- ("original-type", type)
- .push_back(models::encoder()("type", "s2s"))
- .push_back(models::decoder()("type", "hard-soft-att"))
- .construct();
- }
-
if(type == "multi-s2s") {
size_t numEncoders = 2;
auto ms2sFactory = models::encoder_decoder()(options)
@@ -172,25 +148,6 @@ Ptr<ModelBase> by_type(std::string type, usage use, Ptr<Options> options) {
return ms2sFactory.construct();
}
- if(type == "multi-hard-att") {
- size_t numEncoders = 2;
- auto ms2sFactory = models::encoder_decoder()(options)
- ("usage", use)
- ("type", "s2s")
- ("original-type", type);
-
- for(size_t i = 0; i < numEncoders; ++i) {
- auto prefix = "encoder" + std::to_string(i + 1);
- ms2sFactory.push_back(models::encoder()("prefix", prefix)("index", i));
- }
-
- ms2sFactory.push_back(models::decoder()
- ("index", numEncoders)
- ("type", "hard-soft-att"));
-
- return ms2sFactory.construct();
- }
-
if(type == "multi-transformer") {
size_t numEncoders = 2;
auto mtransFactory = models::encoder_decoder()(options)
diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h
index bf57a2b6..fa456856 100644
--- a/src/rescorer/rescorer.h
+++ b/src/rescorer/rescorer.h
@@ -52,8 +52,12 @@ public:
ABORT_IF(options_->has("summary") && options_->has("alignment"),
"Alignments can not be produced with summarized score");
+ ABORT_IF(options_->has("summary") && options_->get<bool>("normalize"),
+ "Normalization by length cannot be used with summary scores");
+
options_->set("inference", true);
- options_->set("cost-type", "ce-rescore");
+ // @TODO: make normalize here a float and pass into loss to compute the same way as in decoding
+ options_->set("cost-type", options_->get<bool>("normalize") ? "ce-rescore-mean" : "ce-rescore");
if(options_->get<bool>("n-best"))
corpus_ = New<CorpusNBest>(options_);
@@ -97,6 +101,8 @@ public:
std::string alignment = options_->get<std::string>("alignment", "");
bool summarize = options_->has("summary");
+ bool normalize = options_->get<bool>("normalize");
+
std::string summary = summarize ? options_->get<std::string>("summary") : "cross-entropy";
float sumCost = 0;
@@ -118,7 +124,11 @@ public:
builder = models_[id % graphs_.size()];
}
+ // @TODO: normalize by length as in normalize
+ // Once we have Frank's concept of ce-sum with sample size by words we will return a pair
+ // here which will make it trivial to report all variants.
auto costNode = builder->build(graph, batch);
+
graph->forward();
std::vector<float> scores;
@@ -141,13 +151,29 @@ public:
output->Write((long)batch->getSentenceIds()[i], scores[i], aligns[i]);
}
}
+
+ // progress heartbeat for MS-internal Philly compute cluster
+ // otherwise this job may be killed prematurely if no log for 4 hrs
+ if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster
+ && id % 1000 == 0) // heartbeat once every 1000 batches
+ {
+ auto progress = id / 10000.f; //fake progress for now, becomes >100 after 1M batches
+ fprintf(stdout, "PROGRESS: %.2f%%\n", progress);
+ fflush(stdout);
+ }
};
- pool.enqueue(task, batchId % graphs_.size());
- batchId++;
+ pool.enqueue(task, batchId++);
}
}
+ if(normalize) {
+ LOG(info, "Total normalized log probs {} : Total sentences {} : Total words {}", sumCost, sumSamples, sumWords);
+ LOG(warn, "Sum of normalized log probs is a sum of averages");
+ } else {
+ LOG(info, "Total log probs {} : Total sentences {} : Total words {}", sumCost, sumSamples, sumWords);
+ }
+
if(summarize) {
float cost = 0;
if(summary == "perplexity")
diff --git a/src/rescorer/score_collector.cpp b/src/rescorer/score_collector.cpp
index 65f43c70..ac118a6a 100644
--- a/src/rescorer/score_collector.cpp
+++ b/src/rescorer/score_collector.cpp
@@ -9,9 +9,14 @@ namespace marian {
ScoreCollector::ScoreCollector(const Ptr<Options>& options)
: nextId_(0),
- outStrm_(new io::OutputFileStream(std::cout)),
alignment_(options->get<std::string>("alignment", "")),
- alignmentThreshold_(getAlignmentThreshold(alignment_)) {}
+ alignmentThreshold_(getAlignmentThreshold(alignment_)) {
+
+ if(options->get<std::string>("output") == "stdout")
+ outStrm_.reset(new io::OutputFileStream(std::cout));
+ else
+ outStrm_.reset(new io::OutputFileStream(options->get<std::string>("output")));
+ }
void ScoreCollector::Write(long id, const std::string& message) {
std::lock_guard<std::mutex> lock(mutex_);
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 0baeeb96..69923f87 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -99,7 +99,7 @@ void Prod(marian::Tensor C,
}
void ProdBatched(marian::Tensor C,
- Ptr<Allocator> allocator,
+ Ptr<Allocator> /*allocator*/,
const marian::Tensor A,
const marian::Tensor B,
bool transA,
@@ -150,7 +150,7 @@ void ProdBatched(marian::Tensor C,
(int)ldc);
}
#else
- C; allocator; A; B; transA; transB; beta; scalar;
+ C; A; B; transA; transB; beta; scalar;
ABORT("You need to compile with MKL in order to use the CPU version");
#endif
}
diff --git a/src/training/scheduler.h b/src/training/scheduler.h
index 8fc31d70..dee62496 100755
--- a/src/training/scheduler.h
+++ b/src/training/scheduler.h
@@ -246,8 +246,7 @@ public:
} else {
if(options_->get<bool>("lr-report")) {
LOG(info,
- "Ep. {} : Up. {} : Sen. {} : Cost {:.2f} : Time {:2f}s : {:.2f} words/s : L.r. "
- "{:.4e}",
+ "Ep. {} : Up. {} : Sen. {} : Cost {:.8f} : Time {:.2f}s : {:.2f} words/s : L.r. {:.4e}",
state_->epochs,
state_->batches,
utils::withCommas(state_->samplesEpoch),
@@ -257,7 +256,7 @@ public:
state_->eta);
} else {
LOG(info,
- "Ep. {} : Up. {} : Sen. {} : Cost {:.2f} : Time {:.2f}s : {:.2f} words/s",
+ "Ep. {} : Up. {} : Sen. {} : Cost {:.8f} : Time {:.2f}s : {:.2f} words/s",
state_->epochs,
state_->batches,
utils::withCommas(state_->samplesEpoch),
diff --git a/src/translator/output_collector.cpp b/src/translator/output_collector.cpp
index c7114a56..58fba69b 100755
--- a/src/translator/output_collector.cpp
+++ b/src/translator/output_collector.cpp
@@ -6,6 +6,10 @@
namespace marian {
+OutputCollector::OutputCollector()
+ : nextId_(0),
+ printing_(new DefaultPrinting()) {}
+
OutputCollector::OutputCollector(std::string outFile)
: nextId_(0),
outStrm_(new io::OutputFileStream(std::cout)),
diff --git a/src/translator/output_collector.h b/src/translator/output_collector.h
index 154e8ded..51b47159 100755
--- a/src/translator/output_collector.h
+++ b/src/translator/output_collector.h
@@ -45,10 +45,13 @@ private:
class OutputCollector {
public:
- OutputCollector(std::string outFile = "stdout");
+ OutputCollector();
+ OutputCollector(std::string outFile);
template <class T>
- OutputCollector(T&& arg) : nextId_(0), outStrm_(new io::OutputFileStream(arg)) {}
+ OutputCollector(T&& arg)
+ : nextId_(0),
+ outStrm_(new io::OutputFileStream(arg)) {}
OutputCollector(const OutputCollector&) = delete;
diff --git a/src/translator/translator.h b/src/translator/translator.h
index cc2cbea2..9f973113 100755
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -111,19 +111,21 @@ public:
bestn.str(),
options_->get<bool>("n-best"));
}
+
+
+ // progress heartbeat for MS-internal Philly compute cluster
+ // otherwise this job may be killed prematurely if no log for 4 hrs
+ if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster
+ && id % 1000 == 0) // heartbeat once every 1000 batches
+ {
+ auto progress = 0.f; //fake progress for now
+ fprintf(stdout, "PROGRESS: %.2f%%\n", progress);
+ fflush(stdout);
+ }
};
threadPool.enqueue(task, batchId++);
- // progress heartbeat for MS-internal Philly compute cluster
- //otherwise this job may be killed prematurely if no log for 4 hrs
- if (getenv("PHILLY_JOB_ID")) // this environment variable exists when running on the cluster
- {
- auto progress = 0.f; //fake progress for now
- fprintf(stdout, "PROGRESS: %.2f%%\n", progress);
- fflush(stdout);
- }
-
}
}
};