diff options
author | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2018-12-07 00:21:25 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2018-12-07 00:21:25 +0300 |
commit | 1b2968c8b9465ce2225f304f5deea7f642f3e533 (patch) | |
tree | 3dc8fd194c28a635ac57dea9951ba74f9c21cef6 | |
parent | 9562338ff78e226caad84ac29aa0be4e8b344368 (diff) | |
parent | e78d805955a5613e91cc3f2af1db2776a6c6e3da (diff) |
Merge branch 'master' into nccl
69 files changed, 7821 insertions, 862 deletions
@@ -62,5 +62,3 @@ examples/mnist/*ubyte .vs
.vscode
-# SentencePiece is automatically downloaded when requested -src/3rd_party/sentencepiece/ diff --git a/.gitmodules b/.gitmodules index 903659e7..623b7060 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "examples"] path = examples url = https://github.com/marian-nmt/marian-examples +[submodule "src/3rd_party/sentencepiece"] + path = src/3rd_party/sentencepiece + url = https://github.com/marian-nmt/sentencepiece diff --git a/CHANGELOG.md b/CHANGELOG.md index ce5fb3a4..a2c2e48d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Fixed +- Errors due to warnings + +### Changed +- Set nearly all warnings as errors for Marian's own targets. Disable warnings for 3rd party. + +## [1.7.0] - 2018-11-27 + ### Added - Word alignment generation in scorer - Attention output generation in decoder and scorer with `--alignment soft` +- Support for SentencePiece vocabularies and run-time segmentation/desegmentation +- Support for SentencePiece vocabulary training during model training +- Group training files by filename when creating vocabularies for joint vocabularies +- Updated examples +- Synchronous multi-node training (early version) ### Fixed - Delayed output in line-by-line translation @@ -17,6 +30,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### Changed - Generated word alignments include alignments for target EOS tokens - Boost::program_options has been replaced by another CLI library +- Replace boost::file_system with Pathie - Expansion of unambiguous command-line arguments is no longer supported ## [1.6.0] - 2018-08-08 diff --git a/CMakeLists.txt b/CMakeLists.txt index 1293d39a..c585b9f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,11 @@ cmake_minimum_required(VERSION 3.5.1) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +if (POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) # CMake 3.12 +endif () + + project(marian CXX C) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -38,33 +43,33 @@ if(MSVC) set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental") else() - set(CMAKE_CXX_FLAGS " -std=c++11 -O3 -Ofast -m64 -pthread -march=${BUILD_ARCH} -msse4.1 -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas") + set(DISABLE_GLOBALLY "-Wno-unused-result") + + # These are used in src/CMakeLists.txt on a per-target basis + list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function; + -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers) + + # This warning does not exist prior to gcc 5.0 + if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0) + list(APPEND ALL_WARNINGS -Wsuggest-override) + endif() + + set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Ofast -m64 -pthread -march=${BUILD_ARCH} -msse4.1 -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC ${DISABLE_GLOBALLY}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -g -rdynamic") - set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas") + set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -g -rdynamic -O0 -pthread 
-Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas") set(CMAKE_CXX_FLAGS_SLIM "${CMAKE_CXX_FLAGS} -DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic -Wall -Wextra -Wsuggest-override -Wno-unused-value -Wno-unknown-pragmas -Wno-sign-compare -Wno-missing-field-initializers") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -g -rdynamic") set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic") set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction") set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction") -endif() + endif() # Downloading SentencePiece if requested and set to compile with it. # Requires all the dependencies imposed by SentencePiece if(USE_SENTENCEPIECE) - message(STATUS "Using SentencePiece from our fork https://github.com/marian-nmt/sentencepiece.git") - if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party/sentencepiece) - execute_process(COMMAND git clone https://github.com/marian-nmt/sentencepiece.git - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party - RESULT_VARIABLE git_result - ERROR_QUIET) - message(STATUS "Downloaded SentencePiece [code: ${git_result}]") - else() - message(STATUS "It seems that SentencePiece has already been downloaded. 
Reusing.") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SENTENCEPIECE") LIST(APPEND CUDA_NVCC_FLAGS -DUSE_SENTENCEPIECE; ) - set(EXT_LIBS ${EXT_LIBS} sentencepiece) + set(EXT_LIBS ${EXT_LIBS} sentencepiece sentencepiece_train) endif() @@ -121,6 +126,7 @@ else(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; -arch=sm_30; -gencode=arch=compute_30,code=sm_30; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_61,code=compute_61 ;) endif(CMAKE_BUILD_TYPE STREQUAL "Debug") if(NOT MSVC) + # @TODO: add warnings here too list(APPEND CUDA_NVCC_FLAGS -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) else() list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; ) @@ -1 +1 @@ -v1.6.2 +v1.7.1 diff --git a/examples b/examples -Subproject 8c6f4ef6859ef224dbc7ff891884bf7050d718c +Subproject 336740065d9c23e53e912a1befff18981d9d27a diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index faf37527..f7eabf54 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -3,9 +3,9 @@ include_directories(.) 
add_subdirectory(./yaml-cpp) add_subdirectory(./SQLiteCpp) +add_subdirectory(./pathie-cpp) if(USE_SENTENCEPIECE) - if(USE_STATIC_LIBS) set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) if(WIN32) @@ -29,9 +29,8 @@ if(USE_SENTENCEPIECE) if(USE_STATIC_LIBS) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) endif() - endif(USE_SENTENCEPIECE) include_directories(./SQLiteCpp/include) include_directories(./CLI) - +include_directories(./pathie-cpp/include) diff --git a/src/3rd_party/pathie-cpp/CHANGELOG b/src/3rd_party/pathie-cpp/CHANGELOG new file mode 100644 index 00000000..52942338 --- /dev/null +++ b/src/3rd_party/pathie-cpp/CHANGELOG @@ -0,0 +1,52 @@ +-- Version 0.1.0 (2017-10-28) -- + +* Add Pathie::Tempdir and Pathie::Tempfile classes for + creating temporary directories and files. +* Add Pathie::entry_iterator, Path::begin_entries(), and + Path::end_entries(). These allow you to use real C++ + iterators for working with directory entries. +* Change Path::find() to take a callback instead of std::function to + make it compile under C++98. +* Add Path::operator/=. +* Add `const' qualifier to Path::fopen() and Path::touch() as these + methods leave the path itself unchanged. +* Add C++98 compatibility (instead of just C++11). +* Mark stream replacements as experimental. They are mostly untested + and I don't really use them. +* Add PATHIE_BUILD_STREAM_REPLACEMENTS for building the stream + replacements. +* Rename build option ASSUME_UTF8_ON_UNIX to + PATHIE_ASSUME_UTF8_ON_UNIX. +* Switch license from GPL3 to BSD-2clause. +* Drop support for expanding "~username/foo" constructs. This + nonstandard extension was unportable and caused problems when + linking Pathie statically. +* Add Path::utf8_str() method. +* Restructure header #include order. Pathie now requires you + to specify the exact header to include (e.g. <pathie/path.hpp>) + instead of one global header. 
There was no point in having the + stream replacements included if not required. +* Fix compilation problem with _PATHIE_UNIX not being defined +* Do not include <windows.h> in Pathie public headers. This caused + problems in some circumstances when a certain macro combination + of windows.h was needed. +* Remove config.hpp. This caused confusion when the library was used. + Build configuration now only happens via command-line options. +* Drop shaky support for NTFS symlinks. It never worked really well + anyway. + +-- Version 0.0.3 (2015-04-30) -- + +* Don't use CMake's global configuration variables, allowing pathie to + be built as a subproject. +* Fix compilation error on systems that do not automatically + #include <stdexcept>. + +-- Version 0.0.2 (2015-02-16) -- + +* Fix installation error on config.hpp +* Add message that C++11 is required for compilation + +-- Version 0.0.1 (2015-02-13) -- + +First public release. diff --git a/src/3rd_party/pathie-cpp/CMakeLists.txt b/src/3rd_party/pathie-cpp/CMakeLists.txt new file mode 100644 index 00000000..db5744f5 --- /dev/null +++ b/src/3rd_party/pathie-cpp/CMakeLists.txt @@ -0,0 +1,8 @@ +include_directories(..) +include_directories(.) +include_directories(include) + +FILE(GLOB PathieCppSources src/*.cpp) +if (NOT TARGET pathie-cpp) + add_library(pathie-cpp OBJECT ${PathieCppSources}) +endif() diff --git a/src/3rd_party/pathie-cpp/LICENSE b/src/3rd_party/pathie-cpp/LICENSE new file mode 100644 index 00000000..f74dec43 --- /dev/null +++ b/src/3rd_party/pathie-cpp/LICENSE @@ -0,0 +1,24 @@ +Copyright © 2015, 2017 Marvin Gülker + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +“AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/3rd_party/pathie-cpp/README.md b/src/3rd_party/pathie-cpp/README.md new file mode 100644 index 00000000..80b68770 --- /dev/null +++ b/src/3rd_party/pathie-cpp/README.md @@ -0,0 +1,359 @@ +PATHIE. +======= + +This is the Pathie project. It aims to provide a C++ library that covers +all needs of pathname manipulation and filename fiddling, without +having to worry about the underlying platform. That is, it is a glue +library that allows you to create platform-independent filename +handling code with special regard to Unicode path names. + +Supported systems +----------------- + +Currently supported platforms are Linux and Windows, the latter via +MSYS2 GCC. Any other compiler or system might or might not work. Mac +OS should work as well, but I cannot test this due to lack of a Mac. I +gladly accept contributions for any system or compiler. + +Pathie's source code itself is written conforming to C++98. On UNIX +systems, it assumes the system supports POSIX.1-2001. 
On Windows +systems, the minimum supported Windows version is Windows Vista. + +Installation +------------ + +See INSTALL.md. + +The library +----------- + +The entire world is using UTF-8 as the primary Unicode encoding. The +entire world? No, a little company from Redmond resists the temptation +and instead uses UTF-16LE, causing cross-platform handling of Unicode +paths to be a nightmare. + +One of the main problems the author ran into was compiler-dependent +code that was not marked as such. Many sites on the Internet claim +Unicode path handling on Windows is easy, but in fact, it only is if +you define “development for Windows” as “development with MSVC”, +Microsoft’s proprietary C/C++ compiler, which provides nonstandard +interfaces to allow for handling UTF-16LE filenames. The Pathie +library has been developed with a focus on MinGW and crosscompilation +from Linux to Windows and thus does not suffer from this problem. + +The Pathie library has been developed to release the programmer from +the burden of handling the different encodings in use for filenames, +and does so by focusing its API on UTF-8 regardless of the platform in +use. Thus, if you use UTF-8 as your preferred encoding inside your +program (take a look at the [UTF8 Everywhere +website](http://www.utf8everywhere.org) for reasons why you should do +that), Pathie will be of the most use for you, since it transparently +converts whatever filesystem encoding is encountered to UTF-8 in its +public interface. Likewise, any pathname you pass to the library is +assumed to be UTF-8 and is transcoded transparently to the filesystem +encoding before invoking the respective OS' filesystem access +methods. Of course, explicit conversion functions are also provided, +in case you do need a string in the native encoding or need to +construct a path from a string in the native encoding. 
+ +General Usage +------------- + +First thing is to include the main header: + +~~~~~~~~~~~~~~~~~~{.cpp} +#include <pathie/path.hpp> +~~~~~~~~~~~~~~~~~~ + +Now consider the simple task to get all children of a directory, which +have Unicode filenames. Doing that manually will result in you having +to convert between UTF-8 and UTF-16 all the time. With pathie, you can +just do this: + +~~~~~~~~~~~~~~~~~~~{.cpp} +std::vector<Pathie::Path> children = your_path.children(); +~~~~~~~~~~~~~~~~~~~ + +Done. Retrieving the parent directory of your directory is pretty easy: + +~~~~~~~~~~~~~~~~~~~{.cpp} +Pathie::Path yourpath("foo/bar/baz"); +Pathie::Path parent = yourpath.parent(); +~~~~~~~~~~~~~~~~~~~ + +But Pathie is much more than just an abstraction of different filepath +encodings. It is a utility library for pathname manipulation, i.e. it +allows you to do things like finding the parent directory, expanding +relative to absolute paths, decomposing a filename into basename, +dirname, and extension, and so on. See the documentation of the +central Pathie::Path class on what you can do. + +~~~~~~~~~~~~~~~~~~~~~~{.cpp} +// Assume current directory is /tmp +Pathie::Path p("foo/bar/../baz"); +p.expand(); // => /tmp/foo/baz +~~~~~~~~~~~~~~~~~~~~~~ + +Or my personal favourite: + +~~~~~~~~~~~~~~~~~~~{.cpp} +Pathie::Path p1("/tmp/foo/bar"); +Pathie::Path p2("/tmp/bar/foo"); +Pathie::Path p3 = p1.relative(p2); // => ../../foo/bar +~~~~~~~~~~~~~~~~~~~ + +It also provides you with commonly used paths like the user’s +configuration directory or the path to the running executable. + +~~~~~~~~~~~~~~~~~~~~{.cpp} +Pathie::Path configdir = Pathie::Path::config_dir(); +Pathie::Path exepath = Pathie::Path::exe(); +~~~~~~~~~~~~~~~~~~~~ + +Pathie assumes that all string arguments passed are in UTF-8 and +transparently converts to the native filesystem encoding internally. 
+ +Still, if you interface directly with the Windows API or other external +libraries, you might want to retrieve the native representation from a +Path or construct a Path from the native representation. Pathie +doesn’t want to be in your way then. The following example constructs +from and converts to the native representation on Windows, which is +UTF-16LE: + +~~~~~~~~~~~~~~~~~~~~{.cpp} +// Contruct from native +wchar_t* utf16 = Win32ApiCall(); +Path mypath = Path::from_native(utf16); // also accepts std::wstring + +// Retrieve native (Note C++’ish std::wstring rather than +// raw wchar_t* on Windows) +std::wstring native_utf16 = mypath.native(); +~~~~~~~~~~~~~~~~~~~~ + +On UNIX, these methods work with normal strings (std::string instead +of std::wstring) in the underlying filesystem encoding. In most cases, +that will be UTF-8, but some legacy systems may still use something +like ISO-8859-1 in which case that will differ. + +### Temporary files and directories + +There are two classes `Pathie::Tempdir` and `Pathie::Tempfile` that +you can use if you need to work with temporary files or directories, +respectively. Constructing instances of these classes creates a +temporary entry, which is removed (recursively in case of directories) +when the instance is destroyed again. Use TempEntry::path() to get +access to the Path instance pointing to the created entry. + +~~~~~~~~~~~~~~~~~~~~{.cpp} +#include <pathie/tempdir.hpp> + +//... + +{ + srand(time(NULL)); // Needs random number generator + Pathie::Tempdir tmpdir("foo"); // Pass a fragment to use as part of filename + std::cout << "Temporary dir is: " << tmpdir.path() << std::endl; +} +// When `tmpdir' is destroyed, the destructor recursively +// deletes the directory that was created. 
+~~~~~~~~~~~~~~~~~~~~ + +### Opening a file with a Unicode path name + +On Windows with GCC, it is [not possible to open a file with Unicode +pathname](https://stackoverflow.com/questions/821873) via C++'s usual +`std::ifstream` and `std::ofstream` mechanism. There's a nonstandard +extension provided by Microsoft's proprietary compiler that does this, +but GCC does not have this extension. Consequently, code that is +intended to compile on GCC (like Pathie) has to avoid it. + +There *is* however a function in the Win32API that allows to open a +file with a Unicode pathname *and* that returns a standard C `FILE*` +handle, +[_wfopen()](http://msdn.microsoft.com/en-us/library/yeby3zcb.aspx). The +method Path::fopen() uses this function on Windows and a regular C +`fopen()` on all other platforms, thus allowing you to just deal with +your Unicode filename via the regular C I/O interface. If you urgently +need C++ I/O streams, read on. + +### Stream replacements + +Pathie mainly provides you with the means to handle paths, compose, +and decompose them. There is an experimental feature however that +provides replacements for C++ file streams that work with instances of +Pathie::Path instead of strings for opening a file. These replacements +are neither elegant nor portable, because they don't nicely honour the +template concept the STL is based on by directly subclassing the +standard streams in the matter needed most frequently and additionally +relying on vendor-specific details. For GCC, an internal (but at least +documented) interface is used to exchange the file descriptor inside a +stream, and for MSVC, a nonstandard (but documented) constructor is +used. Other compilers are not supported by this feature (which most +notably affects clang, where I have no idea on the interfaces I need +to use for such a trick). + +In one word, these replacements are hacky and I consider them +experimental. 
If that does not strike you as problematic, you can +enable this feature by passing `-DPATHIE_BUILD_STREAM_REPLACEMENTS=ON` +when invoking `cmake` during the build process. + +In order to use the replacements, include the respective header +(either `pathie_ifstream` or `pathie_ofstream`) and use the +`Pathie::ifstream` and `Pathie::ofstream` classes just like you would +use `std::ifstream` and `std::ofstream`, with the only difference +being that you construct them from a Pathie::Path instance instead of +a string. See the documentation of Pathie::ofstream for more +information. + +~~~~~~~~~~~~~~~~~{.cpp} +#include <pathie/pathie_ofstream> + +// ... + +Pathie::Path p("Bärenstark.txt"); +Pathie::ofstream file(p); +file << "Some content" << std::endl; +file.close() +~~~~~~~~~~~~~~~~~ + +There's also the unofficial +[boost::nowide](http://cppcms.com/files/nowide/html/), which is +similar to this feature and maybe more reliable. It has [recently been +accepted into +boost](https://lists.boost.org/boost-announce/2017/06/0516.php). + +Dependencies and linking +------------------------ + +Pathie is standalone, that is, it requires no other libraries except +for those provided by your operating system. Note that there’s a +caveat with this on Windows, which does provide the `Shlwapi` library +by default, but MinGW's GCC does not automatically link it in. Be sure +to link to this library explicitly when compiling for MinGW Windows +by appending `-lShlwapi` to the end of your linking command line. + +It is recommended to link in pathie as a dynamic library, because +there are some problems with it when linked statically on certain +operating systems (see _Caveats_ below). If you are sure you aren’t +affected by those problems, it is possible to link in pathie +statically. 
+ +Caveats +------- + +This library assumes that under all UNIX systems out there (I also +consider Mac OSX to be a UNIX system) the file system root always is +`/` and the directory separator also always is `/`. This structure is +mandatory as per POSIX -- in POSIX.1-2008, it’s specified in section +10.1. Systems which do neither follow POSIX directory structure, nor +are Windows, are unsupported. + +On POSIX-compliant systems other than Mac OS X, the filesystem +encoding [generally is +unspecified](https://unix.stackexchange.com/questions/2089/what-charset-encoding-is-used-for-filenames-and-paths-on-linux). +Pathnames are merely byte blobs which do not contain NUL bytes, and +components are separated by `/`. It’s up to the applications, +including utilities like a shell or the ls(1) program, to make +something of those byte streams. Therefore, it is perfectly possible +that on one system, user A uses ISO-8859-1 filenames and user B uses +UTF-8 filenames. Even the same user could use differently encoded +filenames. Programs that have to interpret the byte blobs in pathnames +on these systems look at the locale environment variables, namely +`LANG` and `LC_ALL`, see section 7 of POSIX.1-2008. As a consequence, +it may happen you want to create filenames with characters not +supported in the user’s pathname encoding. For example, if you want to +create a file with a hebrew filename and the user’s pathname encoding +is ISO-8859-1, there’s a problem, because ISO-8859-1 has no hebrew +characters in it, but in UTF-8, which is the encoding you are advised +to use and which is what Pathie’s API expects from you, they are +available. There is no sensible solution to this problem that the +Pathie library could dictate; the `iconv()` function used by pathie +just replaces characters that are unavailable in the target encoding +with a system-defined default (probably “?”). 
Note that on systems +which have a Unicode pathname encoding, especially modern Linuxes with +UTF-8, such a situation can’t ever arise, because the Unicode +encodings (UTF-*) cover all characters you can ever use. + +At least on FreeBSD, calling the POSIX `iconv()` function fails with +the cryptic error message “Service unavailable” if a program is linked +statically. I’ve reported [a bug on +this](https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=196567). This +means that you currently can’t link in pathie statically on FreeBSD +and systems which don’t allow statically linked executables to call +`iconv()`. + +On Linux systems, it is recommended to set your program’s locale to the +environment’s locale before you call any functions the Pathie library +provides, because this will allow Pathie to use the correct encoding +for filenames. This is relevant where the environment’s encoding is +not UTF-8, e.g. with $LANG set to `de_DE.ISO-8859-1`. You can do this +as follows (the `""` locale always refers to the locale of the +environment): + +~~~~~~~~~~~~~~~~~~~~~{.cpp} +#include <locale> +std::locale::global(std::locale("")); +~~~~~~~~~~~~~~~~~~~~~ + +This is not required on Windows nor on Mac OS X, because these +operating systems always use UTF-16LE (Windows) or UTF-8 (Mac OS X) as +the filesystem encoding, regardless of the user's locale. It however +does not hurt to call this either, it simply makes no difference for +Pathie on these systems. If you urgently need to avoid this call on +Linux, you need to compile pathie with the special build option +PATHIE_ASSUME_UTF8_ON_UNIX, which will force Pathie to assume that +UTF-8 is used as the filesystem encoding under any UNIX-based system. + +Links +----- + +* Project page: https://www.guelkerdev.de/projects/pathie/ +* GitHub mirror: https://github.com/Quintus/pathie-cpp +* Issue tracker: https://github.com/Quintus/pathie-cpp/issues + +Contributing +------------ + +Feel free to submit any contributions you deem useful. 
Try to make +separate branches for your new features, give a description on what +you changed, etc. + +Don’t you duplicate boost::filesystem? +------------------------------------- + +Yes and +no. [boost::filesystem](http://www.boost.org/doc/libs/1_56_0/libs/filesystem/doc/index.htm) +provides many methods pathie provides, but has a major problem with +Unicode path handling if you are not willing to do the UTF-8/UTF-16 +conversion manually. boost::filesystem always uses UTF-8 to store the +paths on UNIX, and, which is the problem, always uses UTF-16LE to +store the paths on a Windows system. There is no way to override +this, although there is a [hidden documentation +page](http://www.boost.org/doc/libs/1_51_0/libs/locale/doc/html/default_encoding_under_windows.html) +that claims to solve the problem. I have wasted a great amount of time +to persuade boost::filesystem to automatically convert all +`std::string` input it receives into UTF-16LE, but failed to +succeed. Each time I wanted to create a file with a Unicode filename, +the test failed on Windows by producing garbage filenames. Finally I +found out that the neat trick shown in the documentation above indeed +does work -- but only if you use the Microsoft Visual C++ compiler +(MSVC) to compile your code. I don’t, I generally use g++ via the +[MinGW](http://www.mingw.org) toolchain. boost::filesystem fails with +g++ via MinGW with regard to Unicode filenames on Windows as of this +writing (September 2014). + +Apart from that, pathie provides some additional methods, especially +with regard to finding out where the user’s paths are. It is modelled +after Ruby’s popular +[Pathname](http://ruby-doc.org/stdlib-2.1.2/libdoc/pathname/rdoc/Pathname.html#method-i-rmtree) +class, but it doesn’t entirely duplicate its interface (which wouldn’t +be idiomatic C++). + +Also, pathie is a small library. Adding it to your project shouldn’t +hurt too much, while boost::filesystem is quite a large dependency. 
+ +License +------- + +Pathie is BSD-licensed; see the file “LICENSE” for the exact license +conditions. diff --git a/src/3rd_party/pathie-cpp/include/entry_iterator.hpp b/src/3rd_party/pathie-cpp/include/entry_iterator.hpp new file mode 100644 index 00000000..85a53b1f --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/entry_iterator.hpp @@ -0,0 +1,119 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PATHIE_ENTRY_ITERATOR_HPP +#define PATHIE_ENTRY_ITERATOR_HPP +#include <iterator> + +namespace Pathie { + + class Path; + + /** + * An iterator class for reading the entries in a directory. 
+ * Note that the entries of a directory always include the + * "." (current directory) and ".." (parent directory) entries + * unresolved, and that the order in which the entries in the + * directory are returned is undefined (actually, the order + * depends on the filesystem used). + * + * The iterators of this class are always const. You cannot change + * the values referenced. + * + * It is unspecified behaviour what happens if a directory entry is + * added or removed to/from the directory while you are iterating + * it. Thus, keep iterations short in time. + * + * Instances of this class wrap an ephemeral handle like for example + * a directory descriptor on Linux. This handle is not copiable, + * which should normally mean that instances of this class cannot be + * copied. However, the `std::iterator` interface mandates that + * iterator instances are copiable (see "Requirements" here: + * <http://en.cppreference.com/w/cpp/concept/Iterator>) and in fact + * the language copies iterators all the time if you use them for + * example in a for loop. Consequently, this class implements the + * copy constructor and the copy assignment. However, these operations + * do *not* actually copy the instance, but instead *move* the content + * from the source instance to the target instance. The source instance + * is afterwards unusable and looks like a finished iterator. The + * `const` qualifiers in the copy operations are explicitly cast + * away inside the functions to allow this, so they don't mean anything + * for them. This works fairly nice for the ordinary use case (where + * the language creates implicit copies), but the API may look as if + * copying instances is allowed. It is not. *Do not copy* instances of + * this class even though it looks as if it's possible. Implicit + * copies automatically done by C++ as in for loops are okay, but + * that's it. 
That is, you *can* do this: + * + * ~~~~{.cpp} + * entry_iterator iter; + * for(iter=my_path.begin_entries(); iter != my_path.end_entries(); iter++) { + * // Work with iter... + * } + * ~~~~ + * + * But you *cannot* do this: + * + * ~~~~{.cpp} + * entry_iterator iter=my_path.begin_entries(); + * entry_iterator iter2(iter); + * ~~~~ + * + * This example does compile, but `iter` will be unusable after + * `iter2` has been constructed. + */ + class entry_iterator: public std::iterator<std::input_iterator_tag, Path, int> + { + public: + entry_iterator(); + entry_iterator(const Path* p_top); + ~entry_iterator(); + entry_iterator& operator=(const Path* p_top); // Restart assignment + operator bool() const; + bool operator==(const entry_iterator& other) const; + bool operator!=(const entry_iterator& other) const; + entry_iterator& operator++(int); + entry_iterator& operator++(); + const Path& operator*() const; + const Path* operator->() const; + + // "Copy" operations that really move the content, see class docs + entry_iterator(const entry_iterator& other); + entry_iterator& operator=(const entry_iterator& other); + private: + void open_native_handle(); + void close_native_handle(); + + const Path* mp_directory; ///< Path requested to read from. + void* mp_cur; ///< Native handle to the opened directory. + Path* mp_cur_path; ///< Path instance of the path pointed to by mp_cur (only a pointer to allow forward-declaration of Path). + }; +} + +#endif /* PATHIE_ENTRY_ITERATOR_HPP */ diff --git a/src/3rd_party/pathie-cpp/include/errors.hpp b/src/3rd_party/pathie-cpp/include/errors.hpp new file mode 100644 index 00000000..d79fb3c3 --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/errors.hpp @@ -0,0 +1,119 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. 
+ * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PATHIE_ERRORS_HPP +#define PATHIE_ERRORS_HPP +#include <exception> +#include <string> +#include <cstdlib> + +/* DWORD is typedef'ed from unsigned long, see + * <https://msdn.microsoft.com/en-us/library/cc230318.aspx> + * HRESULT is typedef'ed from LONG, which in turn is a typedef + * of long, see <https://msdn.microsoft.com/en-us/library/cc230330.aspx>. + * I spell the types out here in this header to avoid having to + * include windows.h, which might interfer with programmes using + * pathie that want to include windows.h on itself. 
*/ + +#include "pathie.hpp" + +namespace Pathie { + + /// Base class for all exceptions in this library. + class PathieError: public std::exception { + public: + PathieError(); ///< Constructs a new instance. + PathieError(std::string message); ///< Constructs a new instance with the given what() message. + virtual ~PathieError() throw(); + + virtual const char* what() const throw(); ///< The error message. + protected: + std::string m_pathie_errmsg; ///< The error message given in the constructor. + }; + + + /// This exception is thrown when a call to a C/system function results + /// in `errno` being set. + class ErrnoError: public PathieError { + public: + ErrnoError(int val); ///< Constructs a new instance from the given `errno` value. + virtual ~ErrnoError() throw(); + + inline int get_val(){return m_val;} ///< The `errno` value. + private: + int m_val; + }; + +#ifdef _WIN32 + + /// This exception is thrown only on Windows, when a call to the Win32API + /// fails. + /// The "unsigned long" type here is actually DWORD (which it is a + /// typedef of in Win32). + class WindowsError: public PathieError { + public: + WindowsError(unsigned long val); ///< Constructs a new instance from the given GetLastError() value. + virtual ~WindowsError() throw(); + + inline int get_val(){return m_val;} ///< The GetLastError() value. + private: + unsigned long m_val; + }; + + /// Similar to WindowsError, this exception is thrown when a HANDLE function + /// from the Win32API fails. + /// The "long" type here is actually HRESULT (which it is a typedef of in Win32). + class WindowsHresultError: public PathieError { + public: + WindowsHresultError(long value); ///< Constructs a new instance from the given handle function result. + virtual ~WindowsHresultError() throw(); + + inline long get_val(){return m_val;} ///< The handle function result.
+ private: + int m_val; + }; +#endif + +#ifdef _PATHIE_UNIX + + /// This exception is thrown only on UNIX, when a call to the POSIX glob(3) + /// function fails. + class GlobError: public PathieError { + public: + GlobError(int val); ///< Contructs a new instance from the given glob(3) error code. + virtual ~GlobError() throw(); + + inline int get_val(){return m_val;} ///< The glob(3) error code. + private: + int m_val; + }; +#endif + +} +#endif diff --git a/src/3rd_party/pathie-cpp/include/path.hpp b/src/3rd_party/pathie-cpp/include/path.hpp new file mode 100644 index 00000000..90729709 --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/path.hpp @@ -0,0 +1,377 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PATHIE_PATH_HPP +#define PATHIE_PATH_HPP +#include <string> +#include <iostream> +#include <vector> +#include <sys/stat.h> + +#include "pathie.hpp" +#include "entry_iterator.hpp" + +namespace Pathie { + + // Forward-declare, defined in pathie.cpp. +#if defined(_WIN32) + std::string utf16_to_utf8(std::wstring); + std::wstring utf8_to_utf16(std::string); +#elif defined(_PATHIE_UNIX) + std::string utf8_to_filename(const std::string& utf8); + std::string filename_to_utf8(const std::string& native_filename); +#endif + + /** + * \brief Main class, describing paths. + * + * This class represents a single path on the filesystem. + * The path does not have to exist, but this class provides + * you with means to create it. + * + * Note on predefined directories + * ------------------------------ + * + * This class provides a lot of methods for retrieving information about + * system and user predefined directories. Note however that the + * referenced directories may or may not exist. + * + * See the pathlist.md document for an overview of possible path + * return values. + * + * Note on XDG directories on UNIX + * ------------------------------- + * + * Nowadays UNIX systems have adapted the Freedesktop.org + * XDG standards, and it is highly recommended to follow them + * when you write an application that stores user-specific data. 
+ * XDG directories fall in two groups: Core data directories, covered + * by the main XDG specification, and user-dir directories, described + * in the documentation of the XDG user-dirs software. Directories of + * the first group are available today on all Linux systems, examples + * for them are ~/.config, ~/.local/share, and others. Directories + * of the latter group are typically found on desktop systems and + * are missing on servers, examples include ~/Documents and ~/Downloads. + * + * The following XDG specifications are followed: + * + * * XDG main specification: http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html + * * XDG user-dirs specification: http://www.freedesktop.org/wiki/Software/xdg-user-dirs/ + * + * Pathie is not a Shellscript parser, so it will fail if your XDG configuration + * files do not follow the usually found format. Especially no other variable + * substitution except from exactly one $HOME is understood. + * + * While the XDG specification for the core directories clearly says + * which directory to use if the administrator/user has not specified + * in his system configuration (by setting the appropriate environment + * variables), the user-dirs isn’t that easy. Or rather, it is, but not + * all desktop environments bother to follow it. The XDG user-dirs spec + * requires a file `~/.config/user-dirs.dirs` to exist, generated by the + * program xdg-user-dirs-update(1), which is run by all major desktop + * environments. Smaller ones don’t always do that, resulting in the file + * missing. The spec leaves open what should happen in such a case, i.e. + * it’s implementation-defined behaviour. I have chosen to return the + * user’s $HOME directory in such a case.
The methods affected by this + * decision are the following ones: + * + * * documents_dir() + * * download_dir() + * * music_dir() + * * pictures_dir() + * * publicshare_dir() + * * templates_dir() + * * videos_dir() + * + * Other notes + * ----------- + * + * On UNIX, this library follows the Filesystem Hierarchy Standard, + * version 2.3 (http://refspecs.linuxfoundation.org/FHS_2.3/fhs-2.3.html). + * + * On UNIX, the FHS defines a "normal" file hierarchy and a "local" one; for + * example, /usr/share is part of the "normal" file hierarchy, which is mirrored + * to the "local" one in /usr/local/share. The "local" hierarchy is intended to be + * used by programs that the system administrator manually installed without resorting + * to the system’s default package manager. Such a difference does not exist on Windows. + * Pathie allows you to decide yourself which information you want to query when calling + * one of the following functions: + * + * * global_mutable_data_dir() + * * global_immutable_data_dir() + * * global_config_dir() + * * global_cache_dir() + * * global_appentries_dir() + * + * Each of these functions takes an argument that allows you to specify whether + * you want the "local" or the "normal" hierarchy’s paths returned. The argument + * however is optional, and you can use the set_global_dir_default() method to + * specify what should happen if no argument is specified. By default, paths of + * the "local" hierarchy are returned.
For example: + * + * ~~~~~~~~~~~~~~~~~~~ c++ + * Path p1 = Path::global_immutable_data_dir(); // /usr/local/share + * Path p2 = Path::global_immutable_data_dir(Path::LOCALPATH_NORMAL); // /usr/share + * Path p3 = Path::global_immutable_data_dir(Path::LOCALPATH_LOCAL); // /usr/local/share + * + * Path::set_global_dir_default(Path::LOCALPATH_NORMAL); + * Path p4 = Path::global_immutable_data_dir(); // /usr/share + * Path p5 = Path::global_immutable_data_dir(Path::LOCALPATH_LOCAL); // /usr/local/share + * Path p6 = Path::global_immutable_data_dir(Path::LOCALPATH_NORMAL); // /usr/share + * ~~~~~~~~~~~~~~~~~~~ + * + * As you can see, the argument, if given, always takes precedence over the + * default set with set_global_dir_default(). + */ + class Path + { + public: + + /** + * Specifies the argument type for the `global_*_dir()` functions. + * `LOCALPATH_DEFAULT` means fall back to the default set with `set_global_dir_default()`, + * `LOCALPATH_NORMAL` means to use the normal FHS paths, and `LOCALPATH_LOCAL` means to use + * the paths the FHS specifies for local additions. + */ + enum localpathtype { + LOCALPATH_DEFAULT = 1, + LOCALPATH_NORMAL, + LOCALPATH_LOCAL + }; + + /// Default constructor. + Path(); + /// Copy constructor. + Path(const Path& path); + /// Construct a path from a string. + Path(std::string path); + /// Construct a path from components. + Path(const std::vector<Path>& components); + +#if defined(_PATHIE_UNIX) + static inline Path from_native(const std::string& native_filename) + { return Path(filename_to_utf8(native_filename)); } +#elif defined(_WIN32) + /** Convert a path that is in the native representation of + * the system into a Path instance. The argument will be + * transcoded from the system’s native encoding to UTF-8; + * on Windows, the argument is expected to be UTF-16LE therefore, + * while on UNIX, it is expected to be encoded in the environment’s + * locale. 
*/ + static inline Path from_native(const std::wstring& native_filename) + { return Path(utf16_to_utf8(native_filename)); } +#else +#error Unsupported system. +#endif + + /// Returns the current working directory. + static Path pwd(); + /// Returns the path to the running executable. + static Path exe(); + /// Returns the home directory. + static Path home(); + + static Path data_dir(); ///< Directory for permanent user data + static Path config_dir(); ///< Directory for permanent user configuration files + static Path cache_dir(); ///< Directory for cached user data + static Path runtime_dir(); ///< Directory for volatile information + static Path temp_dir(); ///< Directory for temporary data + static Path desktop_dir(); ///< User’s desktop directory + static Path documents_dir(); ///< User’s documents directory + static Path download_dir(); ///< User’s download directory + static Path music_dir(); ///< User’s music directory + static Path pictures_dir(); ///< User’s pictures directory + static Path publicshare_dir(); ///< User’s networking directory + static Path templates_dir(); ///< User’s document templates directory + static Path videos_dir(); ///< User’s video directory + static Path appentries_dir(); ///< User’s application starters directory + + static Path global_mutable_data_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for mutable permanent data + static Path global_immutable_data_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for immutable permanent data + static Path global_config_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for configuration files + static Path global_cache_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for cached data + static Path global_runtime_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global directory for volatile information + static Path global_appentries_dir(localpathtype local = LOCALPATH_DEFAULT); ///< Global application starters
directory + static Path global_programs_dir(); ///< Global directory for self-contained programs + + static Path mktmpdir(const std::string& name = "tmpd"); ///< Create a temporary directory + + static inline void set_global_dir_default(localpathtype localdefault){ c_localdefault = localdefault; } ///< Specify what to do for the `global_*_dir()` methods if no argument is passed to them. + static inline localpathtype get_global_dir_default(){ return c_localdefault; } ///< Returns what was set with set_global_dir_default(). + +#ifdef _PATHIE_UNIX + static std::vector<Path> data_dirs(); + static std::vector<Path> config_dirs(); +#endif + + /// Shell-like glob. + static std::vector<Path> glob(const std::string& pattern, int flags = 0); + /// Traverse directory recursively. + void find(bool (*cb)(const Path& entry)) const; + + /// Return the path as a raw std::string. + std::string str() const; + /// Alias for str(). + std::string utf8_str() const; + /// Assign the given string to the underlying path. + void assign(std::string str); + +#if defined(_PATHIE_UNIX) + std::string native() const; +#elif defined(_WIN32) + /// Return the path in the native format. + std::wstring native() const; +#else +#error Unsupported system. +#endif + + void swap(Path& path) throw(); + + /// Number of components in the path string. + size_t component_count() const; + /// Burst path into components. + std::vector<Path> burst(bool descend = false) const; + /// Shell-like globbing. + std::vector<Path> dglob(const std::string& pattern, int flags = 0) const; + /// Glob pattern check without filesystem access. + bool fnmatch(const std::string& pattern, int flags = 0) const; + + Path& operator=(const Path& path); + Path& operator=(const std::string& str); + /// Access single component in the path.
+ Path operator[](size_t index) const; + bool operator==(const Path& path) const; + bool operator!=(const Path& path) const; + bool operator<(const Path& path) const; + bool operator>(const Path& path) const; + bool operator<=(const Path& path) const; + bool operator>=(const Path& path) const; + + Path operator/(Path path) const; + Path operator/(std::string str) const; + Path& operator/=(Path path); + Path& operator/=(std::string str); + Path join(Path path) const; + Path join(std::string path) const; + Path sub_ext(std::string new_extension) const; + + /// Platform-independent C fopen(). + FILE* fopen(const char* mode) const; + /// Update modification and access time to now. + void touch() const; + + bool is_absolute() const; ///< Checks if a path is absolute. + bool is_relative() const; ///< Checks if a path is relative. + bool is_root() const; ///< Checks if a path is the file system root. + + /// Remove all . and .. occurrences. + Path prune() const; + /// Creates an absolute path for this path. + Path absolute(const Path& base = Path::pwd()) const; + /// Creates a relative path from an absolute one. + Path relative(Path base) const; + /// Expands all shortcuts plus create an absolute path for this path. + Path expand() const; + /// Get the one real path for this path. + Path real() const; + + Path parent() const; + Path root() const; + Path basename() const; + Path dirname() const; + std::string extension() const; + void split(Path& dirname, Path& basename) const; + + /// C stat information. +#if defined(_PATHIE_UNIX) + struct stat* stat() const; +#elif defined(_WIN32) + struct _stat* stat() const; +#else +#error Unsupported system. +#endif + + /// File size. + long size() const; + time_t atime() const; + time_t mtime() const; + time_t ctime() const; + + /// List of entries. + std::vector<Path> entries() const; + /// List of children.
+ std::vector<Path> children() const; + + bool exists() const; + bool is_directory() const; + bool is_file() const; + bool is_symlink() const; + + Path readlink() const; + /// Create a symbolic link. + void make_symlink(const Path& target) const; + void mkdir() const; + void rmdir() const; + void unlink() const; + void remove() const; + /// "mkdir -p"-like functionality. + void mktree() const; + /// "rm -r"-like functionality. + void rmtree() const; + /// Change file names. + void rename(Path& newname) const; + + entry_iterator begin_entries() const; + entry_iterator end_entries() const; + + private: + static std::string make_tempname(const std::string& namepart); + // Remove double // and trailing /, replace \ with /. + void sanitize(); + +#if defined(_PATHIE_UNIX) + static Path get_xdg_dir(const std::string& envvarname, const std::string& defaultpath); + static std::vector<Path> get_xdg_dirlist(const std::string& envvarname, const std::string& defaultlist); + static std::string get_xdg_userdir_setting(const std::string& setting); + static std::string get_home(std::string username); +#elif defined(_WIN32) + bool is_ntfs_symlink(const wchar_t* path) const; + wchar_t* read_ntfs_symlink(const wchar_t* path) const; +#endif + + static localpathtype c_localdefault; + std::string m_path; + }; + +} + +/// std::cout compatibility. +std::ostream& operator<<(std::ostream& stream, const Pathie::Path& p); + +#endif diff --git a/src/3rd_party/pathie-cpp/include/pathie.hpp b/src/3rd_party/pathie-cpp/include/pathie.hpp new file mode 100644 index 00000000..6afbf5b0 --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/pathie.hpp @@ -0,0 +1,67 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PATHIE_PATHIE_HPP +#define PATHIE_PATHIE_HPP +#if __cplusplus < 199711L +#error Pathie requires C++98 support. Please use an option such as -std=c++98 to enable it. +#endif + +#if !defined(_PATHIE_UNIX) && (defined(unix) || defined(__unix__) || defined(__unix) || defined(__APPLE__) || defined(BSD)) +#define _PATHIE_UNIX +#endif + +#include <string> + +/// Namespace for this library. +namespace Pathie { + + /// Returns the version number as MAJOR.MINOR.TINY. + std::string version(); + + /** + * Returns the Git commit this was built from. + * Empty string if built without Git.
+ */ + std::string gitrevision(); + +#ifdef _WIN32 + std::string utf16_to_utf8(std::wstring); + std::wstring utf8_to_utf16(std::string); +#endif + +#ifdef _PATHIE_UNIX + std::string utf8_to_filename(const std::string& utf8); + std::string filename_to_utf8(const std::string& native_filename); + std::string convert_encodings(const char* from_encoding, const char* to_encoding, const std::string& string); +#endif + +} + +#endif diff --git a/src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp b/src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp new file mode 100644 index 00000000..c5736b37 --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/pathie_ifstream.hpp @@ -0,0 +1,111 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PATHIE_IFSTREAM_HPP +#define PATHIE_IFSTREAM_HPP +#include <fstream> + +#if defined(_WIN32) && defined(__GNUC__) +#include <ext/stdio_filebuf.h> +#endif + +#include "path.hpp" + +namespace Pathie { + +#if defined(_PATHIE_UNIX) + class ifstream: public std::ifstream { + public: + ifstream(); + ifstream(char* path, std::ios_base::openmode = std::ios_base::in); + ifstream(std::string path, std::ios_base::openmode = std::ios_base::in); + ifstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::in); + + void open(const char* filename, ios_base::openmode mode = ios_base::in); + void open(const std::string& filename, ios_base::openmode mode = ios_base::in); + void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::in); + }; + +#elif defined (_WIN32) +# if defined(__GNUC__) + /** + * \brief Input stream for UTF-8-encoded filenames. + * + * This class implements an interface like `std::ifstream` that works + * with Unicode paths regardless of the platform. Please refer to + * the documentation of Pathie::ofstream for more information on + * rationale and usage; this class works the same way as Pathie::ofstream, + * just for input rather than output file streams. + */ + class ifstream: public std::basic_istream<char, std::char_traits<char> > + { + public: + typedef char char_type; ///< Type used inside the stream. 
+ typedef std::char_traits<char> traits_type; ///< Traits type + typedef typename traits_type::int_type int_type; ///< Int type + typedef typename traits_type::pos_type pos_type; ///< pos type + typedef typename traits_type::off_type off_type; ///< offset type + + ifstream(); + explicit ifstream(const char* filename, ios_base::openmode mode = ios_base::in); + explicit ifstream(const std::string& filename, ios_base::openmode mode = ios_base::in); + explicit ifstream(const Pathie::Path& filename, ios_base::openmode mode = ios_base::in); + ~ifstream(); + + __gnu_cxx::stdio_filebuf<char>* rdbuf() const; + bool is_open() const; // C++11 mandates const this, C++98 hadn’t that + void open(const char* filename, ios_base::openmode mode = ios_base::in); + void open(const std::string& filename, ios_base::openmode mode = ios_base::in); + void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::in); + void close(); + + private: + FILE* mp_file; + __gnu_cxx::stdio_filebuf<char>* mp_filebuffer; + bool m_buffer_allocated; + }; + +# elif defined(_MSC_VER) + class ifstream: public std::ifstream { + public: + ifstream(); + ifstream(char* path, std::ios_base::openmode = std::ios_base::in); + ifstream(std::string path, std::ios_base::openmode = std::ios_base::in); + ifstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::in); + }; + +# else +# error Unsupported compiler: do not know how to open C++ stream on Unicode file. +# endif +#else +# error Unsupported system. +#endif + +} +#endif diff --git a/src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp b/src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp new file mode 100644 index 00000000..1ff43e6c --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/pathie_ofstream.hpp @@ -0,0 +1,192 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. 
+ * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef PATHIE_OFSTREAM_HPP +#define PATHIE_OFSTREAM_HPP + +#if defined(_WIN32) && defined(__GNUC__) +#include <ostream> +#include <ext/stdio_filebuf.h> +#else +#include <fstream> +#endif + +#include "path.hpp" + +namespace Pathie { + +#if defined(_PATHIE_UNIX) + class ofstream: public std::ofstream { + public: + ofstream(); + ofstream(char* path, std::ios_base::openmode = std::ios_base::out); + ofstream(std::string path, std::ios_base::openmode = std::ios_base::out); + ofstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::out); + + void open(const char* filename, ios_base::openmode mode = ios_base::out | ios_base::trunc); + void open(const std::string& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc); + void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc); + }; +#elif defined (_WIN32) +# if defined(__GNUC__) + /** + * \brief Output stream for UTF-8-encoded filenames. + * + * Unicode filenames with C++ are horrible, and this is why the Pathie library + * was written in the first place. However, working with paths may be nice, + * but what does this mean for you if you cannot actually open the file + * whose path you have been manipulating? On UNIX, the `std::ofstream` class + * will work just as expected if you pass it a UTF-8 unicode filename and it + * will open exactly the path you specified. Windows however uses UTF-16LE + * as the encoding for pathnames, and the same code that runs on UNIX will + * produce garbage filenames on Windows. Take this as an example: + * + * ~~~~~~~~~~~~~~~~~ c++ + * std::ofstream file("Bärenstark.txt"); + * file << "Some content" << std::endl; + * file.close(); + * ~~~~~~~~~~~~~~~~~ + * + * The file will appear as expected on UNIX, but on Windows it will have + * a garbage filename because Windows interprets filenames based on the + * `char` type as in the local encoding (Windows-1252 on a Western European + * Windows system).
You have to use filenames based on `wchar_t` on Windows + * to get the desired effect. This, however, doesn’t work either: + * + * ~~~~~~~~~~~~~~~~~ c++ + * std::ofstream file(L"Bärenstark.txt"); + * file << "Some content" << std::endl; + * file.close() + * ~~~~~~~~~~~~~~~~~ + * + * That is, it works only on the Microsoft Visual C++ Compiler (MSVC). The reason + * for this is that the ISO C++ standard does not specify a constructor + * that takes filenames based on `wchar_t`, but only on `char`, which Windows + * interprets as described above. That’s a nice proof of how Windows tries + * to be inherently different from all other modern OSes in this world, and + * how it makes simple tasks a pain if you want cross-platform behaviour. + * GCC on Windows, as distributed by the MinGW project, does not support the + * nonstandard constructor. As it stands, you **cannot** create Unicode files + * via the standard C++ interface with MinGW GCC. There is, however, a special + * function in the Windows API called `_wfopen()` that lets you at least open + * a file via a `fopen()`-like C API. Thankfully GCC provides a (also nonstandard) + * measure to create a filebuffer (this is what is used by the C++ streams + * under the hood to access the files) from a C `FILE*`. This class wraps + * that GNU C++ extension (`gnu_cxx::stdio_filebuf`) on Windows, as well as it wraps + * the standard stream API on other platforms. It therefore unites the different + * access methods under a single uniform interface that allows you to + * create Unicode filenames regardless of the platform you run on.
+ * + * Let’s revisit the previous example, now with Pathie’s streams: + * + * ~~~~~~~~~~~~~~~~~ c++ + * Pathie::ofstream file("Bärenstark.txt"); + * file << "Some content" << std::endl; + * file.close(); + * ~~~~~~~~~~~~~~~~~ + * + * The `Pathie::ofstream` constructor takes a UTF-8 string and does the + * necessary conversion to UTF-16, uses `_wfopen()` under the hood to access + * the file, and then wraps a C++ stream around the already opened file + * descriptor. On platforms other than MinGW on Windows, the `Pathie::ofstream` class + * will just delegate to the standard `std::ofstream` class. As a bonus, + * if you compile with MSVC the nonstandard constructor described above + * is used. + * + * Of course, there’s also a constructor that will make it work directly + * with instances of Pathie::Path: + * + * ~~~~~~~~~~~~~~~~~ c++ + * Pathie::Path p("Bärenstark.txt"); + * Pathie::ofstream file(p); + * file << "Some content" << std::endl; + * file.close(); + * ~~~~~~~~~~~~~~~~~ + * + * That is, you can stay with UTF-8 `char`-based strings (like `std::string`) + * for anything you use. Ain’t that great? + * + * \warning On Windows, this class tries to behave as similarly to the standard + * `std::ofstream` as possible. Due to the file descriptor magic it does under + * the hood, however, there is a little difference: If you construct an + * instance of this class without associating it immediately with a filename + * (the constructor without arguments), using any methods apart from `is_open()` + * (which is specifically implemented for that purpose) that use the underlying + * filebuffer will result in segmentation faults, because the filebuffer has + * not yet been constructed (the area where it will be constructed into is + * full of NUL bytes if you wonder). + * + * \note Please refer to your preferred C++ STL documentation for the + * `std::ofstream` class for general usage of C++ file streams.
+ */ + class ofstream: public std::basic_ostream<char, std::char_traits<char> > + { + public: + typedef char char_type; ///< Type used inside the stream. + typedef std::char_traits<char> traits_type; ///< Traits type + typedef typename traits_type::int_type int_type; ///< Int type + typedef typename traits_type::pos_type pos_type; ///< pos type + typedef typename traits_type::off_type off_type; ///< offset type + + ofstream(); + explicit ofstream(const char* filename, ios_base::openmode mode = ios_base::out|ios_base::trunc); + explicit ofstream(const std::string& filename, ios_base::openmode mode = ios_base::out|ios_base::trunc); + explicit ofstream(const Pathie::Path& filename, ios_base::openmode mode = ios_base::out|ios_base::trunc); + ~ofstream(); + + __gnu_cxx::stdio_filebuf<char>* rdbuf() const; + bool is_open() const; // C++11 mandates const this, C++98 hadn’t that + void open(const char* filename, ios_base::openmode mode = ios_base::out | ios_base::trunc); + void open(const std::string& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc); + void open(const Pathie::Path& filename, ios_base::openmode mode = ios_base::out | ios_base::trunc); + void close(); + + private: + FILE* mp_file; + __gnu_cxx::stdio_filebuf<char>* mp_filebuffer; + bool m_buffer_allocated; + }; + +# elif defined(_MSC_VER) + class ofstream: public std::ofstream { + public: + ofstream(); + ofstream(char* path, std::ios_base::openmode = std::ios_base::out); + ofstream(std::string path, std::ios_base::openmode = std::ios_base::out); + ofstream(Pathie::Path path, std::ios_base::openmode = std::ios_base::out); + }; +# else +# error Unsupported compiler: do not know how to open C++ stream on Unicode file. +# endif +#else +# error Unsupported system. 
+#endif + +} +#endif diff --git a/src/3rd_party/pathie-cpp/include/temp.hpp b/src/3rd_party/pathie-cpp/include/temp.hpp new file mode 100644 index 00000000..02a35879 --- /dev/null +++ b/src/3rd_party/pathie-cpp/include/temp.hpp @@ -0,0 +1,83 @@ +#ifndef PATHIE_TEMPDIR_HPP +#define PATHIE_TEMPDIR_HPP +#include "path.hpp" + +namespace Pathie { + + /** + * A class for working with temporary entries; this is the + * superclass of Tempdir and Tempfile that encapsulates the common + * logic between the two. This class cannot be instanciated + * directly, instead use Tempdir and Tempfile. + * + * This class relies on `rand()` when generating the temporary + * path name. Therefore, it is recommended to initialise the + * random number generator before creating instances of this class + * by calling the `srand()` function. + * + * In a multithreaded environment, this class generates conflicting + * directory names if the C random number generator is in the same state + * in two threads and an instance of Tempdir is constructed in these two + * threads in the very same second. You should not use an instance of + * this class in multiple threads. + */ + class TempEntry + { + public: + TempEntry(std::string namepart); + virtual ~TempEntry(); + + virtual void remove() const = 0; + void keep(bool k = true); + + Path path() const; + bool is_kept() const; + protected: + bool m_keep; + Path m_path; + }; + + /** + * Class for working with temporary directories. Creating + * an instance of this class creates a temporary directory, + * which is removed again when the object is destroyed. + * If you want to keep the directory for whatever reason, + * call TempEntry::keep(). + * + * Call TempEntry::path() to retrieve the path of the + * generated directory. + * + * See the docs for the TempEntry class for information + * on how the temporary names are generated. 
+ */ + class Tempdir: public TempEntry + { + public: + Tempdir(std::string namepart); + virtual ~Tempdir(); + virtual void remove() const; + }; + + /** + * Class for working with temporary files. Creating + * an instance of this class creates a temporary file, + * which is removed again when the object is destroyed. + * If you want to keep the file for whatever reason, + * call TempEntry::keep(). + * + * Call TempEntry::path() to retrieve the path of the + * generated directory. + * + * See the docs for the TempEntry class for information + * on how the temporary names are generated. + */ + class Tempfile: public TempEntry + { + public: + Tempfile(std::string namepart); + virtual ~Tempfile(); + virtual void remove() const; + }; +} + +#endif /* PATHIE_TEMPDIR_HPP */ diff --git a/src/3rd_party/pathie-cpp/src/entry_iterator.cpp b/src/3rd_party/pathie-cpp/src/entry_iterator.cpp new file mode 100644 index 00000000..e2ecb2fe --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/entry_iterator.cpp @@ -0,0 +1,279 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "../include/entry_iterator.hpp" +#include "../include/path.hpp" +#include "../include/errors.hpp" + +#if defined(__unix__) +#include <sys/types.h> +#include <dirent.h> +#include <errno.h> +#include <stdexcept> +#elif defined(_WIN32) +#include <Windows.h> +#else +#error Unsupported system +#endif + +using namespace Pathie; + +/** + * The default constructor always constructs the terminal + * iterator, i.e. the one you want to test for if you want + * to know whether an iteration has completed. + */ +entry_iterator::entry_iterator() + : mp_directory(NULL), + mp_cur(NULL), + mp_cur_path(new Path()) +{ +} + +/** + * Construct an iterator that reads the entries in the given directory. + */ +entry_iterator::entry_iterator(const Path* p_directory) + : mp_directory(p_directory), + mp_cur(NULL), + mp_cur_path(new Path()) +{ + open_native_handle(); +} + +/** + * Destructor. Closes the open native handle, if it is open. + */ +entry_iterator::~entry_iterator() +{ + close_native_handle(); + + if (mp_cur_path) + delete mp_cur_path; + + // `mp_directory' is NOT deleted, because this class does not own it! +} + +/** + * Opens the native handle to the directory and reads the first + * entry from the directory. 
+ */ +void entry_iterator::open_native_handle() +{ +#if defined(_PATHIE_UNIX) + std::string nstr = mp_directory->native(); + mp_cur = opendir(nstr.c_str()); + + if (mp_cur) { + struct dirent* p_dirent = readdir(static_cast<DIR*>(mp_cur)); + *mp_cur_path = filename_to_utf8(p_dirent->d_name); + } + else { + throw(Pathie::ErrnoError(errno)); + } +#elif defined(_WIN32) + std::wstring utf16 = utf8_to_utf16(mp_directory->str() + "/*"); + WIN32_FIND_DATAW finddata; + + mp_cur = FindFirstFileW(utf16.c_str(), &finddata); + if (static_cast<HANDLE>(mp_cur) == INVALID_HANDLE_VALUE) { + DWORD err = GetLastError(); + mp_cur = NULL; + throw(Pathie::WindowsError(err)); + } + else { + *mp_cur_path = utf16_to_utf8(finddata.cFileName); + } +#else +#error Unsupported system +#endif +} + +/// Helper function for closing the native handle. +void entry_iterator::close_native_handle() +{ + if (!mp_cur) + return; + +#if defined(_PATHIE_UNIX) + closedir(static_cast<DIR*>(mp_cur)); +#elif defined(_WIN32) + FindClose(static_cast<HANDLE>(mp_cur)); +#endif + + // Reset member variables + *mp_cur_path = Path(); + mp_cur = NULL; +} + +/** + * Increment operator. Calling this advances the iterator by one, + * thus pointing it to the next entry. If the end is reached, + * the iterator will compare equal to the return value of the + * default constructor, and dereferencing it yields an undefined + * result. + * + * \remark Note that this operator does *not* return the old value + * the iterator had, simply because that would mean copying the + * receiver first, and copying instances of this class is not + * possible. Thus, *do not rely* on the return value of this + * method. 
+ */ +entry_iterator& entry_iterator::operator++(int) +{ + if (mp_cur) { +#if defined(_PATHIE_UNIX) + struct dirent* p_dirent = readdir(static_cast<DIR*>(mp_cur)); + if (p_dirent) { + *mp_cur_path = filename_to_utf8(p_dirent->d_name); + } + else { + close_native_handle(); + } +#elif defined(_WIN32) + WIN32_FIND_DATAW finddata; + if (FindNextFileW(static_cast<HANDLE>(mp_cur), &finddata)) { + *mp_cur_path = utf16_to_utf8(finddata.cFileName); + } + else { + close_native_handle(); + } +#else +#error Unsupported system +#endif + } + else { // Finished already + throw(std::range_error("Tried to advance a finished entry_iterator!")); + } + + return *this; +} + +/// Same as the other operator++(). +entry_iterator& entry_iterator::operator++() +{ + return (operator++()); +} + +/** + * Derefence operator. Returns the entry the iterator currently + * points at. + */ +const Path& entry_iterator::operator*() const +{ + return *mp_cur_path; +} + +/** + * Resets this iterator to start again on the path given. + */ +entry_iterator& entry_iterator::operator=(const Path* p_directory) +{ + close_native_handle(); + mp_directory = p_directory; + open_native_handle(); + return *this; +} + +/** + * Boolean operator. In comparisons, this iterator is true if + * it has not yet finished, false otherwise. + */ +entry_iterator::operator bool() const +{ + return !!mp_directory; +} + +/** + * Equality test. Two instances of this class are equal if: + * + * 1. If `other` is a terminal iterator as created by the parameterless + * constructor: if the receiver has finished iterating the directory. + * 2. If `other` is not a terminal iterator as described: if both + * iterators refer to the same top directory and their current + * native handle is the same and in the same state (hint: this + * is not going to happen under normal circumstances). 
+ */ +bool entry_iterator::operator==(const entry_iterator& other) const +{ + if (other.mp_directory == NULL) { + /* `mp_directory' is only null for the terminal iterator, that is, + * a test for the terminal iterator was requested. An entry_iterator + * is terminated when `mp_cur' is null, so that's what is returned + * in reality when a test with the terminal iterator is + * requested. */ + return !mp_cur; + } + else { + return mp_directory == other.mp_directory && mp_cur == other.mp_cur; + } +} + +/// Inverse of operator==(). +bool entry_iterator::operator!=(const entry_iterator& other) const +{ + return !(*this == other); +} + +/** + * Derefence operator. Returns the entry the iterator currently + * points at. + */ +const Path* entry_iterator::operator->() const +{ + return mp_cur_path; +} + +/// "Copy" constructor -- see class docs for more info. +entry_iterator::entry_iterator(const entry_iterator& other) + : mp_directory(other.mp_directory), + mp_cur(other.mp_cur), + mp_cur_path(other.mp_cur_path) +{ + entry_iterator& e = const_cast<entry_iterator&>(other); + e.mp_directory = NULL; + e.mp_cur = NULL; + e.mp_cur_path = new Path(); +} + +/// "Copy" assignment -- see class docs for more info. +entry_iterator& entry_iterator::operator=(const entry_iterator& other) +{ + mp_directory = other.mp_directory; + mp_cur = other.mp_cur; + mp_cur_path = other.mp_cur_path; + + entry_iterator& e = const_cast<entry_iterator&>(other); + e.mp_directory = NULL; + e.mp_cur = NULL; + e.mp_cur_path = new Path(); + + return *this; +} + diff --git a/src/3rd_party/pathie-cpp/src/errors.cpp b/src/3rd_party/pathie-cpp/src/errors.cpp new file mode 100644 index 00000000..f5e406b1 --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/errors.cpp @@ -0,0 +1,150 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. 
+ * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "../include/errors.hpp" + +#include <cerrno> +#include <cstring> +#include <sstream> + +#if defined(_WIN32) +#include <windows.h> +#elif defined(_PATHIE_UNIX) +#include <glob.h> +#endif + +using namespace Pathie; + +PathieError::PathieError() +{ + m_pathie_errmsg = "Unknown pathie exception."; +} + +PathieError::PathieError(std::string message) +{ + m_pathie_errmsg = message; +} + +PathieError::~PathieError() throw() +{ + // +} + +const char* PathieError::what() const throw() +{ + return m_pathie_errmsg.c_str(); +} + +ErrnoError::ErrnoError(int val) +{ + std::stringstream ss; + ss << val; + + m_val = val; + m_pathie_errmsg = "Errno " + ss.str() + ": " + strerror(val); +} + +ErrnoError::~ErrnoError() throw() +{ + // +} + +#ifdef _WIN32 +WindowsError::WindowsError(DWORD val) +{ + std::stringstream ss; + ss << val; + + wchar_t* buf = NULL; + FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + val, + LANG_USER_DEFAULT, + (wchar_t*) &buf, // What a weird API. 
+ 0, + NULL); + + m_val = val; + m_pathie_errmsg = std::string("Windows Error Code ") + ss.str() + ": " + utf16_to_utf8(buf); + + LocalFree(buf); +} + +WindowsError::~WindowsError() throw() +{ + // +} + +WindowsHresultError::WindowsHresultError(HRESULT val) +{ + std::stringstream ss; + ss << val; + + m_val = val; + m_pathie_errmsg = std::string("Windows HRESULT Error Code :") + ss.str(); +} + +WindowsHresultError::~WindowsHresultError() throw() +{ + // +} + +#endif + +#ifdef _PATHIE_UNIX +GlobError::GlobError(int val) +{ + std::stringstream ss; + ss << val; + + m_val = val; + + m_pathie_errmsg = "Glob error code " + ss.str() + ": "; + + switch(val) { + case GLOB_NOSPACE: + m_pathie_errmsg += "GLOB_NOSPACE"; + break; + case GLOB_ABORTED: + m_pathie_errmsg += "GLOB_ABORTED"; + break; + case GLOB_NOMATCH: + m_pathie_errmsg += "GLOB_NOMATCH"; + break; + default: + m_pathie_errmsg += "Unknown glob error"; + break; + } +} + +GlobError::~GlobError() throw() +{ + // +} +#endif diff --git a/src/3rd_party/pathie-cpp/src/path.cpp b/src/3rd_party/pathie-cpp/src/path.cpp new file mode 100644 index 00000000..99185085 --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/path.cpp @@ -0,0 +1,3348 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "../include/path.hpp" +#include "../include/pathie.hpp" +#include "../include/errors.hpp" + +#include <cstdlib> +#include <cstdio> +#include <ctime> +#include <cstring> +#include <iostream> +#include <sstream> +#include <string> +#include <sys/types.h> +#include <sys/stat.h> +#include <stdexcept> +#include <errno.h> + +#if defined(_WIN32) +#include <windows.h> +#include <winioctl.h> +#include <direct.h> +#include <shlobj.h> +#include <shlwapi.h> +//#include <ntifs.h> // Currently not in msys2 + +#elif defined(_PATHIE_UNIX) +#include <unistd.h> +#include <limits.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/param.h> // defines "BSD" macro on BSD systems +#include <pwd.h> +#include <glob.h> +#include <fnmatch.h> + +#else +#error Unsupported system. +#endif + +#ifdef BSD +#include <sys/time.h> +#include <sys/sysctl.h> +#endif + +using namespace Pathie; +using namespace std; + +Path::localpathtype Path::c_localdefault = LOCALPATH_LOCAL; + +/** + * The default constructor. It does **not** create an empty + * path, but a path whose value is ".", i.e. the current + * working directory as a relative path (see also pwd()). 
+ */ +Path::Path() +{ + m_path = "."; +} + +/** + * Copies contents from path to a new instance. + * + * \param[in] path The Path instance to copy. + */ +Path::Path(const Path& path) +{ + m_path = path.m_path; +} + +/** + * This constructs a path from a given std::string. + * + * \param path String to construct from. Must be encoded in UTF-8. + * + * \returns a new instance of class Path. + */ +Path::Path(std::string path) +{ + m_path = path; + sanitize(); +} + +/** + * Constructs a Path instance from a list of path components. + * This is the inverse of the burst() method. + * + * \param[in] components List of components to join. + * + * \returns A new instance. + */ +Path::Path(const std::vector<Path>& components) +{ + m_path = components.front().m_path; + + if (components.size() > 1) { + // Ensure that for both absolute and relative path we end in + // a slash for appending below + if (m_path[0] != '/') { + m_path += "/"; + } + + std::vector<Path>::const_iterator iter; + for(iter=components.begin()+1; iter != components.end(); iter++) { // first element has already been taken care of above + m_path += (*iter).m_path + "/"; + } + + // Trailing slash is unwanted, remove it + m_path = m_path.substr(0, m_path.length()-1); + } +} + +/** + * Sanitizes the path. It: + * + * 1. Replaces any backslashes with forward slashes (read Windows). + * 2. Replaces all double forward slashes with single forward slashes + * 3. Delates a trailing slash, if any. + */ +void Path::sanitize() +{ + // Replace any backslashes \ with forward slashes /. 
+ size_t cur = string::npos; + while ((cur = m_path.find("\\")) != string::npos) { // assignment intended + m_path.replace(cur, 1, "/"); + } + + // Replace all double slashes // with a single one + cur = string::npos; + while ((cur = m_path.find("//")) != string::npos) { // assignment intended + m_path.replace(cur, 2, "/"); + } + + // Remove trailing slash if any (except for the filesystem root) + long len = m_path.length(); +#if defined(_PATHIE_UNIX) + if (len > 1 && m_path[len - 1] == '/') + m_path = m_path.substr(0, len - 1); +#elif defined(_WIN32) + if (len > 1) { // / is root of current drive, "x" is the relative path "./x" + // Check if X:/foo/bar + if (len > 3 && m_path[len - 1] == '/') { // More than 3 chars cannot be root + m_path = m_path.substr(0, len - 1); + } + else { // Only drive root? + if (m_path[1] == ':') { + // Here m_path must be a drive root. The colon ":" is not allowed in paths on Windows except as the 2nd char to denote the drive letter + if (len == 2) { // Whoa -- "X:" misses leading / for drive root, append it + m_path.append("/"); + } + else if (len == 3 && m_path[2] != '/') { // Whoa -- "X:f" misses leading / for root directory, insert it + m_path.insert(2, "/"); + } + // else length is 3 with a slash, i.e. "X:/". This is fine and shall not be touched. + } + else { // not a drive root, delete trailing / if any + if (m_path[len - 1] == '/') { + m_path = m_path.substr(0, len - 1); + } + } + } + } +#else +#error Unsupported system +#endif +} + +/** \name Conversion methods + * + * Convert a path to other objects. + */ +///@{ + +/** + * Returns a copy of the underlying `std::string`. This is always + * encoded in UTF-8, regardless of the operating system. + * + * \see native() utf8_str() + */ +std::string Path::str() const +{ + return m_path; +} + +/** + * This method does the same as str(). 
It exists to make code using + * the UTF-8 variant more readable, because one tends to forget + * whether str() returns the native or the UTF-8 variant. + * + * \see native() str() + */ +std::string Path::utf8_str() const +{ + return m_path; +} + +#if defined(_PATHIE_UNIX) +std::string Path::native() const +{ + return utf8_to_filename(m_path); +} + +#elif defined(_WIN32) +/** + * Returns the path in the platform’s native format. Note + * that this method returns a `std::string` on UNIX, + * whereas it returns a `std::wstring` on Windows. + * + * On Windows, the returned string also uses exclusively backslashes + * instead of forward slashes. It is encoded in UTF-16LE. + * + * On UNIX, the returned string is in the encoding dictated by the locale + * ($LANG and $LC_ALL variables). + */ +std::wstring Path::native() const +{ + std::string dup(m_path); + + size_t pos = 0; + while((pos = dup.find("/", pos)) != std::string::npos) { // Single = intended + dup.replace(pos, 1, "\\"); + } + + return utf8_to_utf16(dup); +} +#else +#error Unsupported system. +#endif + +///@} + + +/** \name Path decomposition + * + * Retrieve the parts of the path you want. + */ +///@{ + +/** + * Returns the path’s basename, i.e. the last component + * of the path, including the file excention. + * + * For example, "/foo/bar.txt" has a basename of "bar.txt", + * and "/foo/bar" has a basename of "bar". + * + * \returns a new Path instance with only the basename. + * + * \see dirname() + */ +Path Path::basename() const +{ + if (m_path == ".") + return Path("."); + else if (m_path == "..") + return Path(".."); + else if (is_root()) + return Path(m_path); + + size_t pos = 0; + if ((pos = m_path.rfind("/")) != string::npos) // Single = intended + return Path(m_path.substr(pos + 1)); + else + return Path(m_path); +} + +/** + * Returns the path’s dirname, i.e. all components of the + * path except for the basename component (see basename()). 
+ * + * For example, "/foo/bar/baz.txt" has a dirname of "/foo/bar", + * and "/foo/bar/baz" has a dirname of "/foo/bar". + * + * \returns a new Path instance with only the dirname. + * + * \see basename() parent() + */ +Path Path::dirname() const +{ + if (m_path == ".") + return Path("."); + else if (m_path == "..") + return Path("."); + else if (is_root()) + return Path(m_path); + + size_t pos = 0; + if ((pos = m_path.rfind("/")) != string::npos) { // Single = intended + if (pos == 0) { // /usr + return root(); + } +#ifdef _WIN32 + else if (pos == 1 && m_path[1] == ':') { // X:/foo + return root(); + } +#endif + else { // regular/path or /regular/path + return Path(m_path.substr(0, pos)); + } + } + else // single relative directory + return Path("."); +} + +/** + * This is a convenience method that allows you to retrieve + * both the dirname() and the basename() in one call. + * + * \param[out] dname Receives the dirname() value. + * \param[out] bname Receives the basename() value. + */ +void Path::split(Path& dname, Path& bname) const +{ + dname = dirname(); + bname = basename(); +} + +/** + * This method returns the file extension of the path, + * if possible; otherwise it returns an empty string. + * Filenames that consist entirely of a "file extension", + * i.e. ".txt" or "/foo/.txt" will return an empty string. + */ +std::string Path::extension() const +{ + if (m_path == ".") + return ""; + else if (m_path == "..") + return ""; + + size_t pos = 0; + if ((pos = m_path.rfind(".")) != string::npos) { // assignment intended + if (pos == 0 || pos == m_path.length() - 1) // .foo and foo. + return ""; + else { + if (m_path[pos - 1] == '/') // foo/.txt + return ""; + else + return m_path.substr(pos); + } + } + else + return ""; +} + +/** + * This is the same as dirname() and is provided only for convenience. 
+ * + * \see dirname() + */ +Path Path::parent() const +{ + return dirname(); +} + +/** + * Returns the number of components in the path string, or + * in different words, counts the slashes and adds one for + * the last element, except if the path is just the root + * (see is_root()). + * + * The return value of this method minus one is the last + * possible index for operator[]. + */ +size_t Path::component_count() const +{ + if (is_root()) + return 1; + + size_t result = 0; + size_t pos = 0; + while ((pos = m_path.find("/", pos)) != string::npos) { // Assignment intended + result++; + pos++; + } + + return ++result; +} + +/** + * Returns the filesystem root for this path. On UNIX, + * this will always return /, but on Windows it will + * return X:/ if the referenced path is an absolute path + * with drive letter, and / if the referenced path is + * a relative path or an absolute path on the current + * drive. + */ +Path Path::root() const +{ +#if defined(_PATHIE_UNIX) + return Path("/"); +#elif defined(_WIN32) + // Check if we have an absolute path with drive, + // otherwise return the root for the current drive. + if (m_path[1] == ':') // Colon is on Windows only allowed here to denote a preceeding drive letter => absolute path + return Path(m_path.substr(0, 3)); + else + return Path("/"); +#else +#error Unsupported system. +#endif +} + +/** + * This method splits up the paths into its separate components, + * i.e. it splits it up at every /, except for the leading / of + * an absolute path, which is considered a component on its own + * and is thus the first element of a bursted absolute path. + * + * \param descend (`false`) If this is true, keeps the parent paths when bursting. + * + * \returns A vector of Path instances, where each instance + * corresponds to one component of the Path. 
+ * + * Example: + * + * ~~~~~~~~~~~~~~~~~~~~ c++ + * Path p("/tmp/foo/bar"); + * p.burst(); // => /, tmp, foo, bar + * p.burst(true); // => /, /tmp, /tmp/foo, /tmp/foo/bar + * ~~~~~~~~~~~~~~~~~~~~ + */ +std::vector<Path> Path::burst(bool descend /* = false */) const +{ + size_t pos = 0; + size_t lastpos = 0; + std::vector<Path> results; + std::string prefix; + + // Take care of leading / of absolute paths + if (m_path[0] == '/') { + results.push_back(Path("/")); + prefix.append("/"); + + // Adjust pos so we don’t find the initial / + pos++; + lastpos++; + } + + while((pos = m_path.find("/", pos)) != string::npos) { + std::string component = m_path.substr(lastpos, pos - lastpos); + + if (descend) { + results.push_back(Path(prefix + component)); + prefix.append(component); + prefix.append("/"); + } + else { + results.push_back(Path(component)); + } + + lastpos = pos + 1; + pos++; + } + + std::string lastcomponent = m_path.substr(lastpos); + + if (descend) + results.push_back(Path(prefix + lastcomponent)); // Note no trailing / + else + results.push_back(Path(lastcomponent)); + + return results; +} + +///@} + +/** \name Path expansion + * + * Expand paths to a more fuller version without shortcuts. + */ + +///@{ + +/** + * This method, removes all occurences of . and .. from the path, + * leaving a clean filesystem path. + * + * Note that neither an absolute path is created, nor + * are shortcuts other than . and .. expanded. + * + * This method does not access file filesystem, and thus does not + * know about symbolic links. Therefore, if the path contains symlinks, + * the result may not be the way you expect it. Use real() if + * you need to resolve all your symbolic links in the path. + * + * For example, if you have a directory `/tmp/foo`, which contains a + * symbolic link `bar` that points to `/tmp/bar`, then a path of + * `/tmp/foo/bar/..` will be prune()d to `/tmp/foo`, although the + * canonically correct result is `/tmp`. 
The latter is what you will
 * get if you use real().
 *
 * The string is scanned for "/." repeatedly; after every removal the
 * scan restarts at the beginning, until no removable "/." or "/.."
 * is left.
 *
 * \returns A new string with . and .. removed.
 *
 * \see expand() real()
 */
Path Path::prune() const
{
  std::string newpath(m_path); // copy
  size_t pos = 0;
  while((pos = newpath.find("/.", pos)) != string::npos) { // assignment intended
    if (newpath.substr(pos, 3) == "/..") {

      // Weird path like /..foo or foo/..bar, which are NOT relative paths
      if (newpath.length() > pos + 3 && newpath[pos + 3] != '/') {
        // Do not reset `pos' -- this has to stay. Advance to the next char.
        pos++;
        continue;
      }

      if (pos == 0) {
        // /.. at beginning of string, replace with root / (/ on Windows is root on current drive)
        newpath.erase(pos, 3);

        // Whoops -- the entire string was just "/.."
        if (newpath.empty()) {
          newpath.append("/");
        }
      }
#ifdef _WIN32
      // Cater for paths with drive X:/ on Windows
      // NOTE(review): for an input such as "X:/../foo" neither erase() below
      // appears to run, while `pos' is reset to 0 at the end of the loop body
      // -- please confirm this cannot loop forever.
      else if (pos == 2 && newpath[1] == ':') { // ":" is on Windows only allowed at pos 1, where it signifies the preceding char is a drive letter
        // X:/. or X:/.. at beginning of string
        if(newpath.length() > 4 && newpath[4] == '.') { // X:/..
          // Prevent special case "X:/..foo", which is directory "..foo" under the root
          if (newpath.length() <= 5 || newpath[5] != '/') {
            // X:/.. or X:/../foo/bar at beginning of string, replace with drive root
            newpath.erase(pos, 3);
          }
        }
        else { // X:/./foo/bar X:/..foo
          // Prevent special case "X:/.foo", which is directory ".foo" under the root
          if (newpath.length() <= 4 || newpath[4] != '/') {
            // X:/. or X:/./foo/bar at beginning of string, replace with drive root
            newpath.erase(pos, 2);
          }
        }

        if (newpath.length() == 2) {
          // Whoops -- the entire string was just "X:/.." or "X:/."
          newpath.append("/");
        }
      }
#endif
      else {
        // Look backwards for the slash that starts the parent component
        size_t pos2 = 0;
        if ((pos2 = newpath.rfind("/", pos - 1)) != string::npos) { // assignment intended
          // Remove parent directory.
          // NOTE(review): "/foo/.." is erased completely here and then
          // becomes "." at the end of this method rather than "/" -- verify
          // that this is the intended result for absolute paths.
          newpath.erase(pos2, pos - pos2 + 3);
        }
        else { // ../ for relative path (as in foo/../baz.txt)
          newpath.erase(0, pos + 4);
        }
      }
    }
    else { // Single /.

      // Weird path like /.foo or foo/.bar (a hidden file), which are NOT relative paths
      if (newpath.length() > pos + 2 && newpath[pos + 2] != '/') {
        // Do not reset `pos' -- this has to stay. Advance to the next char.
        pos++;
        continue;
      }

      newpath.erase(pos, 2);

      // Whoops -- the entire string was just "/."
      if (newpath.empty()) {
        newpath.append("/");
      }
    }

    // Reset as we have modified the string and might need to go again over it
    pos = 0;
  }

  /* If we are empty now, the original string was a one-element
   * relative path with .. appended. We cannot know what to set
   * without referring to pwd(), which is external access and
   * forbidden for this method. So instead, we do the one sane thing
   * and just use ".". */
  if (newpath.empty())
    newpath = ".";

  return Path(newpath);
}

/**
 * \note Under specific circumstances (see below), this method
 * accesses the file system.
 *
 * This method creates an absolute path by use of prune(), but
 * additionally expands any expandable strings. If one of the
 * following substitution sequences are encountered, it will be
 * replaced accordingly.
 *
 * "~" is expanded to the user’s home directory, see home().
 *
 * \returns a new instance with everything expanded.
 *
 * \remark This method uses prune() to expand ".." entries, therefore
 * it will not consider symbolic links when resolving those. Use
 * real() if you need to do that.
+ * + * \see prune() real() + */ +Path Path::expand() const +{ + Path path(*this); // copy + + if (m_path[0] != '~') + path = path.absolute(); + + std::string str = path.str(); + if (str[0] == '~') { + Path homepath = home(); + + if (str[1] == '/' || str.length() == 1) { + // User home requested + str.replace(0, 1, homepath.m_path); + } + + path = Path(str); + } + + return path.prune(); +} + +/** + * \note This method acceses the filesystem. + * + * This is the bruteforce method for determing the real path + * of the entry in question on the filesystem. It looks on + * each single component of the path, checks if it is a + * symbolic link, and if so, resolves it. + * + * This method supports symbolic link resolving only on UNIX. + * + * It still does not consider hardlinks, mountpoints, and junctions, + * though. However, a hardlink is a real second valid name for an + * object; in contrast to a symbolic link, if one hardlink gets + * removed, the other one stays still valid. If you remove the file a + * symbolic link points to, the link breaks. Thus, it is not even + * possible to determine which of two hardlinks to a file is the + * "primary" one. Mountpoints and junctions (junctions are on Windows + * what mountpoints are on UNIX) behave similar with respect to + * entire directory hierarchies. + * + * \see expand() prune() + */ +Path Path::real() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + char path[PATH_MAX]; + if (!realpath(nstr.c_str(), path)) + throw(Pathie::ErrnoError(errno)); + + return Path(filename_to_utf8(path)); +#elif defined(_WIN32) + // On Windows there sadly is no easy way to do this. We can + // only determine if a given path is a symlink and resolve it... 
+ // Instructions taken from: http://msdn.microsoft.com/en-us/library/windows/desktop/aa363940%28v=vs.85%29.aspx + std::vector<Path> components = burst(); + unsigned int pos = 0; + + while (pos < components.size()) { + // Build path consisting of all elements upto our position pointer + Path reduced_path(components.front()); + if (components.size() - pos > 1) { + for (unsigned int i=1; i <= pos; i++) { // i=0 is already in the initialization above + reduced_path = reduced_path.join(components[i]); + } + } + + // If that’s a symlink, resolve it and replace our path until + // the symlink with the symlink’s target. + /*std::wstring reduced_path_utf16 = utf8_to_utf16(reduced_path.m_path); + if (is_ntfs_symlink(reduced_path_utf16.c_str())) { + wchar_t* target_utf16 = read_ntfs_symlink(reduced_path_utf16.c_str()); + Path target(utf16_to_utf8(target_utf16)); + std::vector<Path> target_components = target.burst(); + free(target_utf16); + + // Replace all components up to pos with the symlink target + components.erase(components.begin(), components.begin() + pos); + std::vector<Path> temp(components); + components.clear(); + for(auto iter=target_components.begin(); iter != target_components.end(); iter++) + components.push_back(*iter); + for(auto iter=temp.begin(); iter != temp.end(); iter++) + components.push_back(*iter); + } + else {*/ + // Note a symlink can point to another symlink, so we can only + // advance to the next element if this element has been tested + // for not being a symlink. + pos++; + //} + } + + // BUild a new path from the now resolved components + Path result(components.front()); + if (components.size() > 1) { + for(std::vector<Path>::const_iterator iter=components.begin(); + iter != components.end(); iter++) { + result = result.join(*iter); + } + } + + return result; +#else +#error Unsupported system. +#endif +} + +// Msys2 does currently not have ntifs.h windows header, which +// is required for reading NTFS symlinks. 
+#if 0 +//#ifdef __WIN32 +/* + * Checking if a file is a symlink under Windows is insane. + * See http://msdn.microsoft.com/en-us/library/windows/desktop/aa363940%28v=vs.85%29.aspx + * for the detailed instructions by Microsoft on how to do + * that. + */ +bool Path::is_ntfs_symlink(const wchar_t* path) const +{ + // First we need to obtain the file attributes. + DWORD attrs = GetFileAttributesW(path); + if (attrs == INVALID_FILE_ATTRIBUTES) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + + /* These file attributes must contain the REPARSE_POINT attribute + * that marks the file as being a symlink, junction, or similar. + * Actually, reparse points can contain much more custom data, but + * we are not interested in that. */ + if (attrs & FILE_ATTRIBUTE_REPARSE_POINT) { + // Now we have to retrieve a special attributes handle from the file. + WIN32_FIND_DATAW finddata; + HANDLE findhandle = FindFirstFileW(path, &finddata); + if (findhandle == INVALID_HANDLE_VALUE) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + FindClose(findhandle); + + // These extended attributes contain the SYMLINK tag if this file + // is a symlink. + if (finddata.dwReserved0 & IO_REPARSE_TAG_SYMLINK) + return true; + + // Junction or so, we do not resolve that + return false; + } + + // Regular file + return false; +} + +/* + * Reading the link target also is insanely hard. + * The process is documented at http://msdn.microsoft.com/en-us/library/windows/desktop/aa365503%28v=vs.85%29.aspx + * in general. The key function is DeviceIoControl(), documented + * at http://msdn.microsoft.com/en-us/library/windows/desktop/aa363216%28v=vs.85%29.aspx + * . + * + * This function does not check if `path` is a symlink, but assumes it. + * It will exhibit unpredictable behaviour if this assumption is wrong. + * + * The returned pointer must be freed by you. 
+ */ +wchar_t* Path::read_ntfs_symlink(const wchar_t* path) const +{ + // We have to open the file (directories are files on Windows also) first. + HANDLE filehandle = CreateFileW(path, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_FLAG_OPEN_REPARSE_POINT, NULL); + if (filehandle == INVALID_HANDLE_VALUE) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + + // This infamous structure is documented here: http://msdn.microsoft.com/en-us/library/ff552012.aspx + unsigned long reparsebufsize = REPARSE_GUID_DATA_BUFFER_HEADER_SIZE; // According to docs this is the minimum size + REPARSE_DATA_BUFFER* p_reparse_data = NULL; + while (true) { + reparsebufsize += 4096; // Do you have a better guess? + p_reparse_data = (REPARSE_DATA_BUFFER*) realloc(p_reparse_data, reparsebufsize); + memset(p_reparse_data, '\0', reparsebufsize); + + DWORD bytecount = 0; + // Obtain the reparse tag. FSCTL_GET_REPARSE_POINT is documented here: http://msdn.microsoft.com/en-us/library/windows/desktop/aa364571(v=vs.85).aspx + if (DeviceIoControl(filehandle, FSCTL_GET_REPARSE_POINT, NULL, 0, p_reparse_data, reparsebufsize, &bytecount, NULL) == 0) { + DWORD errsav = GetLastError(); + if (errsav == ERROR_INSUFFICIENT_BUFFER) { // buffer was to small, try again + continue; + } + else { + throw(Pathie::WindowsError(errsav)); + } + } + else { // success + break; + } + } + + // See also http://msdn.microsoft.com/en-us/library/windows/desktop/aa365511(v=vs.85).aspx + // And this one: http://www.codeproject.com/Articles/21202/Reparse-Points-in-Vista + if (p_reparse_data->ReparseTag == IO_REPARSE_TAG_SYMLINK) { + wchar_t* subsname = (wchar_t*) malloc(p_reparse_data->SymbolicLinkReparseBuffer.SubstituteNameLength + 2); // UTF-16 NUL + wchar_t* printname = (wchar_t*) malloc(p_reparse_data->SymbolicLinkReparseBuffer.PrintNameLength + 2); // UTF-16 NUL + + memset(subsname, '\0', p_reparse_data->SymbolicLinkReparseBuffer.SubstituteNameLength + 2); + memset(printname, '\0', 
p_reparse_data->SymbolicLinkReparseBuffer.PrintNameLength + 2); + + wcsncpy(subsname, &p_reparse_data->SymbolicLinkReparseBuffer.PathBuffer[p_reparse_data->SymbolicLinkReparseBuffer.SubstituteNameOffset], p_reparse_data->SymbolicLinkReparseBuffer.SubstituteNameLength / sizeof(WCHAR)); + wcsncpy(printname, &p_reparse_data->SymbolicLinkReparseBuffer.PathBuffer[p_reparse_data->SymbolicLinkReparseBuffer.PrintNameOffset], p_reparse_data->SymbolicLinkReparseBuffer.PrintNameLength / sizeof(WCHAR)); + + // Actually, it appears the subsname has no real usecase... + free(subsname); + free(p_reparse_data); + CloseHandle(filehandle); + return printname; + } + else { + return NULL; + } +} +#endif + +///@} + +/** \name Special files and directories + * + * Files and directories with a special meaning that did not + * fit in the other groups. + */ +///@{ + +/** + * Determines the current process working directory and returns + * it as an absolute path. Contains a leading drive letter on + * Windows. + */ +Path Path::pwd() +{ +#if defined(_PATHIE_UNIX) + char cwd[PATH_MAX]; + if (getcwd(cwd, PATH_MAX) != NULL) + return Path(filename_to_utf8(cwd)); + else + throw(std::runtime_error("Failed to retrieve current working directory.")); +#elif defined(_WIN32) + wchar_t cwd[MAX_PATH]; + if (GetCurrentDirectoryW(MAX_PATH, cwd) == 0) + throw(std::runtime_error("Failed to retrieve current working directory.")); + else + return Path(utf16_to_utf8(std::wstring(cwd))); +#else +#error Unsupported platform. +#endif +} + +/** + * \note On Linux, this method accesses the `/proc` filesystem. + * + * This method returns the full absolute path to the currently running + * executable. + */ +Path Path::exe() +{ +#if defined(__linux__) + char buf[PATH_MAX]; + ssize_t size = ::readlink("/proc/self/exe", buf, PATH_MAX); + + if (size < 0) + throw(Pathie::ErrnoError(errno)); + + return Path(filename_to_utf8(std::string(buf, size))); +#elif defined(BSD) + // BSD does not have /proc mounted by default. 
However, using raw syscalls, + // we can figure out what would have been in /proc/curproc/file. See + // sysctl(3) for the management info base identifiers that are used here. + int mib[4]; + char buf[PATH_MAX]; + size_t bufsize = PATH_MAX; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = -1; // According to sysctl(3), -1 means the current process. + + if (sysctl(mib, 4, buf, &bufsize, NULL, 0) != 0) // Note this changes `bufsize' to the number of chars copied + throw(Pathie::ErrnoError(errno)); + + return Path(filename_to_utf8(std::string(buf, bufsize - 1))); // Exclude terminating NUL +#elif defined(_WIN32) + wchar_t buf[MAX_PATH]; + if (GetModuleFileNameW(NULL, buf, MAX_PATH) == 0) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + + std::string str = utf16_to_utf8(buf); + return Path(str); +#else +#error Unsupported platform. +#endif +} + +/** + * This method returns the current user’s home directory. On UNIX + * systems, the $HOME environment variable is consulted, whereas + * on Windows the Windows API is queried for the directory. + * + * It will throw std::runtime_error if $HOME is not defined on + * UNIX. + */ +Path Path::home() +{ +#if defined(_PATHIE_UNIX) + char* homedir = getenv("HOME"); + if (homedir) + return Path(filename_to_utf8(homedir)); + else + throw(std::runtime_error("$HOME not defined.")); +#elif defined(_WIN32) + /* TODO: Switch to KNOWNFOLDERID system as explained + * on http://msdn.microsoft.com/en-us/library/windows/desktop/bb762494%28v=vs.85%29.aspx + * and http://msdn.microsoft.com/en-us/library/windows/desktop/bb762181%28v=vs.85%29.aspx + *. Howevever, MinGW does currently (September 2014) not have + * the new KNOWNFOLDERID declarations. 
+ */ + + wchar_t homedir[MAX_PATH]; + if (SHGetFolderPathW(NULL, CSIDL_PROFILE, NULL, SHGFP_TYPE_CURRENT, homedir) != S_OK) + throw(std::runtime_error("Home directory not defined.")); + + return Path(utf16_to_utf8(homedir)); +#else +#error Unsupported system. +#endif +} + +///@} + +/** \name Handling of absolute and relative paths + * + * Converting relative paths to absolute ones and vice-versa. + */ +///@{ + +/** + * Builds an absolute path from the referenced path by + * prefixing it with a `base` path, which defaults to + * the current working directory. If the referenced path + * is absolute already, nothing is done and a copy of the + * referenced path is returned. + * + * \param[in] base Base path. Default is the return value of Path::pwd(). + * + * \returns A new instance that is absolute. + * + * \see relative() + */ +Path Path::absolute(const Path& base /* = Path::pwd() */) const +{ + if (is_absolute()) + return Path(m_path); + else + return base.join(m_path); +} + +/** + * The referenced path has to to be absolute; by doing pure string + * manipulation (read: no symlinks), it will then be determined how to + * go from the (also absolute) `base` path to the referenced path. The + * result is a relative path, which will be returned by this method. + * + * On Windows, this method will throw an std::invalid_argument if the `base` + * is on a different drive than the referenced path. If either the referenced + * or the passed path is relative, std::invalid_argument will also be thrown. + * + * \param base Base path from which to start. Must also be absolute. + * + * \returns A new instance as a relative path. + * + * Example: + * + * ~~~~~~~~~~~~~~~~~~~~ c++ + * Path p1("/tmp/foo/bar/baz"); + * Path p2("/tmp/xxx/yyy"); + * + * p1.relative(p2); // => ../../foo/bar/baz + * p2.relative(p1); // => ../../../xxx/yyy + * ~~~~~~~~~~~~~~~~~~~~ + * + * \remark Both the referenced path and the `base` argument + * are prune()d before they are worked with. 
+ * + * \see absolute() + */ +Path Path::relative(Path base) const +{ + if (is_relative()) + throw(std::invalid_argument("Referenced path must be absolute.")); + if (base.is_relative()) + throw(std::invalid_argument("Argument path must be absolute.")); + + // Wipe all ".." and ".", this would break the below algorithm + base = base.prune(); + Path refpath = prune(); + + // Shortcut for equal paths + if (base.m_path == refpath.m_path) + return Path("."); + + // Shortcut if base is the root + if (base.is_root()) { +#if defined(_PATHIE_UNIX) + return Path(refpath.m_path.substr(1)); // Skip leading / +#elif defined(_WIN32) + return Path(refpath.m_path.substr(root().m_path.length())); // Skip leading / or X:/ +#else +#error Unsupported system. +#endif + } + + size_t pos = 0; + size_t baselength = base.m_path.length(); + size_t reflength = refpath.m_path.length(); + while (true) { + if (pos >= baselength) + break; + else if (pos >= reflength) + break; + else if (base.m_path[pos] != refpath.m_path[pos]) + break; + else + pos++; + } + // pos now points to the last character in which both strings were equal + + // For each component in base that is not part of refpath, add a "..". + std::string resultstr; + Path basepart(base.m_path.substr(pos)); + for(size_t i=0; i < basepart.component_count(); i++) + resultstr.append("../"); + + // Now append the part of refpath that is not part of base to the result. + resultstr.append(refpath.m_path.substr(pos)); + + // Done. + return Path(resultstr); +} + +/** + * Checks if this is an absolute path, i.e. one that + * starts with a / on all systems or with X:/ + * only on Windows, where `X` is a drive letter. + * + * Note that / on Windows is the root of the current drive + * and hence also an absolute path. 
+ */ +bool Path::is_absolute() const +{ +#if defined(_PATHIE_UNIX) + return m_path[0] == '/'; +#elif defined(_WIN32) + // / is root on current drive + if (m_path[0] == '/') + return true; + + return m_path[1] == ':'; // This is the only position where : is allowed on windows, and if it is there, the path is absolute with a drive letter (X:/) +#else +#error Unsupported system. +#endif +} + +/** + * The inverse of is_absolute(). + */ +bool Path::is_relative() const +{ + return !is_absolute(); +} + +/** + * Checks if this path is a filesystem root. On UNIX, this + * is the case if the path consists solely of one slash, on + * Windows this is the case if the path looks like this: + * "<letter>:/". + */ +bool Path::is_root() const +{ +#if defined(_PATHIE_UNIX) + return m_path.length() == 1 && m_path[0] == '/'; +#elif defined(_WIN32) + // / on Windows is root on current drive + if (m_path.length() == 1 && m_path[0] == '/') + return true; + + // X:/ is root including drive letter + return m_path.length() == 3 && m_path[1] == ':'; +#else +#error Unsupported platform. +#endif +} + +///@} + +/** \name In-place substitution + * + * These methods change the underlying path string. + */ +///@{ + +void Path::assign(std::string str) +{ + m_path = str; +} + +void Path::swap(Path& path) throw() +{ + m_path.swap(path.m_path); +} + +///@} + +/** \name File attributes + * + * Functions that work on file attributes like timestamps. + */ +///@{ + +#if defined(_PATHIE_UNIX) +struct stat* Path::stat() const +{ + struct stat* s = (struct stat*) malloc(sizeof(struct stat)); + std::string nstr = native(); + + if (::stat(nstr.c_str(), s) < 0) + throw(Pathie::ErrnoError(errno)); + + return s; +} +#elif defined(_WIN32) +/** + * \note This method accesses the file system. + * + * Returns a pointer to a C `stat` struct that describes the + * given file. You have to free() the pointer manually yourself. + * + * \returns A `struct stat` pointer on UNIX, and a `struct _stat` + * pointer on Windows. 
+ */ +struct _stat* Path::stat() const +{ + struct _stat* s = (struct _stat*) malloc(sizeof(struct _stat)); + std::wstring utf16 = utf8_to_utf16(m_path); + + if (_wstat(utf16.c_str(), s) < 0) + throw(Pathie::ErrnoError(errno)); + + return s; +} +#else +#error Unsupported system. +#endif + +/** + * \note This method accesses the file system. + * + * Returns the file size. + */ +long Path::size() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (::stat(nstr.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + struct _stat s; + std::wstring utf16 = utf8_to_utf16(m_path); + + if (_wstat(utf16.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif + + return s.st_size; +} + +/** + * \note This method accesses the file system. + * + * Returns the file’s last access time. The value is not + * really reliable. + */ +time_t Path::atime() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (::stat(nstr.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + struct _stat s; + std::wstring utf16 = utf8_to_utf16(m_path); + + if (_wstat(utf16.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif + + return s.st_atime; +} + +/** + * \note This method accesses the file system. + * + * Returns the file’s last modification time. + */ +time_t Path::mtime() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (::stat(nstr.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + struct _stat s; + std::wstring utf16 = utf8_to_utf16(m_path); + + if (_wstat(utf16.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif + + return s.st_mtime; +} + +/** + * \note This method accesses the file system. + * + * Returns the file’s creation time. 
+ */ +time_t Path::ctime() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (::stat(nstr.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + struct _stat s; + std::wstring utf16 = utf8_to_utf16(m_path); + + if (_wstat(utf16.c_str(), &s) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif + + return s.st_ctime; +} + +///@} + +/** \name Path traversal + * + * What’s in this directory? + */ +///@{ + +/** + * Returns an entry_iterator instance you can use to iterate + * the entries in a directory. Note that the list somewhere + * always includes the "." (current directory) and ".." + * (parent directory) entries. + */ +entry_iterator Path::begin_entries() const +{ + return entry_iterator(this); +} + +/** + * Returns the terminal iterator you test for in order to + * find out whether the iteration is complete. + */ +entry_iterator Path::end_entries() const +{ + return entry_iterator(); +} + +/** + * \note This method accesses the file system. + * + * This method assumes the path is a directory and returns + * a list of all entries in it. The items in the list follow + * the order of the items on the file system, i.e. for most + * applications they are to be considered unsorted. + * + * \see children() + */ +std::vector<Path> Path::entries() const +{ + std::vector<Path> results; + for(entry_iterator iter=begin_entries(); iter != end_entries(); iter++) { + results.push_back(*iter); + } + + return results; +} + +/** + * \note This method accesses the file system. + * + * This method assumes the path is a directory and returns + * a list of all its children. Children are all entries + * in the directory *except* for the entries for the directory + * itself and its parent directory. + * + * Or for short, this method is the same as entries() except + * the return value does not include the "." and ".." entries. 
+ * + * \see entries() + */ +std::vector<Path> Path::children() const +{ + std::vector<Path> results; + for(entry_iterator iter=begin_entries(); iter != end_entries(); iter++) { + if (*iter != Path(".") && *iter != Path("..")) + results.push_back(*iter); + } + + return results; +} + +/** + * \note This method accesses the file system. + * + * Recursively traverse the directory structure below the referenced + * path. Each entry will be passed to the callback while traversing + * from top to bottom. If the entry passed is a directory, you can return + * true if you want to traverse that directory down or false if you + * don't want to. If the entry passed is not a directory, the + * callback's return value is ignored. + * + * The callback will never be passed "." and ".." entries. All paths + * passed to the callback retain the full prefix, i.e. if you + * have this structure: + * + * ~~~~~~~~~~~~~~~~ + * foo + * bar/ + * baz.txt + * ~~~~~~~~~~~~~~~~ + * + * Then find() will give you these paths in this order: `foo`, + * `foo/bar`, and `foo/bar/baz.txt`, rather than just the sole + * basename (which you can still obtain by calling basename() on the + * argument). + * + * \param cb Callback that takes the currently examined path. + * + * \remark Do not assume any order for the paths you receive, + * except that you will be given a directory entry before you + * are given its child entries. + */ +void Path::find(bool (*cb)(const Path& entry)) const +{ + for(entry_iterator iter=begin_entries(); iter != end_entries(); iter++) { + // Skip . and .. + if (iter->str() != "." && iter->str() != "..") { + Path path = join(*iter); + if (cb(path) && path.is_directory()) { + path.find(cb); + } + } + } +} + +///@} + +/** \name Path status information + * + * Query information on the path. + */ +///@{ + + +/** + * \note This method acceses the filesystem. + * + * Checks if the file exists. 
Note that if you don’t have + * sufficient rights for the check on the given path, this + * method will throw an exception. + */ +bool Path::exists() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + + if (access(nstr.c_str(), F_OK) == -1) { + int errsav = errno; + if (errsav == ENOENT) { + return false; + } + else { + throw(Pathie::ErrnoError(errsav)); + } + } + else + return true; +#elif defined(_WIN32) + std::wstring utf16 = utf8_to_utf16(m_path); + if (_waccess(utf16.c_str(), F_OK) == -1) { + int errsav = errno; + if (errsav == ENOENT) { + return false; + } + else { + throw(Pathie::ErrnoError(errsav)); + } + } + else + return true; +#else +#error Unsupported system. +#endif +} + +/** + * \note This method acceses the filesystem. + * + * Checks if this file is a symbolic link; also + * works with NTFS symlinks on Windows. Returns false + * rather than erroring out if the referenced path does + * not exist. + */ +bool Path::is_symlink() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (lstat(nstr.c_str(), &s) < 0) { + int errsav = errno; + + if (errsav == ENOENT) + return false; + else + throw(Pathie::ErrnoError(errsav)); + } + + if (S_ISLNK(s.st_mode)) + return true; + else + return false; +#elif defined(_WIN32) + if (!exists()) + return false; + + return false; + // ntifs.h is currently not included in msys2 + //std::wstring path = utf8_to_utf16(m_path); + //return is_ntfs_symlink(path.c_str()); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method acceses the filesystem. + * + * Checks if this is a directory. Returns false if the + * referenced path does not exist rather than erroring out. + */ +bool Path::is_directory() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (::stat(nstr.c_str(), &s) < 0) { + int errsav = errno; + + // "Not found" means it isn’t a directory. 
+ if (errsav == ENOENT) + return false; + else + throw(Pathie::ErrnoError(errsav)); + } + + if (S_ISDIR(s.st_mode)) + return true; + else + return false; +#elif defined(_WIN32) + struct _stat s; + std::wstring utf16 = utf8_to_utf16(m_path); + if (_wstat(utf16.c_str(), &s) < 0) { + int errsav = errno; + + if (errsav == ENOENT) + return false; + else + throw(Pathie::ErrnoError(errsav)); + } + + return s.st_mode & S_IFDIR; +#else +#error Unsupported system. +#endif +} + +/** + * \note This method accesses the filesystem. + * + * Checks if this is a file. Returns false + * if the referenced path does not exist rather + * than erroring out. + */ +bool Path::is_file() const +{ +#if defined(_PATHIE_UNIX) + struct stat s; + std::string nstr = native(); + + if (::stat(nstr.c_str(), &s) < 0) { + int errsav = errno; + + if (errsav == ENOENT) + return false; + else + throw(Pathie::ErrnoError(errsav)); + } + + if (S_ISREG(s.st_mode)) + return true; + else + return false; +#elif defined(_WIN32) + struct _stat s; + std::wstring utf16 = utf8_to_utf16(m_path); + if (_wstat(utf16.c_str(), &s) < 0) { + int errsav = errno; + + if (errsav == ENOENT) + return false; + else + throw(Pathie::ErrnoError(errno)); + } + + return s.st_mode & S_IFREG; +#else +#error Unsupported system. +#endif +} + +///@} + +/** \name Utility methods + * + * These methods operate on the file or directory referenced + * by the path. + */ +/// @{ + +/** + * \note This method writes to the filesystem. + * + * Creates the referenced directory non-recursively, + * i.e. parent directories are not created. Trying + * to create a directory below a nonexistant directory + * will result in an ErrnoError exception. + * + * \remark UNIX note: The directory is created with RWX permissions + * for everyone, but filtered by your current `umask` before applied + * to disk. 
+ * + * \see mktree() + */ +void Path::mkdir() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + + if (::mkdir(nstr.c_str(), S_IRWXU | S_IRWXG | S_IRWXO) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + std::wstring utf16 = utf8_to_utf16(m_path); + + if (_wmkdir(utf16.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the filesystem. + * + * Deletes the referenced directory, which is required + * to be empty, if not, an ErrnoError will be thrown. + * + * This cannot be used to delete a file rather than a + * directory. + * + * \see remove() unlink() + */ +void Path::rmdir() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + + if (::rmdir(nstr.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + std::wstring utf16 = utf8_to_utf16(m_path); + if (_wrmdir(utf16.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the filesystem. + * + * Deletes the referenced file. This cannot be used to + * delete a directory rather than a file. + * + * \see remove() rmdir() + */ +void Path::unlink() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + if (::unlink(nstr.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + std::wstring utf16 = utf8_to_utf16(m_path); + if (_wunlink(utf16.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the filesystem. + * + * Delete this path, regardless of whether it is a file + * or an empty directory. This method can’t be used to + * delete a directory that isn’t empty. 
+ * + * \see rmdir() unlink() + */ +void Path::remove() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + + if (::remove(nstr.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + std::wstring utf16 = utf8_to_utf16(m_path); + bool result = false; + + /* On Windows, `_wremove()` doesn’t work on directories. This + * function uses the apropriate native Win32API function + * calls accordingly therefore. */ + if (is_directory()) + result = RemoveDirectoryW(utf16.c_str()); + else + result = DeleteFileW(utf16.c_str()); + + if (!result) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the file system. + * + * This method provides a functionality akin to the UNIX `mkdir -p` + * command, i.e. it creates the referenced directory, and if necessary, + * also creates all parent directories. Note this method does not + * throw an ErrnoError if the referenced directory already exists; + * it just does nothing. + * + * \see mkdir() + */ +void Path::mktree() const +{ + // Root is required to exist + if (is_root()) + return; + + if (!is_directory()) { + Path p = parent(); + + if (!p.is_directory()) { + p.mktree(); + } + + mkdir(); + } + +} + +/** + * \note This method accesses the filesystem. + * + * Open the referenced path as a file with the given mode. + * Refer to your preferred C documentation for the value + * of the `mode` parameter. + * + * As with all methods of this library, Unicode filenames + * are handled properly on both UNIX and Windows by transcoding + * to UTF-16LE on Windows. Therefore, on UNIX the file + * is opened using `fopen()`, and on Windows it is opened + * using `_wfopen()`. 
Thankfully, as an exception + * to Microsoft’s wchar-them-all rule, it is possible to close + * a file that is opened with `_wfopen()` by means of the + * regular `fclose()` function, which saves me from implementing + * a wrapper around the C `FILE*` pointer to abstract the problem. + * + * In contrast to the original `fopen()`, this method throws an + * ErrnoError exception if the call fails, i.e. if `fopen()` + * returns NULL. As a result, this method will _never_ return + * a NULL pointer. + * + * Here’s an example of how to use this method (with error checking + * omitted): + * + * ~~~~~~~~~~~~~~~~~ c++ + * Path p("Unicöde file.txt"); + * FILE* p_file = p.fopen("w"); + * fwrite("A", 1, 1, p_file); + * fclose(p_file); + * ~~~~~~~~~~~~~~~~~ + * + * This will create a file named "Unicöde file.txt" both on UNIX and Windows. + * + * \param[in] mode File open mode as per the C `fopen()` documentation. + * + * \remark Don’t forget you have to close the file using `fclose()`, which + * works, as explained, both on UNIX and Windows. `fclose()` is + * not wrapped by this library, use your C library’s implementation + * directly. + * + * \remark The file’s actual _contents_ are not affected in any way + * by this method. They are outside the scope of this library; note + * however that with regard to line endings you might want to consider + * the "b" mode modifier for binary files. 
+ * + * \see [Microsoft’s documentation on `fopen()` and `_wfopen()`](http://msdn.microsoft.com/en-us/library/yeby3zcb.aspx) + */ +FILE* Path::fopen(const char* mode) const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + + FILE* ptr = ::fopen(nstr.c_str(), mode); + if (ptr) + return ptr; + else + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + std::wstring utf16_path = utf8_to_utf16(m_path); + std::wstring utf16_mode = utf8_to_utf16(mode); + FILE* ptr = _wfopen(utf16_path.c_str(), utf16_mode.c_str()); + + if (ptr) + return ptr; + else + throw(Pathie::ErrnoError(errno)); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the filesystem. + * + * Sets the file’s modification and access times to the + * current time. If the file does not yet exist, it is created. + * + * This is akin to the UNIX `touch` command. + */ +void Path::touch() const +{ +#if defined(BSD) // FreeBSD didn’t have futimens() yet as of testing (december 2014) + FILE* p_file = Path::fopen("a"); + if (futimes(fileno(p_file), NULL) < 0) { + fclose(p_file); + throw(Pathie::ErrnoError(errno)); + } + + fclose(p_file); +#elif defined(_PATHIE_UNIX) + FILE* p_file = Path::fopen("a"); + // futimens() is considered the modern variant of doing this + // (at least according to utimes(2) on my Linux system). 
+ if (futimens(fileno(p_file), NULL) < 0) { + fclose(p_file); + throw(Pathie::ErrnoError(errno)); + } + + fclose(p_file); +#elif defined(_WIN32) + // Create file if it does not exist yet + if (!exists()) { + FILE* p_file = Path::fopen("a"); + fclose(p_file); + } + + SYSTEMTIME currenttime; + GetSystemTime(¤ttime); + + FILETIME newtime; + if (SystemTimeToFileTime(¤ttime, &newtime) == 0) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + + std::wstring utf16 = utf8_to_utf16(m_path); + HANDLE filehandle = CreateFileW(utf16.c_str(), FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING, 0, NULL); + if (filehandle == INVALID_HANDLE_VALUE) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + + if (SetFileTime(filehandle, NULL, &newtime, &newtime) == 0) { + int errsav = GetLastError(); + CloseHandle(filehandle); + throw(Pathie::WindowsError(errsav)); + } + + CloseHandle(filehandle); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the filesystem. + * + * This method, which is akin to the UNIX "rm -r" command, removes + * the entire referenced directory hierarchy recursively, including + * any files and directories contained therein. + */ +void Path::rmtree() const +{ + if (is_directory()) { + std::vector<Path> kids = children(); + + for(std::vector<Path>::iterator iter=kids.begin(); iter != kids.end(); iter++) { + join(*iter).rmtree(); + } + + rmdir(); + } + else { // file or similar + unlink(); + } +} + +/** + * \note This method writes to the filesystem. + * + * This method makes the referenced file a symbolic link + * to the path passed as an argument. On Windows, an + * NTFS symlink is created. + * + * \remark On Windows, this function requires that the process holds + * the `SE_CREATE_SYMBOLIC_LINK_NAME` privilege or it will fail with a + * WindowsError exception whose error code is 1314 + * (`ERROR_PRIVILEGE_NOT_HELD`). 
+ */ +void Path::make_symlink(const Path& target) const +{ +#if defined(_PATHIE_UNIX) + std::string target_nstr = target.native(); + std::string nstr = native(); + + if (symlink(target_nstr.c_str(), nstr.c_str()) < 0) + throw(Pathie::ErrnoError(errno)); +#elif defined(_WIN32) + std::wstring source = utf8_to_utf16(m_path); + std::wstring target2 = utf8_to_utf16(target.m_path); + + DWORD flags = 0; + if (target.is_directory()) + flags = SYMBOLIC_LINK_FLAG_DIRECTORY; + + if (CreateSymbolicLinkW(source.c_str(), target2.c_str(), flags) == 0) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } +#else +#error Unsupported system. +#endif +} + +/** + * \note This method accesses the file system. + * + * Treats the referened path as a symlink and reads in its target, + * returning it as a new Path intance. Supports NTFS symlinks. + */ +Path Path::readlink() const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + char buf[PATH_MAX]; + memset(buf, '\0', PATH_MAX); + + ssize_t count = ::readlink(nstr.c_str(), buf, PATH_MAX); + if (count < 0) + throw(Pathie::ErrnoError(errno)); + + return Path(filename_to_utf8(std::string(buf, count))); +#elif defined(_WIN32) + std::wstring utf16_path = utf8_to_utf16(m_path); + + throw(std::runtime_error("NTFS symlinks currently not supported.")); + + // ntifs.h currently not included in msys2.h + //if (!is_ntfs_symlink(utf16_path.c_str())) + // throw(std::runtime_error("Not an NTFS symlink.")); + // + //wchar_t* utf16_target = NULL; + //utf16_target = read_ntfs_symlink(utf16_path.c_str()); + // + //Path result(utf16_to_utf8(utf16_target)); + //free(utf16_target); + // + //return result; +#else +#error Unsupported system. +#endif +} + +/** + * \note This method writes to the file system. + * + * Renames a file to another name without involving file streams. + * + * \param[in] newname The new name of the file. 
+ */ +void Path::rename(Path& newname) const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + std::string newname_nstr = newname.native(); + + if (::rename(nstr.c_str(), newname_nstr.c_str()) != 0) + throw Pathie::ErrnoError(errno); +#elif defined(_WIN32) + std::wstring utf16_oldname = utf8_to_utf16(m_path); + std::wstring utf16_newname = utf8_to_utf16(newname.m_path); + + if (_wrename(utf16_oldname.c_str(), utf16_newname.c_str()) != 0) + throw Pathie::ErrnoError(errno); +#else +#error Unsupported system. +#endif +} + +///@} + +/** \name Operators + * + * C++ operators. + */ +///@{ + +Path& Path::operator=(const Path& path) +{ + // Self-assignment + if (this == &path) + return *this; + + m_path = path.m_path; + return *this; +} + +Path& Path::operator=(const std::string& str) +{ + m_path = str; + return *this; +} + +/** + * Compares two Path instances. Two paths are considered equal + * if their underlying path std::strings are equal. + */ +bool Path::operator==(const Path& other) const +{ + return m_path == other.m_path; +} + +/** + * Compares two Path instances. Two paths are considered inequal + * if their underlying path std::strings are inequal. + */ +bool Path::operator!=(const Path& other) const +{ + return m_path != other.m_path; +} + +/** + * Compares two Path instances. The referenced path is + * considered smaller than `other` if the underlying path + * std::string of the referenced path is smaller than the + * one of `other`. + */ +bool Path::operator<(const Path& other) const +{ + return m_path < other.m_path; +} + +/** + * Compares two Path instances. The referenced path is + * considered greater than `other` if the underlying path + * std::string of the referenced path is greater than the + * one of `other`. + */ +bool Path::operator>(const Path& other) const +{ + return m_path > other.m_path; +} + +/** + * Compares two Path instances. 
The referenced path is + * considered smaller than or equal to `other` if the underlying path + * std::string of the referenced path is smaller than or equal to the + * one of `other`. + */ +bool Path::operator<=(const Path& other) const +{ + return m_path <= other.m_path; +} + +/** + * Compares two Path instances. The referenced path is + * considered greater than or equal to `other` if the underlying path + * std::string of the referenced path is greater than or equal to the + * one of `other`. + */ +bool Path::operator>=(const Path& other) const +{ + return m_path >= other.m_path; +} + +/** + * This method allows you to access a specific component in the + * path string. The first component has the index 0; for an + * absolute path, it will be the / entry. + * + * If you specify an index that is beyond the end of the path, + * an std::out_of_range exception will be thrown. + * + * \param index Index of the component to retrieve. + * + * \see component_count() + * + * \remark This operator loops over the path string internally + * each time you request an element. If you want to index the + * path consecutively, you might consider using burst(), which + * can be more performant as it only loops once over the path + * string. + */ +Path Path::operator[](size_t index) const +{ + // Absolute path index 0 needs special treatment + if (index == 0 && m_path[0] == '/') + return Path("/"); + + size_t pos = 0; + size_t lastpos = 0; + size_t i = 0; + while ((pos = m_path.find("/", pos)) != string::npos) { // Assignment intended + if (i == index) + return Path(m_path.substr(lastpos, pos - lastpos)); + + lastpos = pos + 1; + pos++; + i++; + } + + // Last element requested + if (index == i) + return Path(m_path.substr(lastpos)); + + // Out of range + throw(std::out_of_range("Index out of path range")); +} + +/** + * Appends a /, then the new component, then calls expand(), and + * finally returns a new Path instance. + * + * \param path New component. 
+ * + * \returns New Path instance. + */ +Path Path::operator/(Path path) const +{ + return join(path); +} + +/** + * Appends a /, then the new component, and + * finally returns a new Path instance. + * + * \param str New component. + * + * \returns New Path instance. + */ +Path Path::operator/(std::string str) const +{ + return join(str); +} + +/** + * Appends a / followed by the new component `path` onto this + * instance and returns this instance. + * + * \param path New component. + * + * \returns The receiver. + */ +Path& Path::operator/=(Path path) +{ + *this = join(path); + return *this; +} + +/** + * Appends a / followed by the new component `path` onto this + * instance and returns this instance. + * + * \param str New component. + * + * \returns The receiver. + */ +Path& Path::operator/=(std::string str) +{ + *this = join(str); + return *this; +} + +/** + * Allows you to insert Pathie::Path instances into `std::cout`. + * + * ~~~~~~~~~~ c++ + * Pathie::Path p("foo/bar"); + * std::cout << p << std::endl; + * ~~~~~~~~~~ + */ +std::ostream& operator<<(std::ostream& stream, const Path& p) +{ + return stream << p.str(); +} + +///@} + +#ifdef _PATHIE_UNIX +/* + * Returns the XDG directory for the given environment variable, + * if defined, otherwise returns home() with `defaultpath` + * appended. + * + * See http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html + * for values. 
+ */ +Path Path::get_xdg_dir(const std::string& envvarname, const std::string& defaultpath) +{ + std::string env_nstr = utf8_to_filename(envvarname); // environment is encoded the same as the filenames + char* env_value = getenv(env_nstr.c_str()); + if (env_value) + return Path(filename_to_utf8(env_value)); + + return Path::home().join(defaultpath); +} + +std::vector<Path> Path::get_xdg_dirlist(const std::string& envvarname, const std::string& defaultlist) +{ + std::string env_nstr = utf8_to_filename(envvarname); // environment is encoded the same as the filenames + char* env_value = getenv(env_nstr.c_str()); + std::string envstr; + if (env_value && strcmp(env_value, "") != 0) + envstr = filename_to_utf8(env_value); // Encode entire env string to UTF-8 + else + envstr = defaultlist; + + size_t pos = 0; + size_t lastpos = 0; + std::vector<Path> results; + while ((pos = envstr.find(":")) != string::npos) { + results.push_back(Path(envstr.substr(lastpos, pos))); // envstr is already UTF-8 + + lastpos = pos + 1; + pos++; + } + + results.push_back(envstr.substr(lastpos)); + + return results; +} + +std::string Path::get_xdg_userdir_setting(const std::string& setting) +{ + // XDG user-dirs spec recommends (only) checking for $XDG_CONFIG_HOME/user-dirs.dirs, + // the files under $XDG_CONFIG_DIRS are not to consider. + Path userconfig = Path::config_dir().join("user-dirs.dirs"); + + if (userconfig.is_file()) { + FILE* p_file = userconfig.fopen("r"); + + char line[256]; + char buf[256]; + bool found = false; + while (!feof(p_file)) { + memset(line, 0, 256); + memset(buf, 0, 256); + + fgets(line, 256, p_file); + + // Ignore comments and empty lines + if (line[0] == '#' || line[0] == '\n') + continue; + + // Extract the setting name from the line, e.g. "DOWNLOAD" for + // "XDG_DOWNLOAD_DIR=...". 
+ strncpy(buf, line + 4, setting.length()); // +4 for "XDG_" + if (strcmp(buf, setting.c_str()) == 0) { + found = true; + break; + } + } + + fclose(p_file); + + // Error out if not found + if (!found) { + std::string msg = "Unknown XDG directory '"; + msg += setting + "' requested."; + throw(std::runtime_error(msg)); + } + + // OK, we have found the correct setting. Extract the value now. + // »XDG_DOWNLOAD_DIR="$HOME/Downloads"« + char* start = strchr(line, '"') + 1; // Exclude " itself + size_t len = strcspn(start, "\""); + + if (!start) // Malformed + throw(std::runtime_error("Malformed XDG config file (quote mismatch/missing quotes)!")); + + memset(buf, 0, 256); + strncpy(buf, start, len); + // buf now contains the part between the quotes followed by NUL bytes + + char result[PATH_MAX]; + memset(result, 0, PATH_MAX); + + // Replace $HOME with env value of $HOME + start = strstr(buf, "$HOME"); + if (start) { // Contains $HOME + char* homestr = getenv("HOME"); + if (!homestr) + throw(std::runtime_error("$HOME not set!")); + + // Stuff before $HOME + strncpy(result, buf, ((char*)start) - ((char*)buf)); // Compiler does not allow doing pointer arithmetics with char[], but with char* ??? They should be the same... + // $HOME replacement + strcpy(result + strlen(result), homestr); + // Suff after $HOME ($HOME is exactly 5 chars long) + strcpy(result + strlen(result), start + 5); + } + else { // No $HOME included. Copy everything verbosely. + strcpy(result, buf); + } + + // result now holds the final result with lots of NUL bytes at the end. + return std::string(result); + } + + // No XDG configuration on this system, use $HOME. + return Path::home().str(); +} +#endif + +/** \name Program data directories + * + * Directories containing program data other than files the + * user works with (e.g. configuration files). + */ +///@{ + +/** + * Returns the directory for application- and user-specific permanent + * data. 
+ * + * On UNIX, this returns $XDG_DATA_HOME, defaulting to ~/.local/share. + * + * On Windows, this returns the roaming appdata folder, which defaults + * to `C:/Users/username/AppData/Roaming`. + */ +Path Path::data_dir() +{ +#if defined(_PATHIE_UNIX) + return get_xdg_dir("XDG_DATA_HOME", ".local/share"); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_APPDATA, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \warning This method may behave unexpectedly on Windows; see below. + * + * Returns the directory for application- and user-specific configuration + * files. + * + * On UNIX, this returns $XDG_CONFIG_HOME, defaulting to ~/.config. + * + * Windows does not have a notion of a directory for configuration + * files, hence some return value for this method had to be chosen. I + * think it is best to not clutter a user’s home directory with config + * files, and [this stackoverflow thread](https://stackoverflow.com/questions/2243895/location-to-put-user-configuration-files-in-windows) + * suggests to place the files in the data_dir(). That however yields + * the problem of possible name clashes when you want to name a file + * the same in data_dir() and config_dir(). It is not an option to + * fall back to the "LocalSettings" directory instead, because 99% of + * the applications written are "roaming" applications rather than + * "local" ones, and any use of the "LocalSettings" directory + * (available via cache_dir()) must be a specific decision of the + * programmer therefore. The decision was made that this method on + * Windows should return the same as data_dir() without a specific + * encforcing reason, but, as said, some decision needed to be + * made. 
As a consequence, you have to be careful to not accidentally + * place equally named files in data_dir() and config_dir() as they + * would conflict. + * + * I want to point out that on Windows, configuration files are rather + * unusual. The normal way to save configuration settings on Windows + * is use of the Windows Registry, which is beyond the scope of a + * path manipulation library like Pathie. + */ +Path Path::config_dir() +{ +#if defined(_PATHIE_UNIX) + return get_xdg_dir("XDG_CONFIG_HOME", ".config"); +#elif defined(_WIN32) + return data_dir(); +#else +#error Unsupported system. +#endif +} + +/** + * Returns the directory for application- and user-specific cache files, i.e. + * files that, when deleted, do not impact the application apart from resetting + * it to some default values. A typical example for cache data is saving the + * folder where the user last opened a file, so that when he starts the application + * the next time and wants to open a file, is directly taken to the directory + * where he last picked a file from. Positions of windows could also be saved + * here, allowing application windows to be placed exactly where they were + * when the application was closed last time. In short, store the unimportant + * stuff here and be prepared the data is gone on application startup. + * + * On UNIX, this returns $XDG_CACHE_HOME, defaulting to ~/.cache. + * + * On Windows, this method returns the LOCAL_APPDATA folder, which means that + * in corporate setups using Windows roaming the data will not be available + * if you log in on another machine (which is expected, cf. the directory + * saving example above, which would break if this was saved into the roaming + * folder). This defaults to `C:/Users/username/AppData/Local`. 
+ */ +Path Path::cache_dir() +{ +#if defined(_PATHIE_UNIX) + return get_xdg_dir("XDG_CACHE_HOME", ".cache"); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_LOCAL_APPDATA, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * Returns the directory for application- and user-specific volatile + * runtime data, i.e. data that WILL be deleted once the user logs + * off. + * + * On UNIX, this returns $XDG_RUNTIME_DIR. That environment variable is + * required to be defined as per the XDG standard, and if it isn’t, this + * method prints a warning to the standard error stream and uses the + * value of Path::temp_dir() instead. + * + * On Windows, the return value of this method is equivalent to that + * of temp_dir() always. + */ +Path Path::runtime_dir() +{ +#if defined(_PATHIE_UNIX) + std::string nstr = utf8_to_filename("XDG_RUNTIME_DIR"); // environment is encoded the same as paths + char* env_value = getenv(nstr.c_str()); + if (env_value) + return Path(filename_to_utf8(env_value)); + + Path tmp = Path::temp_dir(); + std::cerr << "(pathie XDG) WARNING: XDG_RUNTIME_DIR not defined in environment. Falling back to '" << tmp.str() << "'." << std::endl; + + return tmp; +#elif defined(_WIN32) + return temp_dir(); +#else +#error Unsupported system. +#endif +} + +/** + * Returns the root directory for temporary directories, i.e. + * directories which are expected to vanish when the application + * closes. Do not assume that anything you created in this + * directory still exists after your application exited and is + * restarted. + * + * \returns Path instance for temporary directory. + * + * \remark On UNIX, this function honours the value of the + * environment variable $TMPDIR. If that is not defined, the standard + * "/tmp" location will be returned. 
On Windows, GetTempPath() is + * called to retrieve the path, which in turn honours the environment + * variables $TMP, $TEMP, and $USERPROFILE (in that order); if all + * of them are undefined, a Windows-predefined path is returned, + * which defaults to `C:/Users/username/AppData/Local/Temp`. + * + * \see mktmpdir(3), [GetTempPath()](http://msdn.microsoft.com/en-us/library/windows/desktop/aa364992%28v=vs.85%29.aspx) + */ +Path Path::temp_dir() +{ +#if defined(_PATHIE_UNIX) + std::string nstr = utf8_to_filename("TMPDIR"); // environment is encoded the same as paths + char* env_value = NULL; + + if ((env_value = getenv(nstr.c_str()))) // Single = intended + return Path(filename_to_utf8(env_value)); + + + return Path("/tmp"); // As per the Filesystem Hierarchy Standard. +#elif defined(_WIN32) + wchar_t buf[MAX_PATH +1]; // See http://msdn.microsoft.com/en-us/library/windows/desktop/aa364992%28v=vs.85%29.aspx for the +1 + DWORD count = GetTempPathW(MAX_PATH + 1, buf); + + if (count == 0) { + DWORD err = GetLastError(); + throw(Pathie::WindowsError(err)); + } + + std::wstring utf16(buf, count); + return utf16_to_utf8(utf16); +#else +#error Unsupported system. +#endif +} + +///@} + +/** + * Create a temporary directory (with permissions set to + * 0700 on UNIX). The directory is guaranteed to be empty, and + * it is your responsibility to recursively remove the + * directory on program exit (or earlier). + * + * \param[in] name (`"tmpd"`) This will be used as part of + * the name of the directory, _not_ as the entire name. + * + * \returns Path instance for the new temporary directory. + * + * \remark Parts of the random name are generated with the + * C rand() function, so you might want to call srand() + * before using this function in order to seed the random + * number generator with a useful value. 
+ */ +Path Path::mktmpdir(const std::string& name /* = "tmpd" */) +{ + Path tmp = Path::temp_dir() / Path(make_tempname(name)); + tmp.mkdir(); + +#ifdef _PATHIE_UNIX + std::string nstr = tmp.native(); + chmod(nstr.c_str(), S_IRWXU); // Silently ignore failure of setting file permissions +#endif + // TODO: How to do that on Windows? + + return tmp; +} + +// Constructs a filename that tries to be unique. +std::string Path::make_tempname(const std::string& namepart) +{ + time_t now; + struct tm* p_nowinfo = NULL; + time(&now); + p_nowinfo = localtime(&now); + + char buf[16]; // 15 + NUL + memset(buf, '\0', 16); + strftime(buf, 16, "%Y%m%d-%H%M%S", p_nowinfo); + std::string timepart(buf, 15); + +#if defined(_PATHIE_UNIX) + std::stringstream ss; + ss << getpid(); + std::string pidpart = ss.str(); +#elif defined(_WIN32) + std::stringstream ss; + ss << GetCurrentProcessId(); + std::string pidpart = ss.str(); +#else +#error Unsupported system. +#endif + + memset(buf, '\0', 16); + short i; + for(i=0; i < 16; i++) + buf[i] = 97 + rand() % 26; // Random char between a and z + + std::string randompart(buf, 15); + + return namepart + "_" + timepart + pidpart + randompart; +} + +#if defined(_PATHIE_UNIX) +/** + * \note Only available on UNIX. Accesses the file system. + * + * Returns $XDG_DATA_DIRS as per the XDG specification. + * If that is not set, returns a vector of paths for + * /usr/local/share and /usr/share. + */ +std::vector<Path> Path::data_dirs() +{ + return get_xdg_dirlist("XDG_DATA_DIRS", "/usr/local/share/:/usr/share/"); +} + +/** + * \note Only available on UNIX. Accesses the file system. + * + * Returns $XDG_CONFIG_DIRS as per the XDG specification. + * If that is not set, returns a vector of paths for + * /etc/xdg (i.e. a one-element vector). 
+ */ +std::vector<Path> Path::config_dirs() +{ + return get_xdg_dirlist("XDG_CONFIG_DIRS", "/etc/xdg"); +} +#endif + +/** \name User data directories + * + * Directories that contain user data like music or text files + * the user works with. + */ +///@{ + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the directory of the user’s desktop. Generally, + * any files placed in this directory will appear on the + * user’s desktop view (the area shown when no windows + * are open). + * + * On UNIX, this is $XDG_DESKTOP_DIR, defaulting to `~/Desktop`. + * Note you likely will receive a localised version (like “Schreibtisch” + * on a German Linux). + * + * On Windows, the default is `C:/Users/username/Desktop` or a localised + * version. + */ +Path Path::desktop_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("DESKTOP")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_DESKTOPDIRECTORY, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the directory for the user’s documents. This is + * not the place for your data files, savegames, or configuration + * files -- it is meant only for textual and other documents you can + * access with an office or similar program. See data_dir() for a directory + * you can store your data into. + * + * On UNIX, this is $XDG_DOCUMENTS_DIR, defaulting to `~/Documents`. + * Note you likely will receive a localised version (like "Dokumente" + * on a German Linux). + * + * On Windows, the default is `C:/Users/username/Documents` or a localised + * version. 
+ */ +Path Path::documents_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("DOCUMENTS")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_PERSONAL, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s download directory. Unfortunately, this function + * is currently unsupported under Windows, because MinGW has not yet + * adapted the necessary win32api changes. + * + * On UNIX, this is $XDG_DOWNLOAD_DIR, defaulting to `~/Downloads`. + * Note you likely will receive a localised version. + */ +Path Path::download_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("DOWNLOAD")); +#elif defined(_WIN32) + // Not available via CSIDL, must use the newer KNOWNFOLDERID system, + // which is not supported by MinGW yet. + throw(std::runtime_error("KNOWNFOLDERID is not supported by MinGW yet, can't retrieve this directory.")); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s music directory. + * + * On UNIX, this is $XDG_MUSIC_DIR, defaulting to `~/Music`. + * Note you likely will receive a localised version (like "Musik" + * on a German Linux). + * + * On Windows, this defaults to `C:/users/username/Music` or a localised + * version. + */ +Path Path::music_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("MUSIC")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_MYMUSIC, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. 
+#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s pictures directory. + * + * On UNIX, this is $XDG_PICTURES_DIR, defaulting to `~/Pictures`. + * Note you likely will receive a localised version (like "Bilder" + * on a German Linux). + * + * On Windows, this defaults to `C:/users/username/Pictures` or a + * localised version. + */ +Path Path::pictures_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("PICTURES")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_MYPICTURES, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s publicshare directory. This directory may + * be exposed to network access on the local network, though this + * is not required. + * + * On UNIX, this is $XDG_PUBLICSHARE_DIR, defaulting to `~/Public`. + * Note you likely will receive a localised version (like "Öffentlich" + * on a German Linux). + * + * On Windows, this defaults to `C:/users/username/AppData/Roaming/Microsoft/Windows/Network Shortcuts`. + */ +Path Path::publicshare_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("PUBLICSHARE")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_NETHOOD, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s directory for document templates. The files + * in this directory can generally be accessed by right-clicking + * in the user’s favourite file manager and selecting "new" followed + * by the desired file. 
The file will then be copied from the templates + * directory into the directory the user works in at the moment. + * + * On UNIX, this is $XDG_TEMPLATES_DIR, defaulting to `~/Templates`. + * Note you likely will receive a localised version (like "Vorlagen" + * on a German Linux). + * + * On Windows, this defaults to `C:/users/username/AppData/Roaming/Microsoft/Windows/Templates`. + */ +Path Path::templates_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("TEMPLATES")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_TEMPLATES, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s directory for videos. + * + * On UNIX, this is $XDG_VIDEOS_DIR, defaulting to `~/Videos` + * or a localised version. + * + * On Windows, this defaults to `C:/users/username/Videos` or a + * localised version. + */ +Path Path::videos_dir() +{ +#if defined(_PATHIE_UNIX) + return Path(get_xdg_userdir_setting("VIDEOS")); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_MYVIDEO, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the file system. + * + * Retrieves the user’s path for application starters. On UNIX, + * this will return a directory (typically `~/.local/share/applications`) + * where you can store XDG `.desktop` files in so they get picked up + * by the desktop environment’s application menu for that user. On Windows, + * the user’s startmenu folder is returned, and any files and directories + * you add there will show up in the user’s startmenu. 
+ * + * \remark On Windows, this is not the global startmenu folder, but the + * user’s specific ones. Other users will not have the entries you put + * here in their startmenu. + */ +Path Path::appentries_dir() +{ +#if defined(_PATHIE_UNIX) + return data_dir().join("applications"); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_STARTMENU, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +///@} + +/** \name Global data directories + * + * Directories that contain data either unrelated to users at all, + * or applicable to all users at once. Be careful to read the + * Windows notes in the documentation of these methods, as Windows + * only supplies are much smaller set of system directories than UNIX. + */ +///@{ + +/** + * Retrieves the global directory for application starters. On UNIX, + * any XDG `.desktop` files you place there should show up in any user’s + * desktop environment’s menu, and on Windows, anything you place there + * should show up in any user’s startmenu. + * + * \param local (true) If true, this method returns the location + * under the `/usr/local` hierarchy, otherwise it returns the + * location under the `/usr` hierarchy. This parameter has no meaning + * on Windows and is ignored. 
+ */ +Path Path::global_appentries_dir(localpathtype local) +{ +#if defined(_PATHIE_UNIX) + if (local == Path::LOCALPATH_LOCAL || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL)) + return Path("/usr/local/share/applications"); + else + return Path("/usr/share/applications"); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_COMMON_STARTMENU, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * Retrieves the directory for immutable application data that isn’t user-specific, + * i.e. which shall be available to all users using the system. + * + * On UNIX, this is `/usr/share`. On Windows, this is `C:/Windows/system32`. + * On Windows, beware conflicts with files of the same name in + * global_config_dir()! + * + * \param local (true) If true, this method returns the location + * under the `/usr/local` hierarchy, otherwise it returns the + * location under the `/usr` hierarchy. This parameter has no meaning + * under Windows and is ignored. + */ +Path Path::global_immutable_data_dir(localpathtype local) +{ +#if defined(_PATHIE_UNIX) + if (local == Path::LOCALPATH_LOCAL || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL)) + return Path("/usr/local/share"); + else + return Path("/usr/share"); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_SYSTEM, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * Retrieves the directory for mutable application data that isn’t user-specific, + * i.e. which shall be available to all users using the system. + * + * On UNIX, this is `/var/lib`. 
On Windows, this is the Application Data folder + * for the "All Users" account. On Windows, this is equivalent to global_cache_dir(), + * so beware file name conflicts on Windows! + * + * \param local (true) If true, this method returns the location + * under the `/var/local` hierarchy, otherwise it returns the + * location under the `/var` hierarchy. This parameter has no meaning + * under Windows and is ignored. + */ +Path Path::global_mutable_data_dir(localpathtype local) +{ +#if defined(_PATHIE_UNIX) + if (local == Path::LOCALPATH_LOCAL || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL)) + return Path("/var/local/lib"); + else + return Path("/var/lib"); +#elif defined (_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_COMMON_APPDATA, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system +#endif +} + +/** + * Retrieves the directory for global cache data, i.e. data, which + * is not essential to the program and can be reconstructed if it + * gets lost. + * + * On UNIX, this returns `/var/cache`. Windows does not have a notion + * of such a directory, hence the value is equal to the return value + * of global_mutable_data_dir(). Therefore: On Windows, beware conflicts if you + * use files of the same name in global_mutable_data_dir() and + * global_cache_dir()! + * + * \param local (true) If true, returns the cache directory for locally installed + * programs, which is `/var/local/cache`. This parameter has no effect under + * systems other than UNIX. 
+ */ +Path Path::global_cache_dir(localpathtype local) +{ +#if defined(_PATHIE_UNIX) + if (local == Path::LOCALPATH_LOCAL || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL)) + return Path("/var/local/cache"); + else + return Path("/var/cache"); +#elif defined(_WIN32) + return global_mutable_data_dir(); +#else +#error Unsupported system. +#endif +} + +/** + * \note On UNIX, this method accesses the filesystem. + * + * Returns the directory for volatile information that will be deleted + * on system shutdown. + * + * On UNIX, this returns `/run` if it exists, otherwise `/var/run`. + * Windows does not have a notion of such a directory; as a replacement, + * `C:/Temp` is returned. + * + * \param local (true) If true, returns the equivalent directory for + * `/run` for locally installed programs, which is `/var/local/run`. This + * parameter has no effect on systems other than UNIX. + */ +Path Path::global_runtime_dir(localpathtype local) +{ +#if defined(_PATHIE_UNIX) + if (local == Path::LOCALPATH_LOCAL || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL)) + return Path("/var/local/run"); + + Path run("/run"); + if (run.exists()) + return run; + else + return Path("/var/run"); +#elif defined(_WIN32) + return Path("C:/Temp"); +#else +#error Unsupported system. +#endif +} + +/** + * Returns the global directory for configuration files. + * + * On UNIX, this is `/etc`. Windows does not really have a notion + * for configuration directories. This method returns the Windows + * system folder for that purpose, typically `C:/Windows/system32`; + * this is equivalent to global_immutable_data_dir(), so be careful + * when you place files of the same name in global_config_dir()! + * + * \param local (true) If true, returns the global configuration + * directory for locally installed programs instead, which is + * `/usr/local/etc`. 
+ */ +Path Path::global_config_dir(localpathtype local) +{ +#if defined(_PATHIE_UNIX) + if (local == Path::LOCALPATH_LOCAL || (local == Path::LOCALPATH_DEFAULT && get_global_dir_default() == Path::LOCALPATH_LOCAL)) + return Path("/usr/local/etc"); + else + return Path("/etc"); + +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_SYSTEM, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); +#else +#error Unsupported system. +#endif +} + +/** + * Retrieves the global directory for self-contained applications, i.e. + * applications that require a directory structure different from the + * Filesystem Hierarchy Standard (FHS). Such programs are an exception + * under UNIX, but are the regular case on Windows. The programs placed + * in this directory are intended to be available to all users using the + * system. + * + * Under UNIX, this method returns the `/opt` directory. On Windows, + * it returns the Program Files directory (typically `C:\Program Files`). + * + * \note On UNIX, the FHS mandates that programs installed under + * `/opt` do not use the usual directories for variable information + * returned by global_mutable_data_dir() and global_cache_dir(), but + * instead use `/var/opt`. + */ +Path Path::global_programs_dir() +{ +#if defined(_PATHIE_UNIX) + return Path("/opt"); +#elif defined(_WIN32) + wchar_t dir[MAX_PATH]; + HRESULT result = SHGetFolderPathW(NULL, CSIDL_PROGRAM_FILES, NULL, SHGFP_TYPE_CURRENT, dir); + if (result != S_OK) + throw(Pathie::WindowsHresultError(result)); + + return Path(utf16_to_utf8(dir)); + +#else +#error Unsupported system. +#endif +} + +///@} + +/** \name Miscellaneous static functions + * + * Other functions that didn’t fit somewhere else. + */ +///@{ + +/// \note This method accesses the filesystem. +/// +/// Uses a shell-like glob pattern on the current working directory. 
+/// Typically available patterns include "*" for a string of +/// arbitrary length and "?" for a string of length one. +/// +/// Refer to glob(7) for glob patterns available on UNIX. +/// Refer to [MSDN](http://msdn.microsoft.com/en-us/library/windows/desktop/aa364418%28v=vs.85%29.aspx) +/// for glob patterns available on Windows. +/// +/// Windows does not support recursive patterns like +/// +/// \verbatim **/* \endverbatim +/// +/// or +/// +/// \verbatim foo/*/bar \endverbatim +/// +/// . This will result in a Pathie::WindowsError exception +/// with Windows error code 123 (“invalid filename”). For cross-platform +/// recursive matching, you can try to combine find() and fnmatch(). +/// +/// \param[in] pattern Glob pattern. +/// \param flags (`0`) Globbing flags. Refer to glob(3) for +/// possible values; the parameter is ignored on Windows. +/// +/// \returns A vector of Path instances that matched the glob +/// pattern. +/// +/// \remark Glob patterns on UNIX are generally much more powerful than +/// those on Windows. Be careful when using anything apart from "*" and "?" +/// patterns on Windows. +/// +/// \see dglob() fnmatch() +/// +std::vector<Path> Path::glob(const std::string& pattern, int flags /* = 0 */) +{ +#if defined(_PATHIE_UNIX) + std::string nstr = utf8_to_filename(pattern); + glob_t globinfo; + int result = ::glob(nstr.c_str(), flags, NULL, &globinfo); + + if (result == GLOB_NOMATCH) { + return std::vector<Path>(); // Empty vector + } + else if (result == 0) { + std::vector<Path> result; + + for(size_t i=0; i < globinfo.gl_pathc; i++) { + result.push_back(Path(filename_to_utf8(globinfo.gl_pathv[i]))); + } + + globfree(&globinfo); + return result; + } + else { + throw(GlobError(result)); + } +#elif defined(_WIN32) + std::vector<Path> results; + std::wstring utf16_pattern = utf8_to_utf16(pattern); + + /* Windows’ FindFirstFile()/FindNextFile() returns bare file names. 
+ * However, to ensure output similar to the UNIX version, we prepend + * the pattern’s stem if a slash / is found in the pattern; FindFirstFile()/ + * FindNextFile() don’t support recursive matching anyway, so this is safe. */ + std::string stem; + size_t pos = 0; + if ((pos = pattern.rfind("/")) != string::npos) // Single = intended + stem = pattern.substr(0, pos + 1); // Trailing / included + + // Prepare + HANDLE filehandle = INVALID_HANDLE_VALUE; + WIN32_FIND_DATAW finddata; + memset(&finddata, '\0', sizeof(WIN32_FIND_DATA)); + + // Try finding the first file + filehandle = FindFirstFileW(utf16_pattern.c_str(), &finddata); + + // Check if some error happened + if (filehandle == INVALID_HANDLE_VALUE) { + DWORD errval = GetLastError(); + if (errval == ERROR_FILE_NOT_FOUND) // According to docs, this means no matching files were found. Return empty list. + return results; + else if (errval != ERROR_SUCCESS) + throw Pathie::WindowsError(errval); + } + + // All well, save this one... + results.push_back(Path(stem + utf16_to_utf8(finddata.cFileName))); + + // ...and continue. + while (FindNextFileW(filehandle, &finddata)) { + results.push_back(Path(stem + utf16_to_utf8(finddata.cFileName))); + } + + DWORD errval = GetLastError(); + FindClose(filehandle); + + if (errval != ERROR_NO_MORE_FILES) + throw(Pathie::WindowsError(errval)); + + return results; +#else +#error Unsupported system. +#endif +} + +///@} + +/** \name Miscellaneous member functions + * + * Methods that didn’t fit anywhere else. + */ + +///@{ + +/** + * This method tests whether the referenced path matches the + * given pattern under the rules of the local glob-matching + * function. Note this method does _not_ access the filesystem, + * hence there is no guarantee that the referenced path exists. + * + * \param[in] pattern The pattern to match. + * \param flags Any flags. This parameter is ignored on Windows, + * for UNIX refer to the fnmatch(3) manpage. 
+ * + * \returns Whether the path matches the pattern. + * + * \remark On Windows, this method uses the [PathMatchSpec()](http://msdn.microsoft.com/en-us/library/bb773727%28VS.85%29.aspx) + * function; on UNIX, it uses fnmatch(3). + * + * \remark Windows’s `PathMatchSpec()` function does not support + * recursive matching patterns, while the UNIX fnmatch(8), relying + * on glob(7), does. + * + * \remark Glob patterns on UNIX are generally much more powerful than + * those on Windows. Be careful when using anything apart from "*" and "?" + * patterns on Windows. + * + * \see glob() dglob() + */ +bool Path::fnmatch(const std::string& pattern, int flags /* = 0 */) const +{ +#if defined(_PATHIE_UNIX) + std::string nstr = native(); + std::string pattern_nstr = utf8_to_filename(pattern); + return ::fnmatch(pattern_nstr.c_str(), nstr.c_str(), flags) == 0; +#elif defined(_WIN32) + std::wstring utf16path = utf8_to_utf16(m_path); + std::wstring utf16pattern = utf8_to_utf16(pattern); + return PathMatchSpecW(utf16path.c_str(), utf16pattern.c_str()); +#else +#error Unsupported system. +#endif +} + +/** + * \note This method acceses the filesystem. + * + * Like glob(), but prepends the referenced path to the glob + * pattern. + * + * \see glob() fnmatch() + */ +std::vector<Path> Path::dglob(const std::string& pattern, int flags /* = 0 */) const +{ + return glob(m_path + "/" + pattern, flags); +} + +/** + * Appends a /, then the new component, and + * finally returns a new Path instance. + * + * \param path New component. + * + * \returns New Path instance. + */ +Path Path::join(Path path) const +{ + Path p(m_path + "/" + path.str()); + return p; +} + +/** + * Appends a /, then the new component, and + * finally returns a new Path instance. + * + * \param str New component. + * + * \returns New Path instance. 
+ */ +Path Path::join(std::string str) const +{ + Path path(m_path + "/" + str); + return path; +} + +/** + * Replaces the current extension with the given new extension + * and returns the result. If the referenced path doesn’t have + * a file extension currently, the new extension is appended. + * + * \param new_extension The new extension. If the leading point + * is missing, it will automatically be prepended. + * + * \returns The new Path instance. + */ +Path Path::sub_ext(std::string new_extension) const +{ + // If the point is missing, add it to the beginning. + if (new_extension.find(".") == string::npos) + new_extension.insert(0, "."); + + std::string old_extension = extension(); + if (old_extension.empty()) { + return Path(m_path + new_extension); + } + else { + size_t pos = m_path.find(old_extension); + return Path(m_path.substr(0, pos) + new_extension); + } +} + +///@} diff --git a/src/3rd_party/pathie-cpp/src/pathie.cpp b/src/3rd_party/pathie-cpp/src/pathie.cpp new file mode 100644 index 00000000..9df1f733 --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/pathie.cpp @@ -0,0 +1,226 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "../include/pathie.hpp" +#include "../include/errors.hpp" + +#if defined(_WIN32) +#include <windows.h> + +/** + * Converts a UTF-16LE string into UTF-8. Only available + * on Windows. + */ +std::string Pathie::utf16_to_utf8(std::wstring str) +{ + int size = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), str.length(), NULL, 0, NULL, NULL); + + char* utf8 = (char*) malloc(size); // sizeof(char) = 1 per ANSI C standard. + memset(utf8, 0, size); + + size = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), str.length(), utf8, size, NULL, NULL); + + if (size == 0) + throw(Pathie::WindowsError(GetLastError())); + + std::string utf8str(utf8, size); + free(utf8); + + return utf8str; +} + +/** + * Converts a UTF-8 string into UTF-16LE. Only available + * on Windows. 
+ */ +std::wstring Pathie::utf8_to_utf16(std::string str) +{ + int count = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), NULL, 0); + + wchar_t* utf16 = (wchar_t*) malloc(count * sizeof(wchar_t)); + memset(utf16, 0, count * sizeof(wchar_t)); + + count = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), utf16, count); + + if (count == 0) + throw(Pathie::WindowsError(GetLastError())); + + std::wstring utf16str(utf16, count); + free(utf16); + + return utf16str; +} +#endif + +#ifdef _PATHIE_UNIX +#include <cstring> +#include <cstdlib> +#include <errno.h> +#include <iconv.h> +#include <langinfo.h> +#include <sys/param.h> // defines "BSD" macro on BSD systems + +/* iconv() function family is available on every POSIX-conformant + * system. In POSIX.1-2008, it’s specified in the "System Interfaces" + * section. + * + * nl_langinfo() is also specified by POSIX, though I’ve found no evidence + * that iconv() is required to understand the encoding output by nl_langinfo(CODESET). + * From checking on Linux and FreeBSD, this however seems very likely, so we have + * to assume that this always is the case. + */ + +/** + * This function converts the given string from the given source encoding + * to another given target encoding and returns the result as a std::string. + * + * \param[in] from_encoding Convert from this encoding. + * \param[in] to_encoding Convert into this encoding. + * \param[in] string The string to convert. + * + * \returns The converted string. + * + * \remark See the output of the `iconv --list` command for a list of + * supported encodings. 
+ */ +std::string Pathie::convert_encodings(const char* from_encoding, const char* to_encoding, const std::string& string) +{ + size_t input_length = string.length(); + + // We need a C string working copy that isn’t const + char* copy = (char*) malloc(input_length + 1); // Terminating NUL + strcpy(copy, string.c_str()); + + // Set up the encoding converter + iconv_t converter = iconv_open(to_encoding, from_encoding); + size_t outbytes_left = 0; + size_t inbytes_left = input_length; + + if (converter == (iconv_t) -1) + throw Pathie::ErrnoError(errno); + + /* There is no way to know how much space iconv() will need. So we keep + * allocating more and more memory as needed. `current_size' keeps track + * of how large our memory blob is currently. `outbuf' is the pointer to + * that memory blob. */ + size_t current_size = input_length + 1; // NUL + char* outbuf = NULL; + char* inbuf = copy; // Copy the pointer + + int errsav = 0; + outbytes_left = current_size; + while(true) { + outbuf = (char*) realloc(outbuf - (current_size - outbytes_left), current_size + 10); + current_size += 10; + outbytes_left += 10; + + errno = 0; + errsav = 0; + +#ifdef BSD + // What the heck. FreeBSD violates POSIX.1-2008: it declares iconv() + // differently than mandated by POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html + // (it declares a `const' where it must not be). + iconv(converter, const_cast<const char**>(&inbuf), &inbytes_left, &outbuf, &outbytes_left); // sets outbytes_left to 0 or very low values if not enough space (E2BIG) +#else + iconv(converter, &inbuf, &inbytes_left, &outbuf, &outbytes_left); // sets outbytes_left to 0 or very low values if not enough space (E2BIG) +#endif + errsav = errno; + + if (errsav != E2BIG) { + break; + } + } + + iconv_close(converter); + free(copy); + + size_t count = current_size - outbytes_left; + outbuf -= count; // iconv() advances the pointer! 
+ + if (errsav != 0) { + free(outbuf); + throw(Pathie::ErrnoError(errsav)); + } + + std::string result(outbuf, count); + free(outbuf); + + return result; +} + +/** + * Converts the given UTF-8 string into the native filename encoding. + */ +std::string Pathie::utf8_to_filename(const std::string& utf8) +{ + bool fs_encoding_is_utf8 = false; + +#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX) + fs_encoding_is_utf8 = true; +#else + char* fsencoding = NULL; + fsencoding = nl_langinfo(CODESET); + fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0); +#endif + + // Skip the expensive convert_encodings() call if the filesystem + // encoding already is UTF-8. + if (fs_encoding_is_utf8) { + return std::string(utf8); + } + + return convert_encodings("UTF-8", fsencoding, utf8); +} + +/** + * Converts the given string in native filesystem encoding to + * UTF-8. + */ +std::string Pathie::filename_to_utf8(const std::string& native_filename) +{ + bool fs_encoding_is_utf8 = false; + +#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX) + fs_encoding_is_utf8 = true; +#else + char* fsencoding = NULL; + fsencoding = nl_langinfo(CODESET); + fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0); +#endif + + // Skip the expensive convert_encodings() call if the filesystem + // encoding already is UTF-8. + if (fs_encoding_is_utf8) { + return std::string(native_filename); + } + + return convert_encodings(fsencoding, "UTF-8", native_filename); +} +#endif diff --git a/src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp b/src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp new file mode 100644 index 00000000..06b80731 --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/pathie_ifstream.cpp @@ -0,0 +1,320 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. 
+ * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "../include/pathie_ifstream.hpp" + +#include <cstdlib> + +namespace Pathie { +#if defined(_PATHIE_UNIX) + // All well and easy under UNIX. Just delegate to standard constructor. 
+ Pathie::ifstream::ifstream(Pathie::Path path, std::ios_base::openmode mode) + : std::ifstream(path.native().c_str(), mode) + { + // + } + + Pathie::ifstream::ifstream() + : std::ifstream() + { + // + } + + Pathie::ifstream::ifstream(std::string path, std::ios_base::openmode mode) + : std::ifstream(utf8_to_filename(path).c_str(), mode) + { + // + } + + Pathie::ifstream::ifstream(char* path, std::ios_base::openmode mode) + : std::ifstream(utf8_to_filename(path).c_str(), mode) + { + // + } + + void Pathie::ifstream::open(const char* filename, ios_base::openmode mode) + { + std::string filename_nstr = utf8_to_filename(filename); + std::ifstream::open(filename_nstr.c_str(), mode); + } + + void Pathie::ifstream::open(const std::string& filename, ios_base::openmode mode) + { + std::ifstream::open(utf8_to_filename(filename).c_str(), mode); + } + + void Pathie::ifstream::open(const Pathie::Path& filename, ios_base::openmode mode) + { + std::ifstream::open(filename.native().c_str(), mode); + } + + +#elif defined (_WIN32) +# if defined(_MSC_VER) + // Easy again under MSVC under Windows; using Microsoft’s nonstandard constructor + // for Unicode filenames. + // It is documented here: http://msdn.microsoft.com/en-us/library/8et8s826.aspx + Pathie::ifstream::ifstream(Pathie::Path path, std::ios_base::openmode mode) + : std::ifstream(path.native(), mode) + { + // + } + + Pathie::ifstream::ifstream() + : std::ifstream() + { + // + } + + Pathie::ifstream::ifstream(std::string path, std::ios_base::openmode mode) + : std::ifstream(path, mode) + { + // + } + + Pathie::ifstream::ifstream(char* path, std::ios_base::openmode mode) + : std::ifstream(path, mode) + { + // + } +# elif defined(__GNUC__) + // This one is tough, but solveable. There’s a nonstandard C++ extension by the + // GCC team to create a C++ stream from a file descriptor and similar. 
+ // It is documented here: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a00054.html + + /** + * Default constructor for deferred initialisation via open(). + * Beware that before you called open(), any methods other than + * is_open() may behave unexpectedly! + */ + Pathie::ifstream::ifstream() + : std::basic_istream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. + mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + } + + /** + * Construct a stream for the given UTF-8 file path. + * + * \param[in] filename The path to open the stream for. UTF-8. + * \param mode Mode to open the file in. + */ + Pathie::ifstream::ifstream(const char* filename, ios_base::openmode mode) + : std::basic_istream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. + mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + this->open(filename, mode); + } + + /** + * Construct a stream for the given UTF-8 file path. + * + * \param[in] filename The path to open the stream for. UTF-8. + * \param mode Mode to open the file in. + */ + Pathie::ifstream::ifstream(const std::string& filename, ios_base::openmode mode) + : std::basic_istream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. 
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + this->open(filename, mode); + } + + /** + * Construct a stream for the given Pathie::Path instance. + * + * \param[in] filename The path to open the stream for. A Pathie::Path instance. + * \param mode Mode to open the file in. + */ + Pathie::ifstream::ifstream(const Pathie::Path& filename, ios_base::openmode mode) + : std::basic_istream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. + mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + this->open(filename, mode); + } + + Pathie::ifstream::~ifstream() + { + free(mp_filebuffer); + } + + /** + * The underlying buffer. + */ + __gnu_cxx::stdio_filebuf<char>* Pathie::ifstream::rdbuf() const + { + return mp_filebuffer; + } + + /** + * Checks whether the stream has been open()ed already. This is the only + * method safe to use before you called open() on a stream constructed + * with the default constructor (apart from open() itself of course). + */ + bool Pathie::ifstream::is_open() const + { + if (!m_buffer_allocated) + return false; + + return mp_filebuffer->is_open(); + } + + /** + * Open the given UTF-8 file path in this stream. You can call this anytime + * after you constructed an instance with the default constructor; otherwise, + * you have to close() whatever was opened before you call this method. + * + * \param[in] filename UTF-8 filename to open + * \param mode Mode to open the stream in. 
+ */ + void Pathie::ifstream::open(const char* filename, ios_base::openmode mode) + { + std::wstring w_filename = Pathie::utf8_to_utf16(filename); + + mp_file = _wfopen(w_filename.c_str(), L"r"); // Mode will be overridden + if (!mp_file) { + setstate(ios_base::failbit); + return; + } + + /* The following construction uses a “placement new” as it appears + * to be the only "clean" solution applicable. The init() method, + * an internum of the GCC implementation of basic_istream that + * needs to be called in the stream’s constructor, requires a + * pointer to the filebuffer object. However, we do not have that + * filebuffer object at hand in the constructor, the + * __gnu_cxx::stdio_filebuf instance will be created later when + * open() is called. It is impossible to construct it earlier, + * because it does not support a delayed open() call, the file + * descriptor or FILE* pointer must be passed during its + * construction, but we don’t have it there; it is available only + * in open() -- remember that you can create the ifstream instance + * without being attached to a file and then call open() later + * with a filename. To be able to pass something meaningful to + * init(), we have to "foresee" where in memory the stdio_filebuf + * instance will be created. This only is possible with a + * placement new into a place we have allocated previously using + * malloc(). + * + * An alternative would be to use internal GCC APIs by duplicating + * the sourcecode of the __gnu_cxx::stdio_filebuf constructor; however + * undocumented internal APIs are never good to use. For informational + * purposes therefore the sourcecode link: + * + * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a01222_source.html + */ + + new (mp_filebuffer) __gnu_cxx::stdio_filebuf<char>(mp_file, mode); + m_buffer_allocated = true; + + if (!mp_filebuffer->is_open()) + setstate(ios_base::failbit); + else + clear(); + } + + /** + * Open the given UTF-8 file path in this stream. 
You can call this anytime + * after you constructed an instance with the default constructor; otherwise, + * you have to close() whatever was opened before you call this method. + * + * \param[in] filename UTF-8 filename to open + * \param mode Mode to open the stream in. + */ + void Pathie::ifstream::open(const std::string& filename, ios_base::openmode mode) + { + open(filename.c_str(), mode); + } + + /** + * Open the given Pathie::Path in this stream. You can call this anytime + * after you constructed an instance with the default constructor; otherwise, + * you have to close() whatever was opened before you call this method. + * + * \param[in] filename Pathie::Path to open the stream for. + * \param mode Mode to open the stream in. + */ + void Pathie::ifstream::open(const Pathie::Path& filename, ios_base::openmode mode) + { + open(filename.str(), mode); + } + + /** + * Close the underlying file. Has no effect if no file is opened. + */ + void Pathie::ifstream::close() + { + if (mp_file) { + if (!mp_filebuffer->close()) + setstate(ios_base::failbit); + + // Do not deallocate, we may need it later if an open() call follows. + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + m_buffer_allocated = false; + fclose(mp_file); + } + } +# else +# error Unsupported compiler: do not know how to open C++ stream on Unicode file. +# endif +#else +# error Unsupported system. +#endif +}; diff --git a/src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp b/src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp new file mode 100644 index 00000000..f1085043 --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/pathie_ofstream.cpp @@ -0,0 +1,326 @@ +/* -*- coding: utf-8 -*- + * This file is part of Pathie. + * + * Copyright © 2015, 2017 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "../include/pathie_ofstream.hpp" + +#if defined(_WIN32) && defined(__GNUC__) +#include <cstdio> +#include <cstdlib> +#endif + +namespace Pathie { +#if defined(_PATHIE_UNIX) + // All well and easy under UNIX. Just delegate to standard constructor. 
+ Pathie::ofstream::ofstream(Pathie::Path path, std::ios_base::openmode mode) + : std::ofstream(path.native().c_str(), mode) + { + // + } + + Pathie::ofstream::ofstream() + : std::ofstream() + { + // + } + + Pathie::ofstream::ofstream(std::string path, std::ios_base::openmode mode) + : std::ofstream(utf8_to_filename(path).c_str(), mode) + { + // + } + + Pathie::ofstream::ofstream(char* path, std::ios_base::openmode mode) + : std::ofstream(utf8_to_filename(path).c_str(), mode) + { + // + } + + void Pathie::ofstream::open(const char* filename, ios_base::openmode mode) + { + std::string filename_nstr = utf8_to_filename(filename); + std::ofstream::open(filename_nstr.c_str(), mode); + } + + void Pathie::ofstream::open(const std::string& filename, ios_base::openmode mode) + { + std::ofstream::open(utf8_to_filename(filename).c_str(), mode); + } + + void Pathie::ofstream::open(const Pathie::Path& filename, ios_base::openmode mode) + { + std::ofstream::open(filename.native().c_str(), mode); + } + + +#elif defined (_WIN32) +# if defined(_MSC_VER) + // Easy again under MSVC under Windows; using Microsoft’s nonstandard constructor + // for Unicode filenames. + // It is documented here: http://msdn.microsoft.com/en-us/library/8et8s826.aspx + Pathie::ofstream::ofstream(Pathie::Path path, std::ios_base::openmode mode) + : std::ofstream(path.native(), mode) + { + // + } + + Pathie::ofstream::ofstream() + : std::ofstream() + { + // + } + + Pathie::ofstream::ofstream(std::string path, std::ios_base::openmode mode) + : std::ofstream(path, mode) + { + // + } + + Pathie::ofstream::ofstream(char* path, std::ios_base::openmode mode) + : std::ofstream(path, mode) + { + // + } +# elif defined(__GNUC__) + // This one is tough, but solveable. There’s a nonstandard C++ extension by the + // GCC team to create a C++ stream from a file descriptor and similar. 
+ // It is documented here: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a00054.html + + /** + * Default constructor for deferred initialisation via open(). + * Beware that before you called open(), any methods other than + * is_open() may behave unexpectedly! + */ + Pathie::ofstream::ofstream() + : std::basic_ostream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. + mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + } + + /** + * Construct a stream for the given UTF-8 file path. + * + * \param[in] filename The path to open the stream for. UTF-8. + * \param mode Mode to open the file in. + */ + Pathie::ofstream::ofstream(const char* filename, ios_base::openmode mode) + : std::basic_ostream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. + mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + this->open(filename, mode); + } + + /** + * Construct a stream for the given UTF-8 file path. + * + * \param[in] filename The path to open the stream for. UTF-8. + * \param mode Mode to open the file in. + */ + Pathie::ofstream::ofstream(const std::string& filename, ios_base::openmode mode) + : std::basic_ostream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. 
+ mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + this->open(filename, mode); + } + + /** + * Construct a stream for the given Pathie::Path instance. + * + * \param[in] filename The path to open the stream for. A Pathie::Path instance. + * \param mode Mode to open the file in. + */ + Pathie::ofstream::ofstream(const Pathie::Path& filename, ios_base::openmode mode) + : std::basic_ostream<char, std::char_traits<char> >() + { + mp_file = NULL; + mp_filebuffer = NULL; + m_buffer_allocated = false; + + // See the lengthy explanation in open() for why we do this here. + mp_filebuffer = (__gnu_cxx::stdio_filebuf<char>*) malloc(sizeof(__gnu_cxx::stdio_filebuf<char>)); + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + + this->init(mp_filebuffer); + this->open(filename, mode); + } + + Pathie::ofstream::~ofstream() + { + free(mp_filebuffer); + } + + /** + * The underlying buffer. + */ + __gnu_cxx::stdio_filebuf<char>* Pathie::ofstream::rdbuf() const + { + return mp_filebuffer; + } + + /** + * Checks whether the stream has been open()ed already. This is the only + * method safe to use before you called open() on a stream constructed + * with the default constructor (apart from open() itself of course). + */ + bool Pathie::ofstream::is_open() const + { + if (!m_buffer_allocated) + return false; + + return mp_filebuffer->is_open(); + } + + /** + * Open the given UTF-8 file path in this stream. You can call this anytime + * after you constructed an instance with the default constructor; otherwise, + * you have to close() whatever was opened before you call this method. + * + * \param[in] filename UTF-8 filename to open + * \param mode Mode to open the stream in. 
+ */ + void Pathie::ofstream::open(const char* filename, ios_base::openmode mode) + { + std::wstring w_filename = Pathie::utf8_to_utf16(filename); + + mp_file = _wfopen(w_filename.c_str(), + (mode & ios_base::trunc) ? L"w" : L"a"); + + if (!mp_file) { + setstate(ios_base::failbit); + return; + } + + /* The following construction uses a “placement new” as it appears + * to be the only "clean" solution applicable. The init() method, + * an internum of the GCC implementation of basic_ostream that + * needs to be called in the stream’s constructor, requires a + * pointer to the filebuffer object. However, we do not have that + * filebuffer object at hand in the constructor, the + * __gnu_cxx::stdio_filebuf instance will be created later when + * open() is called. It is impossible to construct it earlier, + * because it does not support a delayed open() call, the file + * descriptor or FILE* pointer must be passed during its + * construction, but we don’t have it there; it is available only + * in open() -- remember that you can create the ofstream instance + * without being attached to a file and then call open() later + * with a filename. To be able to pass something meaningful to + * init(), we have to "foresee" where in memory the stdio_filebuf + * instance will be created. This only is possible with a + * placement new into a place we have allocated previously using + * malloc(). + * + * An alternative would be to use internal GCC APIs by duplicating + * the sourcecode of the __gnu_cxx::stdio_filebuf constructor; however + * undocumented internal APIs are never good to use. For informational + * purposes therefore the sourcecode link: + * + * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/libstdc++/api/a01222_source.html + */ + + new (mp_filebuffer) __gnu_cxx::stdio_filebuf<char>(mp_file, mode); + m_buffer_allocated = true; + + if (!mp_filebuffer->is_open()) + setstate(ios_base::failbit); + else + clear(); + } + + /** + * Open the given UTF-8 file path in this stream. 
You can call this anytime + * after you constructed an instance with the default constructor; otherwise, + * you have to close() whatever was opened before you call this method. + * + * \param[in] filename UTF-8 filename to open + * \param mode Mode to open the stream in. + */ + void Pathie::ofstream::open(const std::string& filename, ios_base::openmode mode) + { + open(filename.c_str(), mode); + } + + /** + * Open the given Pathie::Path in this stream. You can call this anytime + * after you constructed an instance with the default constructor; otherwise, + * you have to close() whatever was opened before you call this method. + * + * \param[in] filename Pathie::Path to open the stream for. + * \param mode Mode to open the stream in. + */ + void Pathie::ofstream::open(const Pathie::Path& filename, ios_base::openmode mode) + { + open(filename.str(), mode); + } + + /** + * Close the underlying file. Has no effect if no file is opened. + */ + void Pathie::ofstream::close() + { + if (mp_file) { + if (!mp_filebuffer->close()) + setstate(ios_base::failbit); + + // Do not deallocate, we may need it later if an open() call follows. + memset(mp_filebuffer, '\0', sizeof(__gnu_cxx::stdio_filebuf<char>)); + m_buffer_allocated = false; + fclose(mp_file); + } + } + +# else +# error Unsupported compiler: do not know how to open C++ stream on Unicode file. +# endif +#else +# error Unsupported system. 
+#endif +}; diff --git a/src/3rd_party/pathie-cpp/src/temp.cpp b/src/3rd_party/pathie-cpp/src/temp.cpp new file mode 100644 index 00000000..ae51bf87 --- /dev/null +++ b/src/3rd_party/pathie-cpp/src/temp.cpp @@ -0,0 +1,197 @@ +#include "../include/temp.hpp" +#include <sstream> +#include <cstdlib> +#include <cstring> +#include <ctime> + +#if defined(_PATHIE_UNIX) +#include <sys/types.h> +#include <unistd.h> +#elif defined(_WIN32) +#include <Windows.h> +#else +#error Unsupported system +#endif + +using namespace Pathie; + +static std::string generate_random_filename(const std::string& namepart) +{ + std::stringstream name; + name << namepart << "-" << time(NULL) << rand(); + +#if defined(_PATHIE_UNIX) + name << getpid(); +#elif defined(_WIN32) + name << GetCurrentProcessId(); +#else +#error Unsupported system +#endif + + return name.str(); +} + +/** + * TempEntry is not meant to be instanciated on itself. This + * constructor does the common work between the Tempdir and Tempfile + * classes, namely it generates a temporary filename that is not + * currently in use. + * + * \param namepart + * A string that will be included verbatim into the basename + * of the created directory. + * + * \remark The generated path name is of form + * `<namepart>-<currenttime><random><pid>`. However, future releases + * may change this format, so do not rely on it. + */ +TempEntry::TempEntry(std::string namepart) + : m_keep(false) +{ + do { + m_path = Path::temp_dir() / generate_random_filename(namepart); + } while (m_path.exists()); +} + +/** + * Destructor. + */ +TempEntry::~TempEntry() +{ + // +} + +/** + * Returns the absolute path to the temporary entry + * that was created by the constructor. + */ +Path TempEntry::path() const +{ + return m_path; +} + +/** + * Call this function if you do not want the destructor to delete + * the created temporary entry. You can still expressly delete + * the temporary entry by calling remove(). 
+ * + * \param k + * If true (default), the destructor will not delete the temporary entry. + * If false, the destructor will delete the temporary entry. + */ +void TempEntry::keep(bool k) +{ + m_keep = k; +} + +/** + * Returns the keep status; see keep(). + */ +bool TempEntry::is_kept() const +{ + return m_keep; +} + +/** + * Constructs an instance of this class. A temporary directory + * is created that will be recursively removed when the object + * is deleted. + * + * \param namepart + * A string that will be included verbatim into the basename + * of the created directory. + * + * \returns The newly created instance. + * + * \remark There is a small timespan between the generation of the + * temporary path name and the creation of the directory in which it + * is theoretically possible for another process to create an entry + * that conflicts with the generated name. However, since the + * generated name includes a random number, the process identifier, + * and the number of seconds since epoch as well as the given + * `namepart`, the chance of an accidental collision is very low. + * Even a malicious attacker would have to guess the random number, so + * if your `srand()` seed is chosen properly and your C standard + * library is properly impelemented, this risk is again very low. + */ +Tempdir::Tempdir(std::string namepart) + : TempEntry(namepart) +{ + m_path.mktree(); +} + +/** + * Destructor, removes the temporary entry unless keep() has been called. + * Does nothing if the temporary file does not exist anymore for whatever + * reason. + */ +Tempdir::~Tempdir() +{ + if (!m_keep) + remove(); +} + +/** + * Recursively removes the temporary directory. This method + * ignores what was set with keep(), i.e., it *always* deletes + * the temporary directory if you call it. This method does + * nothing if the directory does not exist anymore for whatever + * reason. 
+ */ +void Tempdir::remove() const +{ + if (m_path.exists()) + m_path.rmtree(); +} + +/** + * Constructs an instance of this class. A temporary file + * is created that will be recursively removed when the object + * is deleted. + * + * \param namepart + * A string that will be included verbatim into the basename + * of the created filename. + * + * \returns The newly created instance. + * + * \remark There is a small timespan between the generation of the + * temporary path name and the creation of the file in which it + * is theoretically possible for another process to create an entry + * that conflicts with the generated name. However, since the + * generated name includes a random number, the process identifier, + * and the number of seconds since epoch as well as the given + * `namepart`, the chance of an accidental collision is very low. + * Even a malicious attacker would have to guess the random number, so + * if your `srand()` seed is chosen properly and your C standard + * library is properly impelemented, this risk is again very low. + */ +Tempfile::Tempfile(std::string namepart) + : TempEntry(namepart) +{ + m_path.touch(); +} + +/** + * Destructor, removes the temporary file unless keep() has been called. + * Does nothing if the temporary directory does not exist anymore for whatever + * reason. + */ +Tempfile::~Tempfile() +{ + if (!m_keep) + remove(); +} + +/** + * Removes the temporary file. This method + * ignores what was set with keep(), i.e., it *always* deletes + * the temporary file if you call it. This method does nothing + * if the file does not exist anymore for whatever + * reason. 
+ */ +void Tempfile::remove() const +{ + if (m_path.exists()) + m_path.unlink(); +} diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece new file mode 160000 +Subproject 1a38d26a13cc67b1aae641d4983b624bef6d530 diff --git a/src/3rd_party/zstr/LICENSE b/src/3rd_party/zstr/LICENSE new file mode 100644 index 00000000..841c7214 --- /dev/null +++ b/src/3rd_party/zstr/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/src/3rd_party/zstr/README.org b/src/3rd_party/zstr/README.org new file mode 100644 index 00000000..bc0dd3e5 --- /dev/null +++ b/src/3rd_party/zstr/README.org @@ -0,0 +1,54 @@ +# -*- mode:org; mode:visual-line; coding:utf-8; -*- + +** A C++ ZLib wrapper + +[[http://travis-ci.org/mateidavid/zstr][http://travis-ci.org/mateidavid/zstr.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]] + +This C++ header-only library enables the use of C++ standard iostreams to access ZLib-compressed streams. + +For input access (decompression), the compression format is auto-detected, and multiple concatenated compressed streams are decompressed seamlessly. + +For output access (compression), the only parameter exposed by this API is the compression level. + +Alternatives to this library include: + +- The original [[http://www.zlib.net/][ZLib]], through its [[http://www.zlib.net/manual.html][C API]]. This does not interact nicely with C++ iostreams. + +- The [[http://www.cs.unc.edu/Research/compgeom/gzstream/][GZStream]] library. This library does not auto-detect input compression, and it cannot wrap streams (only files). + +- The [[http://www.boost.org/doc/libs/release/libs/iostreams/][Boost IOStreams]] library. The library does not auto-detect input compression (by default, though that can be easily implemented with filters), and more importantly, it is not a header-only Boost library. + +For an example usage, see [[examples/ztxtpipe.cpp]] and [[examples/zc.cpp]]. + +**** Input Auto-detection + +For input access, the library seamlessly auto-detects whether the source stream is compressed or not. The following compressed streams are detected: + +- GZip header, when stream starts with =1F 8B=. See [[http://en.wikipedia.org/wiki/Gzip][GZip format]]. + +- ZLib header, when stream starts with =78 01=, =78 9C=, and =78 DA=. See [[http://stackoverflow.com/a/17176881][answer here]]. 
+ +If none of these formats are detected, the library assumes the input is not compressed, and it produces a plain copy of the source stream. + +**** Classes + +The package provides 6 classes for accessing ZLib streams: + +- =zstr::istreambuf= is the core decompression class. This is constructed from an existing =std::streambuf= that contains source data. The =zstr::istreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the auto-detection option (default: on). ZLib errors cause exceptions to be thrown. + +- =zstr::ostreambuf= is the core compression class. This is constructed from an existing =std::streambuf= that contains sink data. The =zstr::ostreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the compression option (default: ZLib default). ZLib errors cause exceptions to be thrown. + +- =zstr::istream= is a wrapper for a =zstr::istreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::istream= (such as =std::cin=) or =std::streambuf=. + +- =zstr::ostream= is a wrapper for a =zstr::ostreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::ostream= (such as =std::cout=) or =std::streambuf=. + +- =zstr::ifstream= is a wrapper for a =zstr::istreambuf= that accesses an /internal/ =std::ifstream=. This can be used to open a file and read decompressed data from it. + +- =zstr::ofstream= is a wrapper for a =zstr::ostreambuf= that accesses an /internal/ =std::ofstream=. This can be used to open a file and write compressed data to it. + +For all stream objects, the =badbit= of their exception mask is turned on in order to propagate exceptions. + +**** License + +Released under the [[file:LICENSE][MIT license]].
+ diff --git a/src/3rd_party/zstr/strict_fstream.hpp b/src/3rd_party/zstr/strict_fstream.hpp new file mode 100644 index 00000000..21173c73 --- /dev/null +++ b/src/3rd_party/zstr/strict_fstream.hpp @@ -0,0 +1,202 @@ +#ifndef __STRICT_FSTREAM_HPP +#define __STRICT_FSTREAM_HPP + +#include <cassert> +#include <fstream> +#include <cstring> +#include <string> + +/** + * This namespace defines wrappers for std::ifstream, std::ofstream, and + * std::fstream objects. The wrappers perform the following steps: + * - check the open modes make sense + * - check that the call to open() is successful + * - (for input streams) check that the opened file is peek-able + * - turn on the badbit in the exception mask + */ +namespace strict_fstream +{ + +/// Overload of error-reporting function, to enable use with VS. +/// Ref: http://stackoverflow.com/a/901316/717706 +static std::string strerror() +{ + std::string buff(80, '\0'); +#ifdef _WIN32 + if (strerror_s(&buff[0], buff.size(), errno) != 0) + { + buff = "Unknown error"; + } +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE +// XSI-compliant strerror_r() + if (strerror_r(errno, &buff[0], buff.size()) != 0) + { + buff = "Unknown error"; + } +#else +// GNU-specific strerror_r() + auto p = strerror_r(errno, &buff[0], buff.size()); + std::string tmp(p, std::strlen(p)); + std::swap(buff, tmp); +#endif + buff.resize(buff.find('\0')); + return buff; +} + +/// Exception class thrown by failed operations. 
+class Exception + : public std::exception +{ +public: + Exception(const std::string& msg) : _msg(msg) {} + const char * what() const noexcept { return _msg.c_str(); } +private: + std::string _msg; +}; // class Exception + +namespace detail +{ + +struct static_method_holder +{ + static std::string mode_to_string(std::ios_base::openmode mode) + { + static const int n_modes = 6; + static const std::ios_base::openmode mode_val_v[n_modes] = + { + std::ios_base::in, + std::ios_base::out, + std::ios_base::app, + std::ios_base::ate, + std::ios_base::trunc, + std::ios_base::binary + }; + + static const char * mode_name_v[n_modes] = + { + "in", + "out", + "app", + "ate", + "trunc", + "binary" + }; + std::string res; + for (int i = 0; i < n_modes; ++i) + { + if (mode & mode_val_v[i]) + { + res += (! res.empty()? "|" : ""); + res += mode_name_v[i]; + } + } + if (res.empty()) res = "none"; + return res; + } + static void check_mode(const std::string& filename, std::ios_base::openmode mode) + { + if ((mode & std::ios_base::trunc) && ! (mode & std::ios_base::out)) + { + throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and not out"); + } + else if ((mode & std::ios_base::app) && ! 
(mode & std::ios_base::out)) + { + throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: app and not out"); + } + else if ((mode & std::ios_base::trunc) && (mode & std::ios_base::app)) + { + throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and app"); + } + } + static void check_open(std::ios * s_p, const std::string& filename, std::ios_base::openmode mode) + { + if (s_p->fail()) + { + throw Exception(std::string("strict_fstream: open('") + + filename + "'," + mode_to_string(mode) + "): open failed: " + + strerror()); + } + } + static void check_peek(std::istream * is_p, const std::string& filename, std::ios_base::openmode mode) + { + bool peek_failed = true; + try + { + is_p->peek(); + peek_failed = is_p->fail(); + } + catch (std::ios_base::failure e) {} + if (peek_failed) + { + throw Exception(std::string("strict_fstream: open('") + + filename + "'," + mode_to_string(mode) + "): peek failed: " + + strerror()); + } + is_p->clear(); + } +}; // struct static_method_holder + +} // namespace detail + +class ifstream + : public std::ifstream +{ +public: + ifstream() = default; + ifstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + open(filename, mode); + } + void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + mode |= std::ios_base::in; + exceptions(std::ios_base::badbit); + detail::static_method_holder::check_mode(filename, mode); + std::ifstream::open(filename, mode); + detail::static_method_holder::check_open(this, filename, mode); + detail::static_method_holder::check_peek(this, filename, mode); + } +}; // class ifstream + +class ofstream + : public std::ofstream +{ +public: + ofstream() = default; + ofstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out) + { + open(filename, mode); + } + void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out) + { + 
mode |= std::ios_base::out; + exceptions(std::ios_base::badbit); + detail::static_method_holder::check_mode(filename, mode); + std::ofstream::open(filename, mode); + detail::static_method_holder::check_open(this, filename, mode); + } +}; // class ofstream + +class fstream + : public std::fstream +{ +public: + fstream() = default; + fstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + open(filename, mode); + } + void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + if (! (mode & std::ios_base::out)) mode |= std::ios_base::in; + exceptions(std::ios_base::badbit); + detail::static_method_holder::check_mode(filename, mode); + std::fstream::open(filename, mode); + detail::static_method_holder::check_open(this, filename, mode); + detail::static_method_holder::check_peek(this, filename, mode); + } +}; // class fstream + +} // namespace strict_fstream + +#endif diff --git a/src/3rd_party/zstr/zstr.hpp b/src/3rd_party/zstr/zstr.hpp new file mode 100644 index 00000000..6b633728 --- /dev/null +++ b/src/3rd_party/zstr/zstr.hpp @@ -0,0 +1,411 @@ +//--------------------------------------------------------- +// Copyright 2015 Ontario Institute for Cancer Research +// Written by Matei David (matei@cs.toronto.edu) +//--------------------------------------------------------- + +// Reference: +// http://stackoverflow.com/questions/14086417/how-to-write-custom-input-stream-in-c + +#ifndef __ZSTR_HPP +#define __ZSTR_HPP + +#include <cassert> +#include <fstream> +#include <sstream> +#include <zlib.h> +#include "strict_fstream.hpp" + +namespace zstr +{ + +/// Exception class thrown by failed zlib operations. 
+class Exception + : public std::exception +{ +public: + Exception(z_stream * zstrm_p, int ret) + : _msg("zlib: ") + { + switch (ret) + { + case Z_STREAM_ERROR: + _msg += "Z_STREAM_ERROR: "; + break; + case Z_DATA_ERROR: + _msg += "Z_DATA_ERROR: "; + break; + case Z_MEM_ERROR: + _msg += "Z_MEM_ERROR: "; + break; + case Z_VERSION_ERROR: + _msg += "Z_VERSION_ERROR: "; + break; + case Z_BUF_ERROR: + _msg += "Z_BUF_ERROR: "; + break; + default: + std::ostringstream oss; + oss << ret; + _msg += "[" + oss.str() + "]: "; + break; + } + _msg += zstrm_p->msg; + } + Exception(const std::string msg) : _msg(msg) {} + const char * what() const noexcept { return _msg.c_str(); } +private: + std::string _msg; +}; // class Exception + +namespace detail +{ + +class z_stream_wrapper + : public z_stream +{ +public: + z_stream_wrapper(bool _is_input = true, int _level = Z_DEFAULT_COMPRESSION) + : is_input(_is_input) + { + this->zalloc = Z_NULL; + this->zfree = Z_NULL; + this->opaque = Z_NULL; + int ret; + if (is_input) + { + this->avail_in = 0; + this->next_in = Z_NULL; + ret = inflateInit2(this, 15+32); + } + else + { + ret = deflateInit2(this, _level, Z_DEFLATED, 15+16, 8, Z_DEFAULT_STRATEGY); + } + if (ret != Z_OK) throw Exception(this, ret); + } + ~z_stream_wrapper() + { + if (is_input) + { + inflateEnd(this); + } + else + { + deflateEnd(this); + } + } +private: + bool is_input; +}; // class z_stream_wrapper + +} // namespace detail + +class istreambuf + : public std::streambuf +{ +public: + istreambuf(std::streambuf * _sbuf_p, + std::size_t _buff_size = default_buff_size, bool _auto_detect = true) + : sbuf_p(_sbuf_p), + zstrm_p(nullptr), + buff_size(_buff_size), + auto_detect(_auto_detect), + auto_detect_run(false), + is_text(false) + { + assert(sbuf_p); + in_buff = new char [buff_size]; + in_buff_start = in_buff; + in_buff_end = in_buff; + out_buff = new char [buff_size]; + setg(out_buff, out_buff, out_buff); + } + + istreambuf(const istreambuf &) = delete; + 
istreambuf(istreambuf &&) = default; + istreambuf & operator = (const istreambuf &) = delete; + istreambuf & operator = (istreambuf &&) = default; + + virtual ~istreambuf() + { + delete [] in_buff; + delete [] out_buff; + if (zstrm_p) delete zstrm_p; + } + + virtual std::streambuf::int_type underflow() + { + if (this->gptr() == this->egptr()) + { + // pointers for free region in output buffer + char * out_buff_free_start = out_buff; + do + { + // read more input if none available + if (in_buff_start == in_buff_end) + { + // empty input buffer: refill from the start + in_buff_start = in_buff; + std::streamsize sz = sbuf_p->sgetn(in_buff, buff_size); + in_buff_end = in_buff + sz; + if (in_buff_end == in_buff_start) break; // end of input + } + // auto detect if the stream contains text or deflate data + if (auto_detect && ! auto_detect_run) + { + auto_detect_run = true; + unsigned char b0 = *reinterpret_cast< unsigned char * >(in_buff_start); + unsigned char b1 = *reinterpret_cast< unsigned char * >(in_buff_start + 1); + // Ref: + // http://en.wikipedia.org/wiki/Gzip + // http://stackoverflow.com/questions/9050260/what-does-a-zlib-header-look-like + is_text = ! (in_buff_start + 2 <= in_buff_end + && ((b0 == 0x1F && b1 == 0x8B) // gzip header + || (b0 == 0x78 && (b1 == 0x01 // zlib header + || b1 == 0x9C + || b1 == 0xDA)))); + } + if (is_text) + { + // simply swap in_buff and out_buff, and adjust pointers + assert(in_buff_start == in_buff); + std::swap(in_buff, out_buff); + out_buff_free_start = in_buff_end; + in_buff_start = in_buff; + in_buff_end = in_buff; + } + else + { + // run inflate() on input + if (! 
zstrm_p) zstrm_p = new detail::z_stream_wrapper(true); + zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(in_buff_start); + zstrm_p->avail_in = in_buff_end - in_buff_start; + zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff_free_start); + zstrm_p->avail_out = (out_buff + buff_size) - out_buff_free_start; + int ret = inflate(zstrm_p, Z_NO_FLUSH); + // process return code + if (ret != Z_OK && ret != Z_STREAM_END) throw Exception(zstrm_p, ret); + // update in&out pointers following inflate() + in_buff_start = reinterpret_cast< decltype(in_buff_start) >(zstrm_p->next_in); + in_buff_end = in_buff_start + zstrm_p->avail_in; + out_buff_free_start = reinterpret_cast< decltype(out_buff_free_start) >(zstrm_p->next_out); + assert(out_buff_free_start + zstrm_p->avail_out == out_buff + buff_size); + // if stream ended, deallocate inflator + if (ret == Z_STREAM_END) + { + delete zstrm_p; + zstrm_p = nullptr; + } + } + } while (out_buff_free_start == out_buff); + // 2 exit conditions: + // - end of input: there might or might not be output available + // - out_buff_free_start != out_buff: output available + this->setg(out_buff, out_buff, out_buff_free_start); + } + return this->gptr() == this->egptr() + ? 
traits_type::eof() + : traits_type::to_int_type(*this->gptr()); + } +private: + std::streambuf * sbuf_p; + char * in_buff; + char * in_buff_start; + char * in_buff_end; + char * out_buff; + detail::z_stream_wrapper * zstrm_p; + std::size_t buff_size; + bool auto_detect; + bool auto_detect_run; + bool is_text; + + static const std::size_t default_buff_size = (std::size_t)1 << 20; +}; // class istreambuf + +class ostreambuf + : public std::streambuf +{ +public: + ostreambuf(std::streambuf * _sbuf_p, + std::size_t _buff_size = default_buff_size, int _level = Z_DEFAULT_COMPRESSION) + : sbuf_p(_sbuf_p), + zstrm_p(new detail::z_stream_wrapper(false, _level)), + buff_size(_buff_size) + { + assert(sbuf_p); + in_buff = new char [buff_size]; + out_buff = new char [buff_size]; + setp(in_buff, in_buff + buff_size); + } + + ostreambuf(const ostreambuf &) = delete; + ostreambuf(ostreambuf &&) = default; + ostreambuf & operator = (const ostreambuf &) = delete; + ostreambuf & operator = (ostreambuf &&) = default; + + int deflate_loop(int flush) + { + while (true) + { + zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff); + zstrm_p->avail_out = buff_size; + int ret = deflate(zstrm_p, flush); + if (ret != Z_OK && ret != Z_STREAM_END && ret != Z_BUF_ERROR) throw Exception(zstrm_p, ret); + std::streamsize sz = sbuf_p->sputn(out_buff, reinterpret_cast< decltype(out_buff) >(zstrm_p->next_out) - out_buff); + if (sz != reinterpret_cast< decltype(out_buff) >(zstrm_p->next_out) - out_buff) + { + // there was an error in the sink stream + return -1; + } + if (ret == Z_STREAM_END || ret == Z_BUF_ERROR || sz == 0) + { + break; + } + } + return 0; + } + + virtual ~ostreambuf() + { + // flush the zlib stream + // + // NOTE: Errors here (sync() return value not 0) are ignored, because we + // cannot throw in a destructor. This mirrors the behaviour of + // std::basic_filebuf::~basic_filebuf(). 
To see an exception on error, + // close the ofstream with an explicit call to close(), and do not rely + // on the implicit call in the destructor. + // + sync(); + delete [] in_buff; + delete [] out_buff; + delete zstrm_p; + } + virtual std::streambuf::int_type overflow(std::streambuf::int_type c = traits_type::eof()) + { + zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(pbase()); + zstrm_p->avail_in = pptr() - pbase(); + while (zstrm_p->avail_in > 0) + { + int r = deflate_loop(Z_NO_FLUSH); + if (r != 0) + { + setp(nullptr, nullptr); + return traits_type::eof(); + } + } + setp(in_buff, in_buff + buff_size); + return traits_type::eq_int_type(c, traits_type::eof()) ? traits_type::eof() : sputc(c); + } + virtual int sync() + { + // first, call overflow to clear in_buff + overflow(); + if (! pptr()) return -1; + // then, call deflate asking to finish the zlib stream + zstrm_p->next_in = nullptr; + zstrm_p->avail_in = 0; + if (deflate_loop(Z_FINISH) != 0) return -1; + deflateReset(zstrm_p); + return 0; + } +private: + std::streambuf * sbuf_p; + char * in_buff; + char * out_buff; + detail::z_stream_wrapper * zstrm_p; + std::size_t buff_size; + + static const std::size_t default_buff_size = (std::size_t)1 << 20; +}; // class ostreambuf + +class istream + : public std::istream +{ +public: + istream(std::istream & is) + : std::istream(new istreambuf(is.rdbuf())) + { + exceptions(std::ios_base::badbit); + } + explicit istream(std::streambuf * sbuf_p) + : std::istream(new istreambuf(sbuf_p)) + { + exceptions(std::ios_base::badbit); + } + virtual ~istream() + { + delete rdbuf(); + } +}; // class istream + +class ostream + : public std::ostream +{ +public: + ostream(std::ostream & os) + : std::ostream(new ostreambuf(os.rdbuf())) + { + exceptions(std::ios_base::badbit); + } + explicit ostream(std::streambuf * sbuf_p) + : std::ostream(new ostreambuf(sbuf_p)) + { + exceptions(std::ios_base::badbit); + } + virtual ~ostream() + { + delete rdbuf(); + } +}; // 
class ostream + +namespace detail +{ + +template < typename FStream_Type > +struct strict_fstream_holder +{ + strict_fstream_holder(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + : _fs(filename, mode) + {} + FStream_Type _fs; +}; // class strict_fstream_holder + +} // namespace detail + +class ifstream + : private detail::strict_fstream_holder< strict_fstream::ifstream >, + public std::istream +{ +public: + explicit ifstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + : detail::strict_fstream_holder< strict_fstream::ifstream >(filename, mode), + std::istream(new istreambuf(_fs.rdbuf())) + { + exceptions(std::ios_base::badbit); + } + virtual ~ifstream() + { + if (rdbuf()) delete rdbuf(); + } +}; // class ifstream + +class ofstream + : private detail::strict_fstream_holder< strict_fstream::ofstream >, + public std::ostream +{ +public: + explicit ofstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out) + : detail::strict_fstream_holder< strict_fstream::ofstream >(filename, mode | std::ios_base::binary), + std::ostream(new ostreambuf(_fs.rdbuf())) + { + exceptions(std::ios_base::badbit); + } + virtual ~ofstream() + { + if (rdbuf()) delete rdbuf(); + } +}; // class ofstream + +} // namespace zstr + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 913ab17d..09864161 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -83,7 +83,9 @@ add_library(marian STATIC $<TARGET_OBJECTS:libyaml-cpp> $<TARGET_OBJECTS:SQLiteCpp> + $<TARGET_OBJECTS:pathie-cpp> ) +target_compile_options(marian PUBLIC ${ALL_WARNINGS}) # Generate git_revision.h to reflect current git revision information # [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake] @@ -110,6 +112,8 @@ cuda_add_library(marian_cuda training/gradient_dropping/gpu/dropper.cu training/gradient_dropping/gpu/sparse_algorithm.cu STATIC) + + 
target_compile_options(marian_cuda PUBLIC ${ALL_WARNINGS}) endif(CUDA_FOUND) set_target_properties(marian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") @@ -117,18 +121,23 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY add_executable(marian_train command/marian_main.cpp) set_target_properties(marian_train PROPERTIES OUTPUT_NAME marian) +target_compile_options(marian_train PUBLIC ${ALL_WARNINGS}) add_executable(marian_decoder command/marian_decoder.cpp) set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder) +target_compile_options(marian_decoder PUBLIC ${ALL_WARNINGS}) add_executable(marian_scorer command/marian_scorer.cpp) set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer) +target_compile_options(marian_scorer PUBLIC ${ALL_WARNINGS}) add_executable(marian_vocab command/marian_vocab.cpp) set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab) +target_compile_options(marian_vocab PUBLIC ${ALL_WARNINGS}) add_executable(marian_conv command/marian_conv.cpp) set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv) +target_compile_options(marian_conv PUBLIC ${ALL_WARNINGS}) set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv) @@ -165,6 +174,7 @@ endif() if(COMPILE_SERVER) add_executable(marian_server command/marian_server.cpp) set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server) + target_compile_options(marian_server PUBLIC ${ALL_WARNINGS}) set(EXECUTABLES ${EXECUTABLES} marian_server) endif(COMPILE_SERVER) diff --git a/src/command/marian_vocab.cpp b/src/command/marian_vocab.cpp index d53dc5f0..de8ef3c7 100755 --- a/src/command/marian_vocab.cpp +++ b/src/command/marian_vocab.cpp @@ -25,9 +25,7 @@ int main(int argc, char** argv) { LOG(info, "Creating vocabulary..."); auto vocab = New<Vocab>(options, 0); - io::InputFileStream corpusStrm(std::cin); - io::OutputFileStream 
vocabStrm(std::cout); - vocab->create(corpusStrm, vocabStrm, options->get<size_t>("max-size")); + vocab->create("stdout", "stdin", options->get<size_t>("max-size")); LOG(info, "Finished"); diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp index 0e230c04..28826bb2 100755 --- a/src/common/cli_wrapper.cpp +++ b/src/common/cli_wrapper.cpp @@ -1,6 +1,8 @@ #include "common/cli_wrapper.h" +#include "common/cli_helper.h" #include "common/logging.h" #include "common/options.h" +#include "common/timer.h" #include "common/version.h" namespace marian { @@ -85,8 +87,7 @@ CLIWrapper::CLIWrapper(YAML::Node &config, app_->formatter(fmt); // add --version option - optVersion_ - = app_->add_flag("--version", "Print the version number and exit"); + optVersion_ = app_->add_flag("--version", "Print the version number and exit"); optVersion_->group(defaultGroup_); } @@ -96,20 +97,12 @@ CLIWrapper::CLIWrapper(Ptr<marian::Options> options, const std::string &footer, size_t columnWidth, size_t screenWidth) - : CLIWrapper(options->getYaml(), - description, - header, - footer, - columnWidth, - screenWidth) {} + : CLIWrapper(options->getYaml(), description, header, footer, columnWidth, screenWidth) {} CLIWrapper::~CLIWrapper() {} void CLIWrapper::switchGroup(const std::string &name) { - if(name.empty()) - currentGroup_ = defaultGroup_; - else - currentGroup_ = name; + currentGroup_ = name.empty() ? 
defaultGroup_ : name; } void CLIWrapper::parse(int argc, char **argv) { @@ -126,25 +119,77 @@ void CLIWrapper::parse(int argc, char **argv) { } } -std::string CLIWrapper::failureMessage(const CLI::App *app, - const CLI::Error &e) { +std::string CLIWrapper::failureMessage(const CLI::App *app, const CLI::Error &e) { std::string header = "Error: " + std::string(e.what()) + "\n"; if(app->get_help_ptr() != nullptr) - header += "Run with " + app->get_help_ptr()->get_name() - + " for more information.\n"; + header += "Run with " + app->get_help_ptr()->get_name() + " for more information.\n"; return header; } -void CLIWrapper::overwriteDefault(const YAML::Node &node) { - // iterate requested default values - for(auto it : node) { +bool CLIWrapper::updateConfig(const YAML::Node &config) { + bool success = true; + auto cmdOptions = getParsedOptionNames(); + for(auto it : config) { auto key = it.first.as<std::string>(); - ABORT_IF(!allVars_.count(key), "The following option was not expected: '{}'", key); - // if we have an option but it was not specified on command-line - if(allVars_.count(key) > 0 && opts_.at(key)->empty()) { + // skip options specified via command-line to allow overwriting them + if(cmdOptions.count(key)) + continue; + if(options_.count(key)) { config_[key] = YAML::Clone(it.second); + options_[key].modified = true; + } else { + success = false; } } + return success; +} + +std::string CLIWrapper::dumpConfig(bool skipDefault /*= false*/) const { + YAML::Emitter out; + out << YAML::Comment("Marian configuration file generated at " + timer::currentDate() + + " with version " + buildVersion()); + out << YAML::BeginMap; + std::string comment; + for(const auto &key : getOrderedOptionNames()) { + // do not proceed keys that are removed from config_ + if(!config_[key]) + continue; + if(skipDefault && !options_.at(key).modified) + continue; + auto group = options_.at(key).opt->get_group(); + if(comment != group) { + if(!comment.empty()) + out << YAML::Newline; + 
comment = group; + out << YAML::Comment(group); + } + out << YAML::Key; + out << key; + out << YAML::Value; + cli::OutputYaml(config_[key], out); + } + out << YAML::EndMap; + return out.c_str(); +} + +std::unordered_set<std::string> CLIWrapper::getParsedOptionNames() const { + std::unordered_set<std::string> keys; + for(const auto &it : options_) + if(!it.second.opt->empty()) + keys.emplace(it.first); + return keys; +} + +std::vector<std::string> CLIWrapper::getOrderedOptionNames() const { + std::vector<std::string> keys; + // extract all option names + for(auto const &it : options_) + keys.push_back(it.first); + // sort option names by creation index + sort(keys.begin(), keys.end(), [this](const std::string &a, const std::string &b) { + return options_.at(a).idx < options_.at(b).idx; + }); + return keys; } } // namespace cli diff --git a/src/common/cli_wrapper.h b/src/common/cli_wrapper.h index 67f2dff4..cf47a310 100755 --- a/src/common/cli_wrapper.h +++ b/src/common/cli_wrapper.h @@ -8,6 +8,7 @@ #include <iostream> #include <map> #include <string> +#include <unordered_set> namespace marian { @@ -46,6 +47,19 @@ private: size_t screenWidth_{0}; }; +// @TODO: in this file review the use of naked pointers. We use Ptr<Type> anywhere else, +// what's up with that? + +/** + * The helper structure storing an option object, the associated variable and creation index. + */ +struct CLIOptionTuple { + CLI::Option *opt; + Ptr<any_type> var; + size_t idx{0}; + bool modified{false}; +}; + /** * @brief The class used to define and parse command-line arguments. 
* @@ -63,10 +77,10 @@ private: */ class CLIWrapper { private: - // [option name] -> option value - std::map<std::string, Ptr<any_type>> allVars_; - // Map with option names and objects - std::map<std::string, CLI::Option *> opts_; + // Map with option names and option tuples + std::unordered_map<std::string, CLIOptionTuple> options_; + // Counter for created options + size_t counter_{0}; // Command-line argument parser Ptr<CLI::App> app_; @@ -75,23 +89,22 @@ private: // Name of the current option group std::string currentGroup_{""}; - // If this is a wrapper then this should just be a reference, - // then we do not have the added level of containment. + // Reference to the main config object YAML::Node &config_; // Option for --version flag. This is a special flag and similarly to --help, // the key "version" will be not added into the YAML config - CLI::Option* optVersion_; + CLI::Option *optVersion_; static std::string failureMessage(const CLI::App *app, const CLI::Error &e); - // Extract an option name from comma-separated list of command-line arguments, - // e.g. 'help' from '--help,-h' + // Extract option name from a comma-separated list of long and short options, e.g. 'help' from + // '--help,-h' std::string keyName(const std::string &args) const { // re-use existing functions from CLI11 to keep option names consistent - return std::get<1>(CLI::detail::get_names(CLI::detail::split_names( - args))) // get long names only - .front(); // get first long name + return std::get<1>( + CLI::detail::get_names(CLI::detail::split_names(args))) // get long names only + .front(); // get first long name } public: @@ -112,23 +125,14 @@ public: const std::string &description = "", const std::string &header = "General options", const std::string &footer = "", - size_t columnWidth = 35, + size_t columnWidth = 40, size_t screenWidth = 0); /** * @brief Create an instance of the command-line argument parser, * short-cuft for Options object. 
* - * Option --help, -h is automatically added. - * - * @param options A smart pointer to the Options object containing the - * to-be-wrapped yaml tree - * @param description Program description - * @param header Header text for the main option group - * @param footer Text displayed after the list of options - * @param columnWidth Width of the column with option names - * @param screenWidth Maximum allowed width for help messages, 0 means no - * limit + * @see Other constructor */ CLIWrapper(Ptr<Options> options, const std::string &description = "", @@ -201,8 +205,7 @@ public: * have a default value or be non-defaulted */ template <typename T> - CLI::Option *add_nondefault(const std::string &args, - const std::string &help) { + CLI::Option *add_nondefault(const std::string &args, const std::string &help) { return add_option<T>(keyName(args), args, help, @@ -212,8 +215,7 @@ public: } /** - * Switch to different option group or to the default group if - * argument is empty. + * Switch to different option group or to the default group if argument is empty. * * @param name Header of the option group */ @@ -222,23 +224,31 @@ public: // Parse command-line arguments. Handles --help and --version options void parse(int argc, char **argv); - /** + /* * @brief Overwrite values for unparsed options * - * Default values are overwritten with the options found in the config - * provided as the argument, while parsed command-line options remain - * unchanged + * Default values are overwritten with the options from the config provided, while parsed + * command-line options remain unchanged. + * This should be a preferred way of updating config options as the class keeps track of options, + * which values have changed. 
* * @param node YAML config with new default values for options */ - void overwriteDefault(const YAML::Node &node); + bool updateConfig(const YAML::Node &config); + + // Get textual YAML representation of the config + std::string dumpConfig(bool skipDefault = false) const; private: - template < - typename T, - // options with numeric and string-like values - CLI::enable_if_t<!CLI::is_bool<T>::value && !CLI::is_vector<T>::value, - CLI::detail::enabler> = CLI::detail::dummy> + // Get names of options passed via command-line + std::unordered_set<std::string> getParsedOptionNames() const; + // Get option names in the same order as they are created + std::vector<std::string> getOrderedOptionNames() const; + + template <typename T, + // options with numeric and string-like values + CLI::enable_if_t<!CLI::is_bool<T>::value && !CLI::is_vector<T>::value, + CLI::detail::enabler> = CLI::detail::dummy> CLI::Option *add_option(const std::string &key, const std::string &args, const std::string &help, @@ -248,13 +258,17 @@ private: // define YAML entry if requested if(addToConfig) config_[key] = val; - // create variable for the option - allVars_.insert(std::make_pair(key, std::make_shared<any_type>(val))); + + // create option tuple + CLIOptionTuple option; + option.idx = counter_++; + option.var = std::make_shared<any_type>(val); // callback function collecting a command-line argument CLI::callback_t fun = [this, key](CLI::results_t res) { + options_[key].modified = true; // get variable associated with the option - auto &var = allVars_[key]->as<T>(); + auto &var = options_[key].var->as<T>(); // store parser result in var auto ret = CLI::detail::lexical_cast(res[0], var); // update YAML entry @@ -275,15 +289,15 @@ private: opt->default_str(ss.str()); } - // store option object - opts_.insert(std::make_pair(key, opt)); - return opts_[key]; + // store option tuple + option.opt = opt; + options_.insert(std::make_pair(key, option)); + return options_[key].opt; } template <typename 
T, // options with vector values - CLI::enable_if_t<CLI::is_vector<T>::value, - CLI::detail::enabler> = CLI::detail::dummy> + CLI::enable_if_t<CLI::is_vector<T>::value, CLI::detail::enabler> = CLI::detail::dummy> CLI::Option *add_option(const std::string &key, const std::string &args, const std::string &help, @@ -293,13 +307,17 @@ private: // define YAML entry if requested if(addToConfig) config_[key] = val; - // create variable for the option - allVars_.insert(std::make_pair(key, std::make_shared<any_type>(val))); + + // create option tuple + CLIOptionTuple option; + option.idx = counter_++; + option.var = std::make_shared<any_type>(val); // callback function collecting command-line arguments CLI::callback_t fun = [this, key](CLI::results_t res) { + options_[key].modified = true; // get vector variable associated with the option - auto &vec = allVars_[key]->as<T>(); + auto &vec = options_[key].var->as<T>(); vec.clear(); bool ret = true; // handle '[]' as an empty vector @@ -330,15 +348,15 @@ private: if(defaulted) opt->default_str(CLI::detail::join(val)); - // store option object - opts_.insert(std::make_pair(key, opt)); - return opts_[key]; + // store option tuple + option.opt = opt; + options_.insert(std::make_pair(key, option)); + return options_[key].opt; } template <typename T, // options with boolean values, called flags in CLI11 - CLI::enable_if_t<CLI::is_bool<T>::value, - CLI::detail::enabler> = CLI::detail::dummy> + CLI::enable_if_t<CLI::is_bool<T>::value, CLI::detail::enabler> = CLI::detail::dummy> CLI::Option *add_option(const std::string &key, const std::string &args, const std::string &help, @@ -348,19 +366,23 @@ private: // define YAML entry if requested if(addToConfig) config_[key] = val; - // create variable for the option - allVars_.insert(std::make_pair(key, std::make_shared<any_type>(val))); + + // create option tuple + CLIOptionTuple option; + option.idx = counter_++; + option.var = std::make_shared<any_type>(val); // callback function setting 
the flag CLI::callback_t fun = [this, key](CLI::results_t res) { + options_[key].modified = true; // get parser result, it is safe as boolean options have an implicit value auto val = res[0]; auto ret = true; if(val == "true" || val == "on" || val == "yes" || val == "1") { - allVars_[key]->as<T>() = true; + options_[key].var->as<T>() = true; config_[key] = true; } else if(val == "false" || val == "off" || val == "no" || val == "0") { - allVars_[key]->as<T>() = false; + options_[key].var->as<T>() = false; config_[key] = false; } else { ret = false; @@ -378,9 +400,10 @@ private: // allow to use the flag without any argument opt->implicit_val("true"); - // store option object - opts_.insert(std::make_pair(key, opt)); - return opts_[key]; + // store option tuple + option.opt = opt; + options_.insert(std::make_pair(key, option)); + return options_[key].opt; } }; diff --git a/src/common/config.cpp b/src/common/config.cpp index c5209008..e5208b0d 100755 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -38,7 +38,7 @@ void Config::initialize(int argc, char** argv, cli::mode mode, bool validate) { std::string quote; // attempt to quote special chars if (arg.empty() || arg.find_first_of(" #`\"'\\${}|&^?*!()%><") != std::string::npos) quote = "'"; - arg = regex::regex_replace(arg, std::regex("'"), "'\\''"); + arg = regex::regex_replace(arg, regex::regex("'"), "'\\''"); if (!cmdLine.empty()) cmdLine.push_back(' '); cmdLine += quote + arg + quote; diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e7676b01..539579a1 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -77,8 +77,9 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { "allow the use of environment variables in paths, of the form ${VAR_NAME}"); cli.add<bool>("--relative-paths", "All paths are relative to the config file location"); - cli.add<bool>("--dump-config", - "Dump current (modified) configuration to stdout and exit"); + 
cli.add_nondefault<std::string>("--dump-config", + "Dump current (modified) configuration to stdout and exit. Possible values: full, minimal") + ->implicit_val("full"); // clang-format on } @@ -255,8 +256,13 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "If these files do not exist they are created"); #ifdef USE_SENTENCEPIECE cli.add<std::vector<float>>("--sentencepiece-alphas", - "Sampling factors for SentencePieceVocab;" - "i-th factor corresponds to i-th vocabulary"); + "Sampling factors for SentencePiece vocabulary; i-th factor corresponds to i-th vocabulary"); + cli.add<std::string>("--sentencepiece-options", + "Pass-through command-line options to SentencePiece trainer"); + cli.add<size_t>("--sentencepiece-max-lines", + "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. " + "When set to 0 all lines are going to be used.", + 10000000); #endif // scheduling options cli.add<size_t>("--after-epochs,-e", @@ -356,10 +362,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "none"); cli.add<std::string>("--guided-alignment-cost", "Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)", - "ce"); + "mse"); cli.add<double>("--guided-alignment-weight", "Weight for guided alignment cost", - 1); + 0.1); cli.add_nondefault<std::string>("--data-weighting", "Path to a file with sentence or word weights"); cli.add<std::string>("--data-weighting-type", @@ -396,8 +402,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { "Validate model every arg updates (append 't' for every arg target labels)", "10000u"); cli.add<std::vector<std::string>>("--valid-metrics", - "Metric to use during validation: cross-entropy, perplexity, valid-script, translation." - " Multiple metrics can be specified", + "Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, " + " translation, bleu, bleu-detok. 
Multiple metrics can be specified", std::vector<std::string>({"cross-entropy"})); cli.add<size_t>("--early-stopping", "Stop if the first validation metric does not improve for arg consecutive validation steps", @@ -452,7 +458,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Paths to input file(s), stdin by default", std::vector<std::string>({"stdin"})); cli.add<std::string>("--output,-o", - "Paths to output file(s), stdout by default", + "Path to output file, stdout by default", "stdout"); cli.add<std::vector<std::string>>("--vocabs,-v", "Paths to vocabulary files have to correspond to --input"); @@ -511,6 +517,9 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { // TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice cli.add<std::vector<std::string>>("--train-sets,-t", "Paths to corpora to be scored: source target"); + cli.add<std::string>("--output,-o", + "Path to output file, stdout by default", + "stdout"); cli.add<std::vector<std::string>>("--vocabs,-v", "Paths to vocabulary files have to correspond to --train-sets." " If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}." 
@@ -519,6 +528,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { "Score n-best list instead of plain text corpus"); cli.add<std::string>("--n-best-feature", "Feature name to be inserted into n-best list", "Score"); + cli.add<bool>("--normalize,-n", + "Divide translation score by translation length"); cli.add_nondefault<std::string>("--summary", "Only print total cost, possible values: cross-entropy (ce-mean), ce-mean-words, ce-sum, perplexity") ->implicit_val("cross-entropy"); @@ -607,8 +618,40 @@ void ConfigParser::addSuboptionsInputLength(cli::CLIWrapper& cli) { // clang-format on } +void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) { + // clang-format off + // support for universal encoder ULR https://arxiv.org/pdf/1802.05368.pdf + cli.add<bool>("--ulr", + "Enable ULR (Universal Language Representation)", + false); + // reading pre-trained universal embeddings for multi-sources. + // Note that source and target here is relative to ULR not the translation langs + // queries: EQ in Fig2 : is the unified embeddings projected to one space. + cli.add<std::string>("--ulr-query-vectors", + "Path to file with universal sources embeddings from projection into universal space", + ""); + // keys: EK in Fig2 : is the keys of the target embbedings projected to unified space (i.e. ENU in + // multi-lingual case) + cli.add<std::string>("--ulr-keys-vectors", + "Path to file with universal sources embeddings of traget keys from projection into universal space", + ""); + cli.add<bool>("--ulr-trainable-transformation", + "Make Query Transformation Matrix A trainable", + false); + cli.add<int>("--ulr-dim-emb", + "ULR monolingual embeddings dimension"); + cli.add<float>("--ulr-dropout", + "ULR dropout on embeddings attentions. Default is no dropout", + 0.0f); + cli.add<float>("--ulr-softmax-temperature", + "ULR softmax temperature to control randomness of predictions. 
Deafult is 1.0: no temperature", + 1.0f); + // clang-format on +} + void ConfigParser::expandAliases(cli::CLIWrapper& cli) { YAML::Node config; + // The order of aliases does matter as later options overwrite earlier if(config_["best-deep"].as<bool>()) { config["layer-normalization"] = true; @@ -622,11 +665,10 @@ void ConfigParser::expandAliases(cli::CLIWrapper& cli) { config["skip"] = true; } - // @TODO: Quite sure CLIWrapper should not do that; - // that's semantics that seem to belong into the current class - // and has not really anything to do with CLI proper. - if(config) - cli.overwriteDefault(config); + if(config) { + auto success = cli.updateConfig(config); + ABORT_IF(!success, "Unknown option(s) in aliases, check if aliases consist of correct options"); + } } void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { @@ -661,8 +703,8 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { auto configPaths = findConfigPaths(); if(!configPaths.empty()) { auto config = loadConfigFiles(configPaths); - // combine loaded options with the main config object - cli.overwriteDefault(config); + auto success = cli.updateConfig(config); + ABORT_IF(!success, "There are option(s) in a config file that are not expected"); } if(get<bool>("interpolate-env-vars")) { @@ -677,11 +719,10 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { // remove extra config files from the config to avoid redundancy config_.remove("config"); - if(get<bool>("dump-config")) { + if(has("dump-config")) { + bool skipDefault = get<std::string>("dump-config") == "minimal"; config_.remove("dump-config"); - YAML::Emitter emit; - cli::OutputYaml(config_, emit); - std::cout << emit.c_str() << std::endl; + std::cout << cli.dumpConfig(skipDefault) << std::endl; exit(0); } @@ -714,8 +755,7 @@ std::vector<std::string> ConfigParser::findConfigPaths() { return paths; } -YAML::Node ConfigParser::loadConfigFiles( - const std::vector<std::string>& 
paths) { +YAML::Node ConfigParser::loadConfigFiles(const std::vector<std::string>& paths) { YAML::Node configAll; for(auto& path : paths) { @@ -750,33 +790,4 @@ YAML::Node ConfigParser::loadConfigFiles( YAML::Node ConfigParser::getConfig() const { return config_; } - -void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) { - // support for universal encoder ULR https://arxiv.org/pdf/1802.05368.pdf - cli.add<bool>("--ulr", - "Is ULR (Universal Language Representation) enabled?", - false); - // reading pre-trained universal embedings for multi-sources - // note that source and target here is relative to ULR not the translation langs - //queries: EQ in Fig2 : is the unified embbedins projected to one space. - //"Path to file with universal sources embeddings from projection into universal space") - cli.add<std::string>("--ulr-query-vectors", - "Path to file with universal sources embeddings from projection into universal space", - ""); - //keys: EK in Fig2 : is the keys of the target embbedins projected to unified space (i.e. ENU in multi-lingual case) - cli.add<std::string>("--ulr-keys-vectors", - "Path to file with universal sources embeddings of traget keys from projection into universal space", - ""); - cli.add<bool>("--ulr-trainable-transformation", - "Is Query Transformation Matrix A trainable ?", - false); - cli.add<int>("--ulr-dim-emb", - "ULR mono embed dim"); - cli.add<float>("--ulr-dropout", - "ULR dropout on embeddings attentions: default is no dropuout", - 0.0f); - cli.add<float>("--ulr-softmax-temperature", - "ULR softmax temperature to control randomness of predictions- deafult is 1.0: no temperature ", - 1.0f); -} } // namespace marian diff --git a/src/common/config_parser.h b/src/common/config_parser.h index 80f7e81c..de1cb70e 100755 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -63,7 +63,7 @@ private: // Abort if not set. 
template <typename T> T get(const std::string& key) const { - ABORT_IF(!has(key), "CLI object has no key {}", key); + ABORT_IF(!has(key), "CLI object has no key '{}'", key); return config_[key].as<T>(); } diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 625748e9..5086c726 100755 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -84,9 +84,6 @@ void ConfigValidator::validateOptionsTraining() const { ABORT_IF(!modelDir.empty() && !filesystem::isDirectory(modelDir), "Model directory does not exist"); - ABORT_IF(!modelDir.empty() && !filesystem::canWrite(modelDir), - "No write permission in model directory"); - ABORT_IF( has("valid-sets") && get<std::vector<std::string>>("valid-sets").size() != trainSets.size(), "There should be as many validation sets as training sets"); diff --git a/src/common/file_stream.h b/src/common/file_stream.h index caa12a6c..87cb7f9a 100755 --- a/src/common/file_stream.h +++ b/src/common/file_stream.h @@ -1,38 +1,24 @@ #pragma once -// @TODO: this file still contains lots of stuff from boost::filesystem and boost::iostreams, -// this has to be figured out. 
- #include "common/filesystem.h" #include "common/logging.h" #include "common/definitions.h" -#include <boost/filesystem/fstream.hpp> -#include <boost/iostreams/device/file_descriptor.hpp> -#pragma warning(push) -#pragma warning(disable: 4458) // declaration of 'traits_type' hides class member -#pragma warning(disable: 4456) // declaration of 'c' hides previous local declaration -#pragma warning(disable: 4244) // conversion from 'int' to 'char', possible loss of data -#pragma warning(disable: 4706) // assignment within conditional expression -#include <boost/iostreams/filter/gzip.hpp> -#pragma warning(pop) -#ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsuggest-override" -#endif -#include <boost/iostreams/filtering_stream.hpp> -#ifdef __GNUC__ +#include "3rd_party/zstr/zstr.hpp" #pragma GCC diagnostic pop -#endif + +#include <boost/iostreams/device/file_descriptor.hpp> +#include <boost/iostreams/stream_buffer.hpp> + #include <iostream> #include <memory> #ifdef _MSC_VER - #include <fcntl.h> #include <io.h> #include <stdlib.h> - #endif namespace marian { @@ -40,7 +26,7 @@ namespace io { class TemporaryFile { private: - int fd_; + int fd_{-1}; bool unlink_; std::string name_; @@ -145,79 +131,78 @@ public: class InputFileStream { public: - InputFileStream(const std::string& file) : file_(file), ifstream_(file_.getBoost()) { - ABORT_IF( - !marian::filesystem::exists(file_), "File '{}' does not exist", file); - - if(file_.extension() == marian::filesystem::Path(std::string(".gz"))) - istream_.push(boost::iostreams::gzip_decompressor()); - istream_.push(ifstream_); + InputFileStream(const std::string& file) + : file_(file) { + ABORT_IF(!marian::filesystem::exists(file_), "File '{}' does not exist", file); + + if(file_.extension() == marian::filesystem::Path(".gz")) + // @TODO: consider make_unique for next refactoring + istream_.reset(new zstr::ifstream(file_.string())); + else + istream_.reset(new std::ifstream(file_.string())); } 
InputFileStream(TemporaryFile& tempfile) : fds_(tempfile.getFileDescriptor(), boost::iostreams::never_close_handle) { lseek(tempfile.getFileDescriptor(), 0, SEEK_SET); - istream_.push(fds_, 1024); + + namespace bio = boost::iostreams; + fdsBuffer_.reset(new bio::stream_buffer<bio::file_descriptor_source>(fds_)); + istream_.reset(new std::istream(fdsBuffer_.get())); } - InputFileStream(std::istream& strm) { istream_.push(strm, 0); } + InputFileStream(std::istream& strm) + : istream_(new std::istream(strm.rdbuf())) {} - operator std::istream&() { return istream_; } + operator std::istream&() { return *istream_; } - operator bool() { return (bool)istream_; } + operator bool() { return (bool)*istream_; } bool bad() const { - return istream_.bad(); + return istream_->bad(); } bool fail() const { - return istream_.fail(); + return istream_->fail(); } char widen(char c) { - return istream_.widen(c); - } - - bool isOpen() const { - return ifstream_.is_open(); + return istream_->widen(c); } std::string path() { return file_.string(); } - bool empty() { return ifstream_.peek() == std::ifstream::traits_type::eof(); } + bool empty() { return istream_->peek() == std::ifstream::traits_type::eof(); } void setbufsize(size_t size) const { - ifstream_.rdbuf()->pubsetbuf(0, 0); - //readBuf_ = std::make_unique<char[]>(size); + istream_->rdbuf()->pubsetbuf(0, 0); readBuf_.reset(new char[size]); - ifstream_.rdbuf()->pubsetbuf(readBuf_.get(), 0); + istream_->rdbuf()->pubsetbuf(readBuf_.get(), 0); } template <typename T> friend InputFileStream& operator>>(InputFileStream& stream, T& t) { - stream.istream_ >> t; + *stream.istream_ >> t; // bad() seems to be correct here. Should not abort on EOF. 
- ABORT_IF(stream.bad(), - "Error reading from file '{}'", - stream.path()); + ABORT_IF(stream.bad(), "Error reading from file '{}'", stream.path()); return stream; } template <typename T> size_t read(T* ptr, size_t num = 1) { - istream_.read((char*)ptr, num * sizeof(T)); + istream_->read((char*)ptr, num * sizeof(T)); // fail() seems to be correct here. Failure to read should abort. - ABORT_IF(fail(), - "Error reading from file '{}'", - path()); + ABORT_IF(fail(), "Error reading from file '{}'", path()); return num * sizeof(T); } private: marian::filesystem::Path file_; - boost::filesystem::ifstream ifstream_; + std::unique_ptr<std::istream> istream_; + boost::iostreams::file_descriptor_source fds_; - boost::iostreams::filtering_istream istream_; + std::unique_ptr<boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_source>> fdsBuffer_; + mutable UPtr<char[]> readBuf_; // for setbuf() }; @@ -226,9 +211,7 @@ private: static inline InputFileStream& getline(InputFileStream& in, std::string& line) { std::getline((std::istream&)in, line); // bad() seems to be correct here. Should not abort on EOF. - ABORT_IF(in.bad(), - "Error reading from file '{}'", - in.path()); + ABORT_IF(in.bad(), "Error reading from file '{}'", in.path()); // strip terminal CR if present if(in && !line.empty() && line.back() == in.widen('\r')) line.pop_back(); @@ -240,9 +223,7 @@ static inline InputFileStream& getline(InputFileStream& in, std::string& line) { static inline InputFileStream& getline(InputFileStream& in, std::string& line, char delim) { std::getline((std::istream&)in, line, delim); // bad() seems to be correct here. Should not abort on EOF. 
- ABORT_IF(in.bad(), - "Error reading from file '{}'", - in.path()); + ABORT_IF(in.bad(), "Error reading from file '{}'", in.path()); // strip terminal CR if present if(in && !line.empty() && line.back() == in.widen('\r')) line.pop_back(); @@ -251,62 +232,61 @@ static inline InputFileStream& getline(InputFileStream& in, std::string& line, c class OutputFileStream { public: - OutputFileStream(const std::string& file) : file_(file), ofstream_(file_.getBoost()) { - ABORT_IF( - !marian::filesystem::exists(file_), "File '{}' does not exist", file); + OutputFileStream(const std::string& file) : file_(file) { + if(file_.extension() == marian::filesystem::Path(".gz")) + ostream_.reset(new zstr::ofstream(file_.string())); + else + ostream_.reset(new std::ofstream(file_.string())); - if(file_.extension() == marian::filesystem::Path(std::string(".gz"))) - ostream_.push(boost::iostreams::gzip_compressor()); - ostream_.push(ofstream_); + ABORT_IF(!marian::filesystem::exists(file_), "File '{}' could not be opened", file); } OutputFileStream(TemporaryFile& tempfile) : fds_(tempfile.getFileDescriptor(), boost::iostreams::never_close_handle) { lseek(tempfile.getFileDescriptor(), 0, SEEK_SET); - ostream_.push(fds_, 1024); + + namespace bio = boost::iostreams; + fdsBuffer_.reset(new bio::stream_buffer<bio::file_descriptor_sink>(fds_)); + ostream_.reset(new std::ostream(fdsBuffer_.get())); } - OutputFileStream(std::ostream& strm) { ostream_.push(strm, 0); } + OutputFileStream(std::ostream& strm) { + ostream_.reset(new std::ostream(strm.rdbuf())); + } - operator std::ostream&() { return ostream_; } + operator std::ostream&() { return *ostream_; } - operator bool() { return (bool)ostream_; } + operator bool() { return (bool)*ostream_; } bool bad() const { - return ostream_.bad(); + return ostream_->bad(); } bool fail() const { - return ostream_.fail(); + return ostream_->fail(); } template <typename T> friend OutputFileStream& operator<<(OutputFileStream& stream, const T& t) { - 
stream.ostream_ << t; + *stream.ostream_ << t; // fail() seems to be correct here. Failure to write should abort. - ABORT_IF(stream.fail(), - "Error writing to file '{}'", - stream.path()); + ABORT_IF(stream.fail(), "Error writing to file '{}'", stream.path()); return stream; } // handle things like std::endl which is actually a function not a value friend OutputFileStream& operator<<(OutputFileStream& stream, std::ostream& (*var)(std::ostream&)) { - stream.ostream_ << var; + *stream.ostream_ << var; // fail() seems to be correct here. Failure to write should abort. - ABORT_IF(stream.fail(), - "Error writing to file '{}'", - stream.path()); + ABORT_IF(stream.fail(), "Error writing to file '{}'", stream.path()); return stream; } template <typename T> size_t write(const T* ptr, size_t num = 1) { - ostream_.write((char*)ptr, num * sizeof(T)); + ostream_->write((char*)ptr, num * sizeof(T)); // fail() seems to be correct here. Failure to write should abort. - ABORT_IF(fail(), - "Error writing to file '{}'", - path()); + ABORT_IF(fail(), "Error writing to file '{}'", path()); return num * sizeof(T); } @@ -314,9 +294,10 @@ public: private: marian::filesystem::Path file_; - boost::filesystem::ofstream ofstream_; + std::unique_ptr<std::ostream> ostream_; + boost::iostreams::file_descriptor_sink fds_; - boost::iostreams::filtering_ostream ostream_; + std::unique_ptr<boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_sink>> fdsBuffer_; }; } diff --git a/src/common/filesystem.h b/src/common/filesystem.h index 66927313..9dd0ae55 100755 --- a/src/common/filesystem.h +++ b/src/common/filesystem.h @@ -1,66 +1,57 @@ #pragma once -// @TODO: This is a temporary file to move every function from boost::filesystem used in Marian -// into one place. Marian should call functions only from this file. boost::filesystem will -// be removed. This needs to be portable to Windows too. +// This is a shallow wrapper around a filesystem path library. 
+// We used this to wrap boost::filesystem, now we are wrapping +// Pathie, a small open source lib. +// @TODO: go back to canonical names for functions and objects +// as specified in C++17 so it becomes easy to move in the future -#ifdef __GNUC__ #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" #pragma GCC diagnostic ignored "-Wsuggest-override" -#endif -#include <boost/filesystem.hpp> -#ifdef __GNUC__ -// "ignored -Wunused-variable" above ignores 'static const' declarations (where 'static' -// is not needed). We work around by referencing the offending variables in dummy code. -static inline void boost_dummy_filesystem() { boost::system::posix_category; boost::system::errno_ecat; boost::system::native_ecat; } +#include "3rd_party/pathie-cpp/include/path.hpp" +#include "3rd_party/pathie-cpp/include/errors.hpp" #pragma GCC diagnostic pop -#endif namespace marian { namespace filesystem { - struct Path { + class Path { private: - boost::filesystem::path path; + Pathie::Path path; public: Path() {} Path(const Path& p) : path{p.path} {} Path(const std::string& s) : path{s} {} - Path(const boost::filesystem::path& p) : path{p} {} + Path(const Pathie::Path& p) : path{p} {} Path parentPath() const { - return Path{path.parent_path()}; + return Path(path.parent()); } Path filename() const { - return Path{path.filename()}; + return Path(path.basename()); } Path extension() const { - return Path{path.extension()}; + return Path(path.extension()); } bool empty() const { - return path.empty(); + return path.str().empty(); } - const boost::filesystem::path& getBoost() const { + const Pathie::Path& getImpl() const { return path; } - operator std::string&() { - return (std::string&)path; - } - operator std::string() const { - return path.string(); + return path.str(); } std::string string() const { - return path.string(); + return path.str(); } bool operator==(const Path& p) const { @@ -73,35 +64,31 @@ namespace filesystem { }; static inline Path 
currentPath() { - return Path{boost::filesystem::current_path()}; + return Path(Pathie::Path::pwd()); } - static inline Path canonical(const Path& p, const Path& dir) { - return Path{ boost::filesystem::canonical(p.getBoost(), dir.getBoost()) }; + static inline Path canonical(const Path& p, const Path& base) { + // create absolute base path + return p.getImpl().absolute(base.getImpl()).expand(); } static inline bool exists(const Path& p) { - return boost::filesystem::exists(p.getBoost()); + return p.getImpl().exists(); } static inline size_t fileSize(const Path& p) { - return boost::filesystem::file_size(p.getBoost()); + return p.getImpl().size(); } static inline bool isDirectory(const Path& p) { - return boost::filesystem::is_directory(p.getBoost()); - } - - static inline bool canWrite(const Path& p) { - return (boost::filesystem::status(p.getBoost()).permissions() & boost::filesystem::owner_write) != 0; + return p.getImpl().is_directory(); } - // concatenation? static inline Path operator/ (const Path& lhs, const Path& rhs) { - return lhs.getBoost() / rhs.getBoost(); + return Path(lhs.getImpl() / rhs.getImpl()); } - using FilesystemError = boost::filesystem::filesystem_error; + using FilesystemError = Pathie::PathieError; } }
\ No newline at end of file diff --git a/src/common/logging.cpp b/src/common/logging.cpp index fdb999ca..0170d633 100755 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -150,9 +150,11 @@ void switchtoMultinodeLogging(std::string nodeIdStr) { namespace marian { - void noinline logCallStack(size_t skipLevels) - { - auto callStack = ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true); - checkedLog("general", "critical", "Call stack:{}", callStack); + std::string noinline getCallStack(size_t skipLevels) { + return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true); + } + + void noinline logCallStack(size_t skipLevels) { + checkedLog("general", "critical", getCallStack(skipLevels)); } } diff --git a/src/common/logging.h b/src/common/logging.h index 091044ea..cdaa806c 100755 --- a/src/common/logging.h +++ b/src/common/logging.h @@ -6,6 +6,7 @@ namespace marian { void logCallStack(size_t skipLevels); + std::string getCallStack(size_t skipLevels); } /** @@ -46,13 +47,19 @@ namespace marian { * * @param ... Message text and variables */ -#define ABORT(...) \ - do { \ - checkedLog("general", "critical", __VA_ARGS__); \ - ::marian::logCallStack(/*skipLevels=*/0); \ - std::cerr << "Aborted from " << FUNCTION_NAME << " in " << __FILE__ \ - << ": " << __LINE__ << std::endl; \ - std::abort(); \ +#define ABORT(...) \ + do { \ + auto logger = spdlog::get("general"); \ + if(logger == nullptr) \ + logger = createStderrLogger("general", "[%Y-%m-%d %T] Error: %v"); \ + else \ + logger->set_pattern("[%Y-%m-%d %T] Error: %v"); \ + checkedLog("general", "critical", __VA_ARGS__); \ + checkedLog("general", "critical", "Aborted from {} in {}:{}", \ + FUNCTION_NAME, __FILE__, __LINE__); \ + logger->set_pattern("%v"); \ + checkedLog("general", "critical", marian::getCallStack(/*skipLevels=*/0)); \ + std::abort(); \ } while(0) /** @@ -85,18 +92,6 @@ template <class... 
Args> void checkedLog(std::string logger, std::string level, Args... args) { Logger log = spdlog::get(logger); if(!log) { - if(level == "critical") { - // log and errlog are not the same, hence we need to check - // if an error logger exists first and not try to create a - // second one. Otherwise this will throw an exception. - Logger errlog = spdlog::get("error"); - if(!errlog) - errlog = createStderrLogger("error", "Error: %v - aborting"); - errlog->critical(args...); - } - // @TODO: should other loggers do something? This seems to be - // a sink state when logs are not intialized. Critical errors - // should log nevertheless, non-critical go unreported. return; } diff --git a/src/common/timer.h b/src/common/timer.h index 6f86b54f..4172cfc7 100755 --- a/src/common/timer.h +++ b/src/common/timer.h @@ -12,6 +12,14 @@ namespace marian { namespace timer { +// Helper function to get the current date and time +static std::string currentDate() { + std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + char date[100] = {0}; + std::strftime(date, sizeof(date), "%F %X %z", std::localtime(&now)); + return date; +} + // Timer measures elapsed time. // This is a wrapper around std::chrono providing wall time only class Timer { diff --git a/src/common/version.cpp b/src/common/version.cpp index a31c7df7..75814d92 100755 --- a/src/common/version.cpp +++ b/src/common/version.cpp @@ -1,12 +1,10 @@ #include "common/version.h"
-#include "common/project_version.h" // cmake-generated file, major/minor/tweak versions
-#include "common/git_revision.h" // make-generated file, contains git commit info
+#include "common/git_revision.h" // make-generated file, contains git commit info
+#include "common/project_version.h" // cmake-generated file, major/minor/tweak versions
namespace marian {
-std::string buildVersion()
-{
+std::string buildVersion() {
return std::string(PROJECT_VERSION) + " " + GIT_REVISION;
}
-
}
diff --git a/src/common/version.h b/src/common/version.h index a0c8ab22..a425af93 100755 --- a/src/common/version.h +++ b/src/common/version.h @@ -3,5 +3,5 @@ #include <string>
namespace marian {
- std::string buildVersion();
+ std::string buildVersion();
}
diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 70660467..7a7a846e 100755 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -106,9 +106,12 @@ void Corpus::shuffleData(const std::vector<std::string>& paths) { size_t numStreams = paths.size(); + size_t numSentences; std::vector<std::vector<std::string>> corpus(numStreams); // [stream][id] - if (!corpusInRAM_.empty()) // when caching, we use what we have instead + if (!corpusInRAM_.empty()) { // when caching, we use what we have instead corpus = std::move(corpusInRAM_); // temporarily move ownership here, will be moved back + numSentences = corpus[0].size(); + } else { files_.resize(numStreams); for(size_t i = 0; i < numStreams; ++i) { @@ -132,10 +135,9 @@ void Corpus::shuffleData(const std::vector<std::string>& paths) { ABORT_IF(eofsHit != 0, "Not all input files have the same number of lines"); } files_.clear(); - LOG(info, "[data] Done reading {} sentences.", corpus[0].size()); + numSentences = corpus[0].size(); + LOG(info, "[data] Done reading {} sentences", numSentences); } - size_t numSentences = corpus[0].size(); - LOG(info, "[data] Done reading {} sentences", numSentences); // randomize sequence ids, and remember them ids_.resize(numSentences); diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 880a6946..c9704313 100755 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -75,10 +75,14 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate) if(maxVocabs.size() < paths_.size()) maxVocabs.resize(paths_.size(), 0); + LOG(info, "No vocabulary files given, trying to find or build based on training data. 
" + "Vocabularies will be built separately for each file."); + // Create vocabs if not provided for(size_t i = 0; i < paths_.size(); ++i) { Ptr<Vocab> vocab = New<Vocab>(options_, i); - int vocSize = vocab->loadOrCreate("", paths_[i], maxVocabs[i]); + std::vector<std::string> trainPaths = { paths_[i] }; + int vocSize = vocab->loadOrCreate("", trainPaths, maxVocabs[i]); // TODO: this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab // creation functionality from the class. @@ -92,9 +96,31 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate) if(maxVocabs.size() < vocabPaths.size()) maxVocabs.resize(paths_.size(), 0); + // Helper object for grouping training data based on vocabulary file name + struct PathsAndSize { + std::set<std::string> paths; // contains all paths that are used for training the vocabulary + size_t size; // contains the maximum vocabulary size + }; + + // Group training files based on vocabulary path. If the same + // vocab path corresponds to different training files, this means + // that a single vocab should combine tokens from all files. + std::map<std::string, PathsAndSize> groupVocab; + for(size_t i = 0; i < vocabPaths.size(); ++i) { + groupVocab[vocabPaths[i]].paths.insert(paths_[i]); + if(groupVocab[vocabPaths[i]].size < maxVocabs[i]) + groupVocab[vocabPaths[i]].size = maxVocabs[i]; + } + for(size_t i = 0; i < vocabPaths.size(); ++i) { Ptr<Vocab> vocab = New<Vocab>(options_, i); - int vocSize = vocab->loadOrCreate(vocabPaths[i], paths_[i], maxVocabs[i]); + + // Get the set of files that corresponds to the vocab. If the next file is the same vocab, + // it will not be created again, but just correctly loaded. 
+ auto pathsAndSize = groupVocab[vocabPaths[i]]; + std::vector<std::string> groupedPaths(pathsAndSize.paths.begin(), pathsAndSize.paths.end()); + int vocSize = vocab->loadOrCreate(vocabPaths[i], groupedPaths, pathsAndSize.size); + // TODO: this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab // creation functionality from the class. diff --git a/src/data/default_vocab.cpp b/src/data/default_vocab.cpp index 98bf5d8f..1ce055db 100755 --- a/src/data/default_vocab.cpp +++ b/src/data/default_vocab.cpp @@ -30,14 +30,16 @@ private: class VocabFreqOrderer { private: - std::unordered_map<std::string, size_t>& counter_; + const std::unordered_map<std::string, size_t>& counter_; public: - VocabFreqOrderer(std::unordered_map<std::string, size_t>& counter) - : counter_(counter) {} + VocabFreqOrderer(const std::unordered_map<std::string, size_t>& counter) + : counter_(counter) {} + // order first by decreasing frequency, + // if frequencies are the same order lexicographically by vocabulary string bool operator()(const std::string& a, const std::string& b) const { - return counter_[a] > counter_[b] || (counter_[a] == counter_[b] && a < b); + return counter_.at(a) > counter_.at(b) || (counter_.at(a) == counter_.at(b) && a < b); } }; @@ -117,10 +119,6 @@ public: auto str = pair.first; auto id = pair.second; - if(SPEC2SYM.count(str)) { - seenSpecial.insert(id); - } - // note: this requires ids to be sorted by frequency if(!max || id < (Word)max) { insertWord(id, str); @@ -174,8 +172,6 @@ public: }; // @TODO: the hard-att code has not yet been updated to accept EOS at any id requireWord(DEFAULT_EOS_ID, DEFAULT_EOS_STR); - for(auto id : seenSpecial) - requireWord(id, SYM2SPEC.at(id)); } return std::max((int)id2str_.size(), max); @@ -187,52 +183,50 @@ public: unkId_ = insertWord(DEFAULT_UNK_ID, DEFAULT_UNK_STR); } - void create(const std::string& vocabPath, const 
std::string& trainPath) override { - LOG(info, "[data] Creating vocabulary {} from {}", vocabPath, trainPath); - - filesystem::Path path(vocabPath); - auto dir = path.parentPath(); - if(dir.empty()) - dir = filesystem::currentPath(); + virtual void create(const std::string& vocabPath, + const std::vector<std::string>& trainPaths, + size_t maxSize = 0) override { - ABORT_IF(!dir.empty() && !filesystem::isDirectory(dir), - "Specified vocab directory {} does not exist", - dir.string()); + LOG(info, "[data] Creating vocabulary {} from {}", + vocabPath, + utils::join(trainPaths, ", ")); - ABORT_IF(!dir.empty() && !filesystem::canWrite(dir), - "No write permission in vocab directory {}", - dir.string()); + if(vocabPath != "stdout") { + filesystem::Path path(vocabPath); + auto dir = path.parentPath(); + if(dir.empty()) + dir = filesystem::currentPath(); - ABORT_IF(filesystem::exists(vocabPath), - "DefaultVocab file '{}' exists. Not overwriting", - path.string()); + ABORT_IF(!dir.empty() && !filesystem::isDirectory(dir), + "Specified vocab directory {} does not exist", + dir.string()); - io::InputFileStream trainStrm(trainPath); - io::OutputFileStream vocabStrm(vocabPath); - create(trainStrm, vocabStrm); + ABORT_IF(filesystem::exists(vocabPath), + "Vocabulary file '{}' exists. Not overwriting", + path.string()); + } + + std::unordered_map<std::string, size_t> counter; + for(const auto& trainPath : trainPaths) + addCounts(counter, trainPath); + create(vocabPath, counter, maxSize); } - void create(io::InputFileStream& trainStrm, - io::OutputFileStream& vocabStrm, - size_t maxSize = 0) override { - std::string line; - std::unordered_map<std::string, size_t> counter; +private: - std::unordered_set<Word> seenSpecial; + void addCounts(std::unordered_map<std::string, size_t>& counter, + const std::string& trainPath) { + std::unique_ptr<io::InputFileStream> trainStrm( + trainPath == "stdin" ? 
new io::InputFileStream(std::cin) + : new io::InputFileStream(trainPath) + ); - while(getline((std::istream&)trainStrm, line)) { + std::string line; + while(getline(*trainStrm, line)) { std::vector<std::string> toks; - - // we do not want any unexpected behavior during creation - // e.g. sampling, hence use inference mode utils::split(line, toks, " "); for(const std::string& tok : toks) { - if(SPEC2SYM.count(tok)) { - seenSpecial.insert(SPEC2SYM.at(tok)); - continue; - } - auto iter = counter.find(tok); if(iter == counter.end()) counter[tok] = 1; @@ -240,6 +234,11 @@ public: iter->second++; } } + } + + void create(const std::string& vocabPath, + const std::unordered_map<std::string, size_t>& counter, + size_t maxSize = 0) { std::vector<std::string> vocabVec; for(auto& p : counter) @@ -251,14 +250,7 @@ public: vocabYaml.force_insert(DEFAULT_EOS_STR, DEFAULT_EOS_ID); vocabYaml.force_insert(DEFAULT_UNK_STR, DEFAULT_UNK_ID); - for(auto word : seenSpecial) - vocabYaml.force_insert(SYM2SPEC.at(word), word); - Word maxSpec = 1; - for(auto i : seenSpecial) - if(i > maxSpec) - maxSpec = i; - auto vocabSize = vocabVec.size(); if(maxSize > maxSpec) vocabSize = std::min(maxSize - maxSpec - 1, vocabVec.size()); @@ -266,10 +258,13 @@ public: for(size_t i = 0; i < vocabSize; ++i) vocabYaml.force_insert(vocabVec[i], i + maxSpec + 1); - vocabStrm << vocabYaml; + std::unique_ptr<io::OutputFileStream> vocabStrm( + vocabPath == "stdout" ? 
new io::OutputFileStream(std::cout) + : new io::OutputFileStream(vocabPath) + ); + *vocabStrm << vocabYaml; } -private: Words operator()(const std::vector<std::string>& lineTokens, bool addEOS) const { Words words(lineTokens.size()); diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index d9c24415..ed476ec4 100755 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -2,13 +2,18 @@ #ifdef USE_SENTENCEPIECE #include "sentencepiece/src/sentencepiece_processor.h" -#endif +#include "sentencepiece/src/sentencepiece_trainer.h" +#endif +#include "common/config.h" #include "common/options.h" #include "common/logging.h" #include "common/filesystem.h" #include "common/regex.h" +#include <sstream> +#include <random> + namespace marian { #ifdef USE_SENTENCEPIECE @@ -28,9 +33,85 @@ private: Ptr<Options> options_; size_t batchIndex_{0}; + std::mt19937 generator_; + std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX + + // Sample from one file, based on first algorithm from: + // https://en.wikipedia.org/wiki/Reservoir_sampling + void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines, + const std::string& trainPath, size_t maxLines, size_t maxBytes) { + + ABORT_IF(maxLines == 0, "Sample needs to be larger 0"); + + std::unique_ptr<io::InputFileStream> trainStrm( + trainPath == "stdin" ? new io::InputFileStream(std::cin) + : new io::InputFileStream(trainPath) + ); + + std::string line; + while(getline(*trainStrm, line)) { + if(line.size() > 0 && line.size() < maxBytes) { + if(sample.size() < maxLines) { + sample.push_back(line); + } + else { + size_t i = randInt_(generator_) % (seenLines + 1); + if(i < maxLines) + sample[i] = line; + } + seenLines++; + } + } + } + + // Iterate over all input files and collect a representative sample via reservoir sampling. 
+ // The sample will first grow to the desired size and next keep sampling with decreasing + // probability in the hope to get a uniform sample from the union of all files. + size_t reservoirSamplingAll(io::TemporaryFile& temp, + const std::vector<std::string>& trainPaths, + size_t maxLines, size_t maxBytes) { + LOG(info, "[SentencePiece] Sampling at most {} lines from {}", maxLines, utils::join(trainPaths, ", ")); + + std::vector<std::string> sample; + size_t seenLines = 0; + for(const auto& trainPath : trainPaths) + reservoirSampling(sample, seenLines, trainPath, maxLines, maxBytes); + std::shuffle(sample.begin(), sample.end(), generator_); + + io::OutputFileStream out(temp); + for(const auto& line : sample) + out << line << std::endl; + + LOG(info, "[SentencePiece] Selected {} lines", sample.size()); + return sample.size(); + } + + // Just concatenate all files to a temporary file so SentencePiece can consume it. + size_t dumpAll(io::TemporaryFile& temp, + const std::vector<std::string>& trainPaths, + size_t maxBytes) { + LOG(info, "[SentencePiece] Selecting all lines from {}", utils::join(trainPaths, ", ")); + + size_t seenLines = 0; + std::string line; + io::OutputFileStream out(temp); + for(const auto& trainPath : trainPaths) { + io::InputFileStream in(trainPath); + while(getline(in, line)) { + if(line.size() > 0 && line.size() < maxBytes) { + out << line << std::endl; + seenLines++; + } + } + } + + LOG(info, "[SentencePiece] Selected {} lines", seenLines); + return seenLines; + } + public: SentencePieceVocab(Ptr<Options> options, size_t batchIndex) - : options_(options), batchIndex_(batchIndex) { + : options_(options), batchIndex_(batchIndex), generator_(Config::seed) { if(options_->has("sentencepiece-alphas")) { auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas"); @@ -41,47 +122,91 @@ public: if(alpha_ > 0) LOG(debug, - "Setting SentencePieceVocab sampling factor to {} for input {}", + "Setting SentencePiece vocabulary sampling factor to 
{} for input {}", alpha_, batchIndex_); } } - virtual const std::string& canonicalExtension() const { return suffixes_[0]; } - virtual const std::vector<std::string>& suffixes() const { return suffixes_; } + virtual const std::string& canonicalExtension() const override { return suffixes_[0]; } + virtual const std::vector<std::string>& suffixes() const override { return suffixes_; } virtual std::string suffix() { return suffixes_[0]; }; - virtual std::string type() const { return "SentencePieceVocab"; } + virtual std::string type() const override { return "SentencePieceVocab"; } virtual Word getEosId() const override { return (Word)spm_->eos_id(); } virtual Word getUnkId() const override { return (Word)spm_->unk_id(); } - void create(const std::string& /*vocabPath*/, const std::string& /*trainPath*/) { - ABORT("[data] Training of SentencePieceVocab not yet supported"); - } + void create(const std::string& vocabPath, + const std::vector<std::string>& trainPaths, + size_t maxSize) override { + + size_t defaultMaxSize = 32000; + size_t maxLines = options_->get<size_t>("sentencepiece-max-lines"); + size_t maxBytes = 2048; + + LOG(info, "[SentencePiece] Training SentencePiece vocabulary {}", vocabPath); - void create(io::InputFileStream& /*trainStrm*/, - io::OutputFileStream& /*vocabStrm*/, - size_t /*maxSize*/) { - ABORT("[data] Training of SentencePieceVocab not yet supported"); + if(maxSize == 0) { + LOG(info, "[SentencePiece] Vocabulary size is undefined (set with --dim-vocabs ...) 
- setting to {}", defaultMaxSize); + maxSize = defaultMaxSize; + } + + // Create temporary file to hold the sample for the SentencePiece trainer + io::TemporaryFile temp(options_->get<std::string>("tempdir"), false); + std::string tempFileName = temp.getFileName(); + LOG(info, "[SentencePiece] Creating temporary file {}", tempFileName); + + size_t seenLines = 0; + if(maxLines == 0) + seenLines = dumpAll(temp, trainPaths, maxBytes); + else + seenLines = reservoirSamplingAll(temp, trainPaths, maxLines, maxBytes); + + // Compose the SentencePiece training command from filenames and parameters0 + std::stringstream command; + command + << " --bos_id=-1 --eos_id=0 --unk_id=1" // these should not be changed as they match Marian defaults + << " --input=" << tempFileName + << " --model_prefix=" << vocabPath + << " --vocab_size=" << maxSize + << " --max_sentence_length=" << maxBytes + << " --input_sentence_size=" << seenLines + << " " << options_->get<std::string>("sentencepiece-options"); // these are SentencePiece command line options + + // Train the SentencePiece model + const auto status = sentencepiece::SentencePieceTrainer::Train(command.str()); + ABORT_IF(!status.ok(), + "SentencePiece vocabulary error: {}", + status.ToString()); + + LOG(info, "[SentencePiece] Removing {}", vocabPath + ".vocab"); + ABORT_IF(remove((vocabPath + ".vocab").c_str()) != 0, + "Could not remove {}", + vocabPath + ".vocab"); + + LOG(info, "[SentencePiece] Renaming {} to {}", vocabPath + ".model", vocabPath); + ABORT_IF(rename((vocabPath + ".model").c_str(), vocabPath.c_str()) != 0, + "Could not rename {} to {}", + vocabPath + ".model", vocabPath); } - void createFake() { - ABORT("[data] Fake SentencePieceVocab not supported"); + void createFake() override { + ABORT("[SentencePiece] Fake SentencePiece vocabulary not supported"); } - Word operator[](const std::string& token) const { + Word operator[](const std::string& token) const override { return (Word)spm_->PieceToId(token); } - const 
std::string& operator[](Word id) const { + const std::string& operator[](Word id) const override { ABORT_IF(id >= size(), "Unknown word id: ", id); return spm_->IdToPiece(id); } - Words encode(const std::string& line, bool addEOS, bool inference) const { + Words encode(const std::string& line, bool addEOS, bool inference) const override { std::vector<int> spmIds; if(inference || alpha_ == 0) spm_->Encode(line, &spmIds); @@ -95,7 +220,7 @@ public: return words; } - std::string decode(const Words& sentence, bool ignoreEOS) const { + std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override { std::string line; // convert vector of Word to vector of int std::vector<int> spmSentence(sentence.begin(), sentence.end()); @@ -103,29 +228,29 @@ public: return line; } - size_t size() const { + size_t size() const override { return spm_->GetPieceSize(); } - int load(const std::string& vocabPath, int /*max*/) { - LOG(info, "[data] Loading SentencePieceVocab from file {}", vocabPath); + int load(const std::string& vocabPath, int /*max*/) override { + LOG(info, "[data] Loading SentencePiece vocabulary from file {}", vocabPath); ABORT_IF(!filesystem::exists(vocabPath), - "SentencePieceVocab file {} does not exits", - vocabPath); + "SentencePiece vocabulary file {} does not exits", + vocabPath); spm_.reset(new sentencepiece::SentencePieceProcessor()); const auto status = spm_->Load(vocabPath); ABORT_IF(!status.ok(), - "SentencePieceVocab error: {}", - status.ToString()); + "SentencePiece vocabulary error: {}", + status.ToString()); return spm_->GetPieceSize(); } }; -#endif +#endif // USE_SENTENCEPIECE Ptr<VocabBase> createSentencePieceVocab(const std::string& vocabPath, Ptr<Options> options, size_t batchIndex) { bool isSentencePiece = regex::regex_search(vocabPath, regex::regex("\\.(spm)$")); diff --git a/src/data/types.h b/src/data/types.h index 62566a74..2bda6ece 100644 --- a/src/data/types.h +++ b/src/data/types.h @@ -28,27 +28,4 @@ const std::string 
DEFAULT_UNK_STR = "<unk>"; const std::string NEMATUS_EOS_STR = "eos"; const std::string NEMATUS_UNK_STR = "UNK"; -const Word STP_ID = 2; -const Word CPY_ID = 3; -const Word DEL_ID = 4; -const Word RPL_ID = 5; - -const std::string STP_STR = "<step>"; -const std::string CPY_STR = "<c>"; -const std::string DEL_STR = "<d>"; -const std::string RPL_STR = "<r>"; - -const std::unordered_map<std::string, Word> SPEC2SYM = { - {STP_STR, STP_ID}, - {CPY_STR, CPY_ID}, - {DEL_STR, DEL_ID}, - {RPL_STR, RPL_ID}, -}; - -const std::unordered_map<Word, std::string> SYM2SPEC = { - {STP_ID, STP_STR}, - {CPY_ID, CPY_STR}, - {DEL_ID, DEL_STR}, - {RPL_ID, RPL_STR}, -}; } // namespace marian diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 09849b2e..e95ea721 100755 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -13,62 +13,63 @@ Ptr<VocabBase> createVocab(const std::string& vocabPath, Ptr<Options> options, s } int Vocab::loadOrCreate(const std::string& vocabPath, - const std::string& trainPath, - int max) { + const std::vector<std::string>& trainPaths, + size_t maxSize) { size_t size = 0; if(vocabPath.empty()) { // No vocabulary path was given, attempt to first find a vocabulary - // for trainPath + possible suffixes. If not found attempt to create - // as trainPath + canonical suffix. + // for trainPaths[0] + possible suffixes. If not found attempt to create + // as trainPaths[0] + canonical suffix. + // Only search based on first path, maybe disable this at all? 
LOG(info, "No vocabulary path given; " "trying to find default vocabulary based on data path {}", - trainPath); + trainPaths[0]); vImpl_ = createDefaultVocab(); - size = vImpl_->findAndLoad(trainPath, max); + size = vImpl_->findAndLoad(trainPaths[0], maxSize); if(size == 0) { - auto path = trainPath + vImpl_->canonicalExtension(); + auto newVocabPath = trainPaths[0] + vImpl_->canonicalExtension(); LOG(info, "No vocabulary path given; " - "trying to find vocabulary based on data path {}", - trainPath); - vImpl_->create(path, trainPath); - size = vImpl_->load(path, max); + "trying to create vocabulary based on data paths {}", + utils::join(trainPaths, ", ")); + create(newVocabPath, trainPaths, maxSize); + size = load(newVocabPath, maxSize); } } else { if(!filesystem::exists(vocabPath)) { // Vocabulary path was given, but no vocabulary present, // attempt to create in specified location. - create(vocabPath, trainPath); + create(vocabPath, trainPaths, maxSize); } // Vocabulary path exists, attempting to load - size = load(vocabPath, max); + size = load(vocabPath, maxSize); } LOG(info, "[data] Setting vocabulary size for input {} to {}", batchIndex_, size); return (int)size; } -int Vocab::load(const std::string& vocabPath, int max) { +int Vocab::load(const std::string& vocabPath, size_t maxSize) { if(!vImpl_) vImpl_ = createVocab(vocabPath, options_, batchIndex_); - return vImpl_->load(vocabPath, max); + return vImpl_->load(vocabPath, maxSize); } -void Vocab::create(const std::string& vocabPath, const std::string& trainPath) { +void Vocab::create(const std::string& vocabPath, + const std::vector<std::string>& trainPaths, + size_t maxSize) { if(!vImpl_) vImpl_ = createVocab(vocabPath, options_, batchIndex_); - vImpl_->create(vocabPath, trainPath); + vImpl_->create(vocabPath, trainPaths, maxSize); } -void Vocab::create(io::InputFileStream& trainStrm, - io::OutputFileStream& vocabStrm, +void Vocab::create(const std::string& vocabPath, + const std::string& trainPath, size_t 
maxSize) { - if(!vImpl_) - vImpl_ = createDefaultVocab(); // Only DefaultVocab can be built from streams - vImpl_->create(trainStrm, vocabStrm, maxSize); + create(vocabPath, std::vector<std::string>({trainPath}), maxSize); } void Vocab::createFake() { diff --git a/src/data/vocab.h b/src/data/vocab.h index 1551f746..4bad1795 100755 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -26,15 +26,18 @@ public: : options_(options), batchIndex_(batchIndex) {} int loadOrCreate(const std::string& vocabPath, - const std::string& textPath, - int max = 0); + const std::vector<std::string>& trainPaths, + size_t maxSize = 0); - int load(const std::string& vocabPath, int max = 0); - void create(const std::string& vocabPath, const std::string& trainPath); + int load(const std::string& vocabPath, size_t maxSize = 0); - void create(io::InputFileStream& trainStrm, - io::OutputFileStream& vocabStrm, - size_t maxSize = 0); + void create(const std::string& vocabPath, + const std::vector<std::string>& trainPaths, + size_t maxSize); + + void create(const std::string& vocabPath, + const std::string& trainPath, + size_t maxSize); // string token to token id Word operator[](const std::string& word) const; diff --git a/src/data/vocab_base.h b/src/data/vocab_base.h index 23e1520c..d3078d9a 100644 --- a/src/data/vocab_base.h +++ b/src/data/vocab_base.h @@ -1,19 +1,19 @@ #pragma once +#include "data/types.h" #include "common/definitions.h" +#include "common/utils.h" #include "common/file_stream.h" -#include "data/types.h" namespace marian { class VocabBase { public: virtual int load(const std::string& vocabPath, int max = 0) = 0; - virtual void create(const std::string& vocabPath, const std::string& trainPath) = 0; - virtual void create(io::InputFileStream& trainStrm, - io::OutputFileStream& vocabStrm, - size_t maxSize = 0) = 0; + virtual void create(const std::string& vocabPath, + const std::vector<std::string>& trainPaths, + size_t maxSize) = 0; // return canonical suffix for given type of 
vocabulary virtual const std::string& canonicalExtension() const = 0; diff --git a/src/examples/mnist/model_lenet.h b/src/examples/mnist/model_lenet.h index ac0298e3..c2a39977 100644 --- a/src/examples/mnist/model_lenet.h +++ b/src/examples/mnist/model_lenet.h @@ -12,12 +12,12 @@ public: MnistLeNet(Ptr<Options> options, Args... args) : MnistFeedForwardNet(options, args...) {} - virtual void clear(Ptr<ExpressionGraph> graph) { graph->clear(); }; + virtual void clear(Ptr<ExpressionGraph> graph) override { graph->clear(); }; protected: virtual Expr construct(Ptr<ExpressionGraph> g, Ptr<data::Batch> batch, - bool inference = false) { + bool inference = false) override { const std::vector<int> dims = {784, 128, 10}; // Start with an empty expression graph diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 01546c1e..7da85443 100755 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -490,20 +490,20 @@ struct RowsNodeOp : public NaryNodeOp { // This operation indexes a tensor along an axis. // This is similar to the common gather() operation in other toolkits. // For example, this can be used for: -// - Same index applied to all batch items (today's select()):
-// 'index' has 1 in the axes that match batch axes in the input, and axis set to the one axis that gets selected over.
-// Example: Selecting Transformer head 0, i.e. return a[:,1,:,:]
-// axis = -3
-// a : (B, H , S, T) B=batch dim, H=#heads, S=src length, T=trg length
-// idx: ( #1#, 1, 1) #1# denotes 'axis'. All values are zero.
-// out: (B, 1 , S, T) out[b, 0, s, t] == a[b, idx[/*0,*/ 0, s, t], s, t]
-// - Same data with batched indices (today's rows()):
-// 'data' has 1 in the batch axes.
-// Example: Embedding lookup as done today using rows():
-// axis = -2
-// e : ( V , E) V=vocab size, E=embedding dimension
-// idx: (#(B*S)#, 1) B=batch size, S=source length, idx values are in range 0..V-1
-// out: ( (B*S) , E) out[b, s, e] == e[/*0,*/ idx[b, s, 0], e]
+// - Same index applied to all batch items (today's select()): +// 'index' has 1 in the axes that match batch axes in the input, and axis set to the one axis that gets selected over. +// Example: Selecting Transformer head 0, i.e. return a[:,1,:,:] +// axis = -3 +// a : (B, H , S, T) B=batch dim, H=#heads, S=src length, T=trg length +// idx: ( #1#, 1, 1) #1# denotes 'axis'. All values are zero. +// out: (B, 1 , S, T) out[b, 0, s, t] == a[b, idx[/*0,*/ 0, s, t], s, t] +// - Same data with batched indices (today's rows()): +// 'data' has 1 in the batch axes. +// Example: Embedding lookup as done today using rows(): +// axis = -2 +// e : ( V , E) V=vocab size, E=embedding dimension +// idx: (#(B*S)#, 1) B=batch size, S=source length, idx values are in range 0..V-1 +// out: ( (B*S) , E) out[b, s, e] == e[/*0,*/ idx[b, s, 0], e] // - Batched selection (x-ent scenario): Both 'index' and 'data' have matching batch axes. // Example: Cross-entropy loss as -select(logSoftmax(logits), groundTruth, axis=-1): // axis = -1 @@ -511,14 +511,14 @@ struct RowsNodeOp : public NaryNodeOp { // idx: (B, T, #1#) idx values are in range 0..V-1 // out: (B, T, 1 ) out[b,t,0] == lp[b, t, idx[b, t, 0]] // Example for 2D tensor with axis=0: -// | t[index[0, 0] 0] t[index[0, 1] 1] |
-// | t[index[1, 0] 0] t[index[1, 1] 1] |
-// And for axis 1:
-// | t[0 index[0, 0]] t[0 index[0, 1]] |
-// | t[1 index[1, 0]] t[1 index[1, 1]] |
-// For a 3-D tensor the output is specified by:
-// out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0
-// out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1
+// | t[index[0, 0] 0] t[index[0, 1] 1] | +// | t[index[1, 0] 0] t[index[1, 1] 1] | +// And for axis 1: +// | t[0 index[0, 0]] t[0 index[0, 1]] | +// | t[1 index[1, 0]] t[1 index[1, 1]] | +// For a 3-D tensor the output is specified by: +// out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 +// out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 // out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 // If 'a' and 'indices' do not have the same rank, then negative 'axis' is // interpreted relative to 'a', and 'indices' must have the resulting axis. @@ -953,6 +953,7 @@ struct HighwayNodeOp : public NaryNodeOp { }; #ifdef CUDNN + class ConvolutionOp : public NaryNodeOp { public: ConvolutionOp(const std::vector<Expr>& nodes, @@ -970,12 +971,12 @@ public: conv_.getOutputShape(nodes[0]->shape(), shape_); } - NodeOps forwardOps() { + NodeOps forwardOps() override { return {NodeOp(conv_.forward( child(0)->val(), child(1)->val(), child(2)->val(), val_))}; } - NodeOps backwardOps() { + NodeOps backwardOps() override { return {NodeOp(conv_.backward(child(0)->val(), child(0)->grad(), child(1)->val(), @@ -984,7 +985,7 @@ public: adj_))}; } - const std::string type() { return "layer_convolution"; } + const std::string type() override { return "layer_convolution"; } protected: ConvolutionWrapper conv_; diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 9a752786..b8b19208 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -881,16 +881,16 @@ public: strideWidth, mode) {} - NodeOps forwardOps() { + NodeOps forwardOps() override { return {NodeOp(pooling_.forward(child(0)->val(), val_))}; } - NodeOps backwardOps() { + NodeOps backwardOps() override { return {NodeOp( pooling_.backward(child(0)->val(), child(0)->grad(), val_, adj_))}; } - const std::string type() { return "layer_pooling"; } + const std::string type() override { return "layer_pooling"; } protected: PoolingWrapper pooling_; diff --git 
a/src/layers/loss.cpp b/src/layers/loss.cpp index 87a2a1fd..03b79682 100755 --- a/src/layers/loss.cpp +++ b/src/layers/loss.cpp @@ -15,6 +15,8 @@ Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference) { return New<PerplexityLoss>(smoothing); } else if(costType == "ce-rescore") { return New<CrossEntropyRescoreLoss>(smoothing); + } else if(costType == "ce-rescore-mean") { + return New<CrossEntropyRescoreMeanLoss>(smoothing); } else { // same as ce-mean return New<CrossEntropyMeanLoss>(smoothing); } @@ -108,4 +110,14 @@ Expr CrossEntropyRescoreLoss::getCost(Expr logits, auto ce = getCrossEntropy(logits, indices, mask, weights); return -sum(ce, /*axis =*/ -3); } + +Expr CrossEntropyRescoreMeanLoss::getCost(Expr logits, + Expr indices, + Expr mask, + Expr weights) { + auto ce = getCrossEntropy(logits, indices, mask, weights); + // divide by number of words in sentence + return -sum(ce, /*axis =*/ -3) / sum(mask, /*axis =*/ -3); +} + } // namespace marian diff --git a/src/layers/loss.h b/src/layers/loss.h index 89d20819..ebf71147 100644 --- a/src/layers/loss.h +++ b/src/layers/loss.h @@ -66,5 +66,11 @@ public: Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights) override; }; +class CrossEntropyRescoreMeanLoss : public LossBase { +public: + explicit CrossEntropyRescoreMeanLoss(float smoothing = 0) : LossBase(smoothing){}; + Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights) override; +}; + Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference); } // namespace marian diff --git a/src/layers/word2vec_reader.h b/src/layers/word2vec_reader.h index f18fd439..a7e85592 100755 --- a/src/layers/word2vec_reader.h +++ b/src/layers/word2vec_reader.h @@ -18,8 +18,6 @@ public: LOG(info, "[data] Loading embedding vectors from {}", fileName); io::InputFileStream embFile(fileName); - ABORT_IF(!embFile.isOpen(), - "Unable to open file with embeddings: " + fileName); std::string line; std::vector<std::string> values; @@ -75,19 +73,19 @@ private: 
values.reserve(dimEmb); // Glorot numal distribution float scale = sqrtf(2.0f / (dimVoc + dimEmb)); - + // @TODO: switch to new random generator back-end. - // This is rarly used however. + // This is rarly used however. std::random_device rd; std::mt19937 engine(rd()); - + std::normal_distribution<float> d(0, scale); auto gen = [&d, &engine] () { return d(engine); }; std::generate(values.begin(), values.end(), gen); - + return values; } }; diff --git a/src/models/char_s2s.h b/src/models/char_s2s.h index c4dce6f5..6d5d1db1 100644 --- a/src/models/char_s2s.h +++ b/src/models/char_s2s.h @@ -12,7 +12,7 @@ public: CharS2SEncoder(Ptr<Options> options) : EncoderS2S(options) {} virtual Ptr<EncoderState> build(Ptr<ExpressionGraph> graph, - Ptr<data::CorpusBatch> batch) { + Ptr<data::CorpusBatch> batch) override { auto embeddings = buildSourceEmbeddings(graph); // select embeddings that occur in the batch diff --git a/src/models/hardatt.h b/src/models/hardatt.h deleted file mode 100755 index 77ba7f44..00000000 --- a/src/models/hardatt.h +++ /dev/null @@ -1,303 +0,0 @@ -#pragma once - -#include "marian.h" - -#include "layers/generic.h" -#include "rnn/attention_constructors.h" -#include "rnn/types.h" - -#include <numeric> - -namespace marian { - -class DecoderStateHardAtt : public DecoderState { -protected: - std::vector<IndexType> attentionIndices_; - -public: - DecoderStateHardAtt(const rnn::States& states, - Expr logProbs, - const std::vector<Ptr<EncoderState>>& encStates, - Ptr<data::CorpusBatch> batch) - : DecoderState(states, logProbs, encStates, batch) {} - - virtual Ptr<DecoderState> select(const std::vector<IndexType>& selIdx, - int beamSize) const override { - std::vector<IndexType> selectedAttentionIndices; - for(auto i : selIdx) - selectedAttentionIndices.push_back(attentionIndices_[i]); - - auto selectedState = New<DecoderStateHardAtt>(states_.select(selIdx, beamSize, /*isBatchMajor=*/false), - logProbs_, - encStates_, - batch_); - 
selectedState->attentionIndices_ = selectedAttentionIndices; - - // Set positon of new state based on the target token position of current - // state - // @TODO: I copied this to make this consistent with the other instances. Needed? - selectedState->setPosition(getPosition()); - return selectedState; - } - - // @TODO: why are these virtual? - virtual void setAttentionIndices( - const std::vector<IndexType>& attentionIndices) { - attentionIndices_ = attentionIndices; - } - - virtual std::vector<IndexType>& getAttentionIndices() { - ABORT_IF(attentionIndices_.empty(), "Empty attention indices"); - return attentionIndices_; - } - - virtual void blacklist(Expr totalCosts, Ptr<data::CorpusBatch> batch) override { - auto attentionIdx = getAttentionIndices(); - int dimVoc = totalCosts->shape()[-1]; - for(size_t i = 0; i < attentionIdx.size(); i++) { - if(batch->front()->data()[attentionIdx[i]] != 0) { - totalCosts->val()->set( - i * dimVoc + DEFAULT_EOS_ID, // this is checked at vocab-load time - // if the special tokens are present - std::numeric_limits<float>::lowest()); - } else { - totalCosts->val()->set(i * dimVoc + STP_ID, - std::numeric_limits<float>::lowest()); - } - } - } -}; - -class DecoderHardAtt : public DecoderBase { -protected: - Ptr<rnn::RNN> rnn_; - std::unordered_set<Word> specialSymbols_; - -public: - DecoderHardAtt(Ptr<Options> options) : DecoderBase(options) { - if(options->has("special-vocab")) { - auto spec = options->get<std::vector<Word>>("special-vocab"); - specialSymbols_.insert(spec.begin(), spec.end()); - } - } - - virtual Ptr<DecoderState> startState( - Ptr<ExpressionGraph> graph, - Ptr<data::CorpusBatch> batch, - std::vector<Ptr<EncoderState>>& encStates) override { - - std::vector<Expr> meanContexts; - for(auto& encState : encStates) { - // average the source context weighted by the batch mask - // this will remove padded zeros from the average - meanContexts.push_back(weighted_average( - encState->getContext(), encState->getMask(), /*axis 
=*/ -3)); - } - - Expr start; - if(!meanContexts.empty()) { - // apply single layer network to mean to map into decoder space - auto mlp = mlp::mlp(graph) // - .push_back(mlp::dense(graph) // - ("prefix", prefix_ + "_ff_state") // - ("dim", opt<int>("dim-rnn")) // - ("activation", (int)mlp::act::tanh) // - ("layer-normalization", - opt<bool>("layer-normalization"))); - start = mlp->apply(meanContexts); - } - - rnn::States startStates(opt<size_t>("dec-depth"), {start, start}); - auto startState = New<DecoderStateHardAtt>(startStates, nullptr, encStates, batch); - startState->setAttentionIndices(std::vector<IndexType>({ 0 })); - return startState; - } - - virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph, - Ptr<DecoderState> state) override { - - auto type = options_->get<std::string>("type"); - - int dimTrgVoc = options_->get<std::vector<int>>("dim-vocabs").back(); - - int dimTrgEmb = options_->get<int>("dim-emb"); - - int dimDecState = options_->get<int>("dim-rnn"); - bool layerNorm = options_->get<bool>("layer-normalization"); - bool skipDepth = options_->get<bool>("skip"); - - size_t decoderLayers = options_->get<size_t>("dec-depth"); - auto cellType = options_->get<std::string>("dec-cell"); - - float dropoutRnn = inference_ ? 0 : options_->get<float>("dropout-rnn"); - float dropoutTrg = inference_ ? 
0 : options_->get<float>("dropout-trg"); - - auto stateHardAtt = std::dynamic_pointer_cast<DecoderStateHardAtt>(state); - - auto trgEmbeddings = stateHardAtt->getTargetEmbeddings(); - - auto context = stateHardAtt->getEncoderStates()[0]->getContext(); - int dimContext = context->shape()[-1]; - int dimSrcWords = context->shape()[-3]; - - int dimBatch = context->shape()[-2]; - int dimTrgWords = trgEmbeddings->shape()[-3]; - int dimBeam = trgEmbeddings->shape()[-4]; - - if(dropoutTrg) { - trgEmbeddings - = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1}); - } - - auto flatContext = reshape(context, {dimBatch * dimSrcWords, dimContext}); - auto attendedContext - = rows(flatContext, stateHardAtt->getAttentionIndices()); - attendedContext = reshape(attendedContext, - {dimBeam, dimTrgWords, dimBatch, dimContext}); - - auto rnnInputs = concatenate({trgEmbeddings, attendedContext}, /*axis =*/ -1); - int dimInput = rnnInputs->shape()[-1]; - - if(!rnn_) { - auto rnn = rnn::rnn(graph) // - ("type", cellType) // - ("dimInput", dimInput) // - ("dimState", dimDecState) // - ("dropout", dropoutRnn) // - ("layer-normalization", layerNorm) // - ("skip", skipDepth); - - if(type == "hard-soft-att") { - auto attCell = rnn::stacked_cell(graph) // - .push_back(rnn::cell(graph) // - ("prefix", prefix_ + "_cell1")); - for(size_t i = 0; i < state->getEncoderStates().size(); ++i) { - std::string prefix = prefix_; - if(state->getEncoderStates().size() > 1) - prefix += "_att" + std::to_string(i + 1); - - attCell.push_back(rnn::attention(graph) // - ("prefix", prefix) // - .set_state(state->getEncoderStates()[i])); - } - - attCell.push_back(rnn::cell(graph) // - ("prefix", prefix_ + "_cell2") // - ("final", true)); - rnn.push_back(attCell); - } else { - rnn.push_back(rnn::cell(graph)("prefix", prefix_)); - } - - for(size_t i = 0; i < decoderLayers - 1; ++i) - rnn.push_back(rnn::cell(graph) // - ("prefix", prefix_ + "_l" + std::to_string(i))); - - rnn_ = rnn.construct(); - } - - 
auto decContext = rnn_->transduce(rnnInputs, stateHardAtt->getStates()); - rnn::States decStates = rnn_->lastCellStates(); - - //// 2-layer feedforward network for outputs and cost - auto out = mlp::mlp(graph) - .push_back(mlp::dense(graph) // - ("prefix", prefix_ + "_ff_logit_l1") // - ("dim", dimTrgEmb) // - ("activation", (int)mlp::act::tanh) // - ("layer-normalization", layerNorm)) // - .push_back(mlp::dense(graph) // - ("prefix", prefix_ + "_ff_logit_l2") // - ("dim", dimTrgVoc)); - - Expr logits; - if(type == "hard-soft-att") { - std::vector<Expr> alignedContexts; - for(int k = 0; k < state->getEncoderStates().size(); ++k) { - // retrieve all the aligned contexts computed by the attention mechanism - auto att = rnn_->at(0) - ->as<rnn::StackedCell>() - ->at(k + 1) - ->as<rnn::Attention>(); - alignedContexts.push_back(att->getContext()); - } - - Expr alignedContext; - if(alignedContexts.size() > 1) - alignedContext = concatenate(alignedContexts, /*axis =*/ -1); - else if(alignedContexts.size() == 1) - alignedContext = alignedContexts[0]; - - logits = out->apply(rnnInputs, decContext, alignedContext); - } else { - logits = out->apply(rnnInputs, decContext); - } - - auto nextState = New<DecoderStateHardAtt>(decStates, - logits, - stateHardAtt->getEncoderStates(), - stateHardAtt->getBatch()); - nextState->setAttentionIndices(std::vector<IndexType>(stateHardAtt->getAttentionIndices())); - nextState->setPosition(state->getPosition() + 1); // @TODO: I added this for consistency. Correct? 
- return nextState; - } - - const std::vector<Expr> getAlignments() { - auto att = rnn_->at(0)->as<rnn::StackedCell>()->at(1)->as<rnn::Attention>(); - return att->getAlignments(); - } - - void embeddingsFromBatch(Ptr<ExpressionGraph> graph, - Ptr<DecoderState> state, - Ptr<data::CorpusBatch> batch) override { - DecoderBase::embeddingsFromBatch(graph, state, batch); - - auto subBatch = (*batch)[batchIndex_]; - int dimBatch = (int)subBatch->batchSize(); - int dimWords = (int)subBatch->batchWidth(); - - std::vector<IndexType> attentionIndices(dimBatch, 0); - std::vector<IndexType> currentPos(dimBatch, 0); - std::iota(currentPos.begin(), currentPos.end(), 0); - - for(int i = 0; i < dimWords - 1; ++i) { - for(int j = 0; j < dimBatch; ++j) { - Word word = subBatch->data()[i * dimBatch + j]; - if(specialSymbols_.count(word)) - currentPos[j] += dimBatch; - attentionIndices.push_back(currentPos[j]); - } - } - - std::dynamic_pointer_cast<DecoderStateHardAtt>(state)->setAttentionIndices( - attentionIndices); - } - - virtual void embeddingsFromPrediction(Ptr<ExpressionGraph> graph, - Ptr<DecoderState> state, - const std::vector<IndexType>& embIdx, - int dimBatch, - int beamSize) override { - DecoderBase::embeddingsFromPrediction( - graph, state, embIdx, dimBatch, beamSize); - - auto stateHardAtt = std::dynamic_pointer_cast<DecoderStateHardAtt>(state); - - size_t dimSrcWords - = state->getEncoderStates()[0]->getContext()->shape()[-3]; - - if(embIdx.empty()) { - stateHardAtt->setAttentionIndices({0}); - } else { - for(size_t i = 0; i < embIdx.size(); ++i) - if(specialSymbols_.count(embIdx[i])) { - stateHardAtt->getAttentionIndices()[i]++; - if(stateHardAtt->getAttentionIndices()[i] >= dimSrcWords) - stateHardAtt->getAttentionIndices()[i] = (IndexType)dimSrcWords - 1; - } - } - } - - void clear() override { rnn_ = nullptr; } -}; -} // namespace marian diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 65629176..d42f07c8 100644 --- 
a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -6,7 +6,6 @@ #include "models/costs.h" #include "models/amun.h" -#include "models/hardatt.h" #include "models/nematus.h" #include "models/s2s.h" #include "models/transformer_factory.h" @@ -47,11 +46,6 @@ Ptr<DecoderBase> DecoderFactory::construct() { if(options_->get<std::string>("type") == "transformer") // return New<DecoderTransformer>(options_); return NewDecoderTransformer(options_); - if(options_->get<std::string>("type") == "hard-att") - return New<DecoderHardAtt>(options_); - if(options_->get<std::string>("type") == "hard-soft-att") - return New<DecoderHardAtt>(options_); - ABORT("Unknown decoder type"); } @@ -120,24 +114,6 @@ Ptr<ModelBase> by_type(std::string type, usage use, Ptr<Options> options) { .construct(); } - if(type == "hard-att") { - return models::encoder_decoder()(options) - ("usage", use) - ("original-type", type) - .push_back(models::encoder()("type", "s2s")) - .push_back(models::decoder()("type", "hard-att")) - .construct(); - } - - if(type == "hard-soft-att") { - return models::encoder_decoder()(options) - ("usage", use) - ("original-type", type) - .push_back(models::encoder()("type", "s2s")) - .push_back(models::decoder()("type", "hard-soft-att")) - .construct(); - } - if(type == "multi-s2s") { size_t numEncoders = 2; auto ms2sFactory = models::encoder_decoder()(options) @@ -172,25 +148,6 @@ Ptr<ModelBase> by_type(std::string type, usage use, Ptr<Options> options) { return ms2sFactory.construct(); } - if(type == "multi-hard-att") { - size_t numEncoders = 2; - auto ms2sFactory = models::encoder_decoder()(options) - ("usage", use) - ("type", "s2s") - ("original-type", type); - - for(size_t i = 0; i < numEncoders; ++i) { - auto prefix = "encoder" + std::to_string(i + 1); - ms2sFactory.push_back(models::encoder()("prefix", prefix)("index", i)); - } - - ms2sFactory.push_back(models::decoder() - ("index", numEncoders) - ("type", "hard-soft-att")); - - return 
ms2sFactory.construct(); - } - if(type == "multi-transformer") { size_t numEncoders = 2; auto mtransFactory = models::encoder_decoder()(options) diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index bf57a2b6..fa456856 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -52,8 +52,12 @@ public: ABORT_IF(options_->has("summary") && options_->has("alignment"), "Alignments can not be produced with summarized score"); + ABORT_IF(options_->has("summary") && options_->get<bool>("normalize"), + "Normalization by length cannot be used with summary scores"); + options_->set("inference", true); - options_->set("cost-type", "ce-rescore"); + // @TODO: make normalize here a float and pass into loss to compute the same way as in decoding + options_->set("cost-type", options_->get<bool>("normalize") ? "ce-rescore-mean" : "ce-rescore"); if(options_->get<bool>("n-best")) corpus_ = New<CorpusNBest>(options_); @@ -97,6 +101,8 @@ public: std::string alignment = options_->get<std::string>("alignment", ""); bool summarize = options_->has("summary"); + bool normalize = options_->get<bool>("normalize"); + std::string summary = summarize ? options_->get<std::string>("summary") : "cross-entropy"; float sumCost = 0; @@ -118,7 +124,11 @@ public: builder = models_[id % graphs_.size()]; } + // @TODO: normalize by length as in normalize + // Once we have Frank's concept of ce-sum with sample size by words we will return a pair + // here which will make it trivial to report all variants. 
auto costNode = builder->build(graph, batch); + graph->forward(); std::vector<float> scores; @@ -141,13 +151,29 @@ public: output->Write((long)batch->getSentenceIds()[i], scores[i], aligns[i]); } } + + // progress heartbeat for MS-internal Philly compute cluster + // otherwise this job may be killed prematurely if no log for 4 hrs + if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster + && id % 1000 == 0) // hard beat once every 1000 batches + { + auto progress = id / 10000.f; //fake progress for now, becomes >100 after 1M batches + fprintf(stdout, "PROGRESS: %.2f%%\n", progress); + fflush(stdout); + } }; - pool.enqueue(task, batchId % graphs_.size()); - batchId++; + pool.enqueue(task, batchId++); } } + if(normalize) { + LOG(info, "Total normalized log probs {} : Total sentences {} : Total words {}", sumCost, sumSamples, sumWords); + LOG(warn, "Sum of normalized log probs is a sum of averages"); + } else { + LOG(info, "Total log probs {} : Total sentences {} : Total words {}", sumCost, sumSamples, sumWords); + } + if(summarize) { float cost = 0; if(summary == "perplexity") diff --git a/src/rescorer/score_collector.cpp b/src/rescorer/score_collector.cpp index 65f43c70..ac118a6a 100644 --- a/src/rescorer/score_collector.cpp +++ b/src/rescorer/score_collector.cpp @@ -9,9 +9,14 @@ namespace marian { ScoreCollector::ScoreCollector(const Ptr<Options>& options) : nextId_(0), - outStrm_(new io::OutputFileStream(std::cout)), alignment_(options->get<std::string>("alignment", "")), - alignmentThreshold_(getAlignmentThreshold(alignment_)) {} + alignmentThreshold_(getAlignmentThreshold(alignment_)) { + + if(options->get<std::string>("output") == "stdout") + outStrm_.reset(new io::OutputFileStream(std::cout)); + else + outStrm_.reset(new io::OutputFileStream(options->get<std::string>("output"))); + } void ScoreCollector::Write(long id, const std::string& message) { std::lock_guard<std::mutex> lock(mutex_); diff --git 
a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 0baeeb96..69923f87 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -99,7 +99,7 @@ void Prod(marian::Tensor C, } void ProdBatched(marian::Tensor C, - Ptr<Allocator> allocator, + Ptr<Allocator> /*allocator*/, const marian::Tensor A, const marian::Tensor B, bool transA, @@ -150,7 +150,7 @@ void ProdBatched(marian::Tensor C, (int)ldc); } #else - C; allocator; A; B; transA; transB; beta; scalar; + C; A; B; transA; transB; beta; scalar; ABORT("You need to compile with MKL in order to use the CPU version"); #endif } diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 8fc31d70..dee62496 100755 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -246,8 +246,7 @@ public: } else { if(options_->get<bool>("lr-report")) { LOG(info, - "Ep. {} : Up. {} : Sen. {} : Cost {:.2f} : Time {:2f}s : {:.2f} words/s : L.r. " - "{:.4e}", + "Ep. {} : Up. {} : Sen. {} : Cost {:.8f} : Time {:.2f}s : {:.2f} words/s : L.r. {:.4e}", state_->epochs, state_->batches, utils::withCommas(state_->samplesEpoch), @@ -257,7 +256,7 @@ public: state_->eta); } else { LOG(info, - "Ep. {} : Up. {} : Sen. {} : Cost {:.2f} : Time {:.2f}s : {:.2f} words/s", + "Ep. {} : Up. {} : Sen. 
{} : Cost {:.8f} : Time {:.2f}s : {:.2f} words/s", state_->epochs, state_->batches, utils::withCommas(state_->samplesEpoch), diff --git a/src/translator/output_collector.cpp b/src/translator/output_collector.cpp index c7114a56..58fba69b 100755 --- a/src/translator/output_collector.cpp +++ b/src/translator/output_collector.cpp @@ -6,6 +6,10 @@ namespace marian { +OutputCollector::OutputCollector() + : nextId_(0), + printing_(new DefaultPrinting()) {} + OutputCollector::OutputCollector(std::string outFile) : nextId_(0), outStrm_(new io::OutputFileStream(std::cout)), diff --git a/src/translator/output_collector.h b/src/translator/output_collector.h index 154e8ded..51b47159 100755 --- a/src/translator/output_collector.h +++ b/src/translator/output_collector.h @@ -45,10 +45,13 @@ private: class OutputCollector { public: - OutputCollector(std::string outFile = "stdout"); + OutputCollector(); + OutputCollector(std::string outFile); template <class T> - OutputCollector(T&& arg) : nextId_(0), outStrm_(new io::OutputFileStream(arg)) {} + OutputCollector(T&& arg) + : nextId_(0), + outStrm_(new io::OutputFileStream(arg)) {} OutputCollector(const OutputCollector&) = delete; diff --git a/src/translator/translator.h b/src/translator/translator.h index cc2cbea2..9f973113 100755 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -111,19 +111,21 @@ public: bestn.str(), options_->get<bool>("n-best")); } + + + // progress heartbeat for MS-internal Philly compute cluster + // otherwise this job may be killed prematurely if no log for 4 hrs + if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster + && id % 1000 == 0) // hard beat once every 1000 batches + { + auto progress = 0.f; //fake progress for now + fprintf(stdout, "PROGRESS: %.2f%%\n", progress); + fflush(stdout); + } }; threadPool.enqueue(task, batchId++); - // progress heartbeat for MS-internal Philly compute cluster - //otherwise this job may be killed prematurely if no 
log for 4 hrs - if (getenv("PHILLY_JOB_ID")) // this environment variable exists when running on the cluster - { - auto progress = 0.f; //fake progress for now - fprintf(stdout, "PROGRESS: %.2f%%\n", progress); - fflush(stdout); - } - } } }; |