/* A handle to a source of mapped memory
(C) 2017-2020 Niall Douglas (10 commits)
File Created: Apr 2017


Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License in the accompanying file
Licence.txt or at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


Distributed under the Boost Software License, Version 1.0.
(See accompanying file Licence.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
*/

#include "../../../map_handle.hpp"
#include "../../../utils.hpp"

#include "quickcpplib/signal_guard.hpp"

#include <sys/mman.h>  // mmap(), munmap(), msync(), madvise()

//#define LLFIO_DEBUG_LINUX_MUNMAP
#ifdef LLFIO_DEBUG_LINUX_MUNMAP
#include <fcntl.h>  // ::open() and its O_* flags
static struct llfio_linux_munmap_debug_t
{
  int smaps_fd, dumpfile_fd;
  llfio_linux_munmap_debug_t()
  {
    smaps_fd = ::open("/proc/self/smaps", O_RDONLY);
    dumpfile_fd = ::open("/tmp/llfio_unmap_debug_smaps.txt", O_WRONLY | O_CREAT | O_APPEND, 0x1b0 /*660*/);
    if(-1 == smaps_fd || -1 == dumpfile_fd)
    {
      puts("llfio_linux_munmap_debug: Failed to open one of the files\n");
      abort();
    }
  }
} llfio_linux_munmap_debug;
#endif

LLFIO_V2_NAMESPACE_BEGIN

section_handle::~section_handle()
{
  if(_v)
  {
    auto ret = section_handle::close();
    if(ret.has_error())
    {
      LLFIO_LOG_FATAL(_v.h, "section_handle::~section_handle() close failed");
      abort();
    }
  }
}

result<void> section_handle::close() noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  if(_v)
  {
    // We don't want ~handle() to close our handle borrowed from the backing file or _anonymous
    _v = native_handle_type();
    OUTCOME_TRYV(handle::close());
    OUTCOME_TRYV(_anonymous.close());
    _flag = flag::none;
  }
  return success();
}

result<section_handle> section_handle::section(file_handle &backing, extent_type /* unused */, flag _flag) noexcept
{
  result<section_handle> ret(section_handle(native_handle_type(), &backing, file_handle(), _flag));
  native_handle_type &nativeh = ret.value()._v;
  nativeh.fd = backing.native_handle().fd;
  if(_flag & flag::read)
  {
    nativeh.behaviour |= native_handle_type::disposition::readable;
  }
  if(_flag & flag::write)
  {
    nativeh.behaviour |= native_handle_type::disposition::writable;
  }
  nativeh.behaviour |= native_handle_type::disposition::section;
  LLFIO_LOG_FUNCTION_CALL(&ret);
  return ret;
}

result<section_handle> section_handle::section(extent_type bytes, const path_handle &dirh, flag _flag) noexcept
{
  OUTCOME_TRY(auto &&_anonh, file_handle::temp_inode(dirh));
  OUTCOME_TRYV(_anonh.truncate(bytes));
  result<section_handle> ret(section_handle(native_handle_type(), nullptr, std::move(_anonh), _flag));
  native_handle_type &nativeh = ret.value()._v;
  file_handle &anonh = ret.value()._anonymous;
  nativeh.fd = anonh.native_handle().fd;
  if(_flag & flag::read)
  {
    nativeh.behaviour |= native_handle_type::disposition::readable;
  }
  if(_flag & flag::write)
  {
    nativeh.behaviour |= native_handle_type::disposition::writable;
  }
  nativeh.behaviour |= native_handle_type::disposition::section;
  LLFIO_LOG_FUNCTION_CALL(&ret);
  return ret;
}

result<section_handle::extent_type> section_handle::length() const noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  struct stat s
  {
  };
  memset(&s, 0, sizeof(s));
  if(-1 == ::fstat(_v.fd, &s))
  {
    return posix_error();
  }
  return s.st_size;
}

result<section_handle::extent_type> section_handle::truncate(extent_type newsize) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  if((_backing == nullptr) && newsize > 0)
  {
    if(-1 == ::ftruncate(_anonymous.native_handle().fd, newsize))
    {
      return posix_error();
    }
  }
  return newsize;
}
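
/* Illustrative usage sketch (a comment only, not part of the implementation): how the
   two section() constructors above are typically driven. This assumes LLFIO's public
   API as declared in map_handle.hpp and path_discovery.hpp; error handling via
   .value() is abbreviated for clarity.

     // A section over an existing file
     file_handle fh = file_handle::file({}, "data.bin", file_handle::mode::write).value();
     section_handle sh = section_handle::section(fh, 0, section_handle::flag::readwrite).value();

     // A 1MiB anonymous section backed by a temporary inode
     section_handle anon =
       section_handle::section(1024 * 1024, path_discovery::storage_backed_temporary_files_directory(),
                               section_handle::flag::readwrite).value();
*/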
/******************************************* map_handle *********************************************/

map_handle::~map_handle()
{
  if(_addr != nullptr)
  {
    // Unmap the view
    auto ret = map_handle::close();
    if(ret.has_error())
    {
      LLFIO_LOG_FATAL(_v.fd,
                      "map_handle::~map_handle() close failed. Cause is typically other code modifying mapped regions. If on Linux, you may have exceeded the "
                      "64k VMA process limit, set the LLFIO_DEBUG_LINUX_MUNMAP macro at the top of posix/map_handle.ipp to cause dumping of VMAs to "
                      "/tmp/llfio_unmap_debug_smaps.txt, and combine with strace to figure it out.");
      abort();
    }
  }
}

result<void> map_handle::close() noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  if(_addr != nullptr)
  {
    if(is_writable() && (_flag & section_handle::flag::barrier_on_close))
    {
      OUTCOME_TRYV(map_handle::barrier(barrier_kind::wait_all));
    }
    // printf("%d munmap %p-%p\n", getpid(), _addr, _addr+_reservation);
    if(-1 == ::munmap(_addr, _reservation))
    {
#ifdef LLFIO_DEBUG_LINUX_MUNMAP
      int olderrno = errno;
      ssize_t bytesread;
      // Refresh the /proc file
      (void) ::lseek(llfio_linux_munmap_debug.smaps_fd, 0, SEEK_END);
      (void) ::lseek(llfio_linux_munmap_debug.smaps_fd, 0, SEEK_SET);
      char buffer[4096];
      (void) ::write(llfio_linux_munmap_debug.dumpfile_fd, buffer,
                     sprintf(buffer, "\n---\nCause of munmap failure by process %d: %d (%s)\n\n", getpid(), olderrno, strerror(olderrno)));
      do
      {
        bytesread = ::read(llfio_linux_munmap_debug.smaps_fd, buffer, sizeof(buffer));
        if(bytesread > 0)
        {
          (void) ::write(llfio_linux_munmap_debug.dumpfile_fd, buffer, bytesread);
        }
      } while(bytesread > 0);
      errno = olderrno;
#endif
      return posix_error();
    }
  }
  // We don't want ~handle() to close our borrowed handle
  _v = native_handle_type();
  _addr = nullptr;
  _length = 0;
  return success();
}

native_handle_type map_handle::release() noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  // We don't want ~handle() to close our borrowed handle
  _v = native_handle_type();
  _addr = nullptr;
  _length = 0;
  return {};
}

map_handle::io_result<map_handle::const_buffers_type> map_handle::_do_barrier(map_handle::io_request<map_handle::const_buffers_type> reqs, barrier_kind kind,
                                                                              deadline d) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  byte *addr = _addr + reqs.offset;
  size_type bytes = 0;
  // Check for overflow
  for(const auto &req : reqs.buffers)
  {
    if(bytes + req.size() < bytes)
    {
      return errc::value_too_large;
    }
    bytes += req.size();
  }
  // If empty, do the whole file
  if(reqs.buffers.empty())
  {
    bytes = _length;
  }
  // If nvram and not syncing metadata, use lightweight barrier
  if(kind <= barrier_kind::wait_data_only && is_nvram())
  {
    auto synced = nvram_barrier({addr, bytes});
    if(synced.size() >= bytes)
    {
      return {reqs.buffers};
    }
  }
  // Odd-valued barrier kinds wait for completion, so those get a synchronous flush
  int flags = ((uint8_t) kind & 1) ? MS_SYNC : MS_ASYNC;
  if(-1 == ::msync(addr, bytes, flags))
  {
    return posix_error();
  }
  // Don't fsync temporary inodes
  if(_section != nullptr && (_section->backing() != nullptr) && kind >= barrier_kind::nowait_all)
  {
    reqs.offset += _offset;
    return _section->backing()->barrier(reqs, kind, d);
  }
  return {reqs.buffers};
}
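
/* do_mmap() below is the single point where section_handle::flag values get translated
   into POSIX mmap() prot/flags. In summary (descriptive comment, derived from the code
   that follows):

     flag::none      -> PROT_NONE (address space reservation only), plus MAP_GUARD
                        where available if nocommit is also set
     flag::cow       -> PROT_READ | PROT_WRITE with MAP_PRIVATE (copy on write)
     flag::write     -> PROT_READ | PROT_WRITE with MAP_SHARED, unless
                        write_via_syscall downgrades the view to PROT_READ
     flag::read      -> PROT_READ
     flag::execute   -> adds PROT_EXEC
     flag::nocommit  -> MAP_NORESERVE where available
     flag::prefault  -> MAP_POPULATE (Linux) / MAP_PREFAULT_READ (FreeBSD)
*/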
static inline result<void *> do_mmap(native_handle_type &nativeh, void *ataddr, int extra_flags, section_handle *section, map_handle::size_type pagesize,
                                     map_handle::size_type &bytes, map_handle::extent_type offset, section_handle::flag _flag) noexcept
{
  bool have_backing = (section != nullptr);
  int prot = 0, flags = have_backing ? MAP_SHARED : (MAP_PRIVATE | MAP_ANONYMOUS);
  void *addr = nullptr;
  if(_flag == section_handle::flag::none)
  {
    prot |= PROT_NONE;
#ifdef MAP_GUARD
    if(_flag & section_handle::flag::nocommit)
    {
      flags |= MAP_GUARD;
    }
#endif
  }
  else if(_flag & section_handle::flag::cow)
  {
    prot |= PROT_READ | PROT_WRITE;
    flags &= ~MAP_SHARED;
    flags |= MAP_PRIVATE;
    nativeh.behaviour |= native_handle_type::disposition::seekable | native_handle_type::disposition::readable | native_handle_type::disposition::writable;
  }
  else if(_flag & section_handle::flag::write)
  {
    prot = (_flag & section_handle::flag::write_via_syscall) ? PROT_READ : (PROT_READ | PROT_WRITE);
    nativeh.behaviour |= native_handle_type::disposition::seekable | native_handle_type::disposition::readable | native_handle_type::disposition::writable;
  }
  else if(_flag & section_handle::flag::read)
  {
    prot |= PROT_READ;
    nativeh.behaviour |= native_handle_type::disposition::seekable | native_handle_type::disposition::readable;
  }
  if(_flag & section_handle::flag::execute)
  {
    prot |= PROT_EXEC;
  }
#ifdef MAP_NORESERVE
  if(_flag & section_handle::flag::nocommit)
  {
    flags |= MAP_NORESERVE;
  }
#endif
#ifdef MAP_POPULATE
  if(_flag & section_handle::flag::prefault)
  {
    flags |= MAP_POPULATE;
  }
#endif
#ifdef MAP_PREFAULT_READ
  if(_flag & section_handle::flag::prefault)
  {
    flags |= MAP_PREFAULT_READ;
  }
#endif
#ifdef MAP_NOSYNC
  if(have_backing && section->backing() != nullptr && (section->backing()->kernel_caching() == handle::caching::temporary))
  {
    flags |= MAP_NOSYNC;
  }
#endif
  flags |= extra_flags;
  int fd_to_use = have_backing ? section->native_handle().fd : -1;
  if(pagesize != utils::page_size())
  {
    static const auto &pagesizes = utils::page_sizes();  // can't throw, as guaranteed called before now
#ifdef __linux__
    flags |= MAP_HUGETLB;  // gets me pagesizes[1]
    if(pagesize > pagesizes[1])
    {
#ifdef MAP_HUGE_SHIFT
      // Ask for the page size requested. MAP_HUGE_* place log2(pagesize) in these bits,
      // and topbitset is log2(pagesize) + 1 for power-of-two page sizes, so subtract one.
      size_t topbitset = (__CHAR_BIT__ * sizeof(unsigned long)) - __builtin_clzl((unsigned long) pagesize);
      flags |= (topbitset - 1) << MAP_HUGE_SHIFT;
#else
      return errc::invalid_argument;
#endif
    }
#elif defined(__FreeBSD__)
    size_t topbitset = (__CHAR_BIT__ * sizeof(unsigned long)) - __builtin_clzl((unsigned long) pagesize);
    flags |= MAP_ALIGNED(topbitset);
#elif defined(__APPLE__)
    size_t topbitset = (__CHAR_BIT__ * sizeof(unsigned long)) - __builtin_clzl((unsigned long) pagesize);
    if(have_backing || topbitset < 21)
    {
      (void) pagesizes;
      return errc::invalid_argument;
    }
    fd_to_use = ((topbitset - 20) << VM_FLAGS_SUPERPAGE_SHIFT);
#else
#error Do not know how to specify large/huge/super pages on this platform
#endif
  }
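
  /* Worked example of the large page encoding above (descriptive comment): page sizes
     are powers of two, so the bit scan yields topbitset == log2(pagesize) + 1, e.g. 22
     for a 2MiB page. Linux's MAP_HUGE_* constants store log2(pagesize) in the bits at
     MAP_HUGE_SHIFT (MAP_HUGE_2MB == 21 << MAP_HUGE_SHIFT, MAP_HUGE_1GB == 30 << MAP_HUGE_SHIFT),
     hence the subtraction of one above; Mac OS's VM_FLAGS_SUPERPAGE_SIZE_2MB encoding
     (2 << VM_FLAGS_SUPERPAGE_SHIFT) consumes topbitset - 20 == 2 directly. */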
  // printf("mmap(%p, %u, %d, %d, %d, %u)\n", ataddr, (unsigned) bytes, prot, flags, have_backing ? section->native_handle().fd : -1, (unsigned) offset);
#ifdef MAP_SYNC
  // Linux kernel 4.15 or later only
  // If backed by a file into persistent shared memory, ask the kernel to use persistent memory safe semantics
  if(have_backing && (_flag & section_handle::flag::nvram) && (flags & MAP_SHARED) != 0)
  {
    int flagscopy = flags & ~MAP_SHARED;
    flagscopy |= MAP_SHARED_VALIDATE | MAP_SYNC;
    addr = ::mmap(ataddr, bytes, prot, flagscopy, fd_to_use, offset);
  }
#endif
  // If the MAP_SYNC attempt above failed, addr is MAP_FAILED rather than nullptr, so
  // the error is reported below instead of silently losing persistence semantics
  if(addr == nullptr)
  {
    addr = ::mmap(ataddr, bytes, prot, flags, fd_to_use, offset);
  }
  // printf("%d mmap %p-%p\n", getpid(), addr, (char *) addr+bytes);
  if(MAP_FAILED == addr)  // NOLINT
  {
    return posix_error();
  }
#ifdef MADV_FREE_REUSABLE
  if((prot & PROT_WRITE) != 0 && (_flag & section_handle::flag::nocommit))
  {
    // On Mac OS, you cannot ask for reserved memory, but you can ask
    // for committed memory first and then decommit it. Which can potentially
    // blow out the commit limit for really large reservations,
    // but this is literally the only game in town on Mac OS.
    if(-1 == ::madvise(addr, bytes, MADV_FREE_REUSABLE))
    {
      return posix_error();
    }
  }
#endif
#if 0  // not implemented yet, not seen any benefit over setting this at the fd level
  if(have_backing && ((flags & map_handle::flag::disable_prefetching) || (flags & map_handle::flag::maximum_prefetching)))
  {
    int advice = (flags & map_handle::flag::disable_prefetching) ? MADV_RANDOM : MADV_SEQUENTIAL;
    if(-1 == ::madvise(addr, bytes, advice))
      return posix_error();
  }
#endif
  return addr;
}

result<map_handle> map_handle::map(size_type bytes, bool /*unused*/, section_handle::flag _flag) noexcept
{
  // TODO: Keep a cache of MADV_FREE pages deallocated
  if(bytes == 0u)
  {
    return errc::argument_out_of_domain;
  }
  bytes = utils::round_up_to_page_size(bytes, /*FIXME*/ utils::page_size());
  result<map_handle> ret(map_handle(nullptr, _flag));
  native_handle_type &nativeh = ret.value()._v;
  OUTCOME_TRY(auto &&pagesize, detail::pagesize_from_flags(ret.value()._flag));
  OUTCOME_TRY(auto &&addr, do_mmap(nativeh, nullptr, 0, nullptr, pagesize, bytes, 0, ret.value()._flag));
  ret.value()._addr = static_cast<byte *>(addr);
  ret.value()._reservation = bytes;
  ret.value()._length = bytes;
  ret.value()._pagesize = pagesize;
  nativeh._init = -2;  // otherwise appears closed
  nativeh.behaviour |= native_handle_type::disposition::allocation;
  LLFIO_LOG_FUNCTION_CALL(&ret);
  return ret;
}
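
/* Illustrative usage sketch (a comment only, not part of the implementation): allocating
   anonymous, non-file-backed memory with the overload above. Names follow the public API
   in map_handle.hpp; error handling is abbreviated.

     map_handle mh = map_handle::map(4096).value();
     byte *p = mh.address();   // page aligned, zero initialised on first touch
     memset(p, 42, mh.length());
     // the view is unmapped when mh goes out of scope
*/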
result<map_handle> map_handle::map(section_handle &section, size_type bytes, extent_type offset, section_handle::flag _flag) noexcept
{
  OUTCOME_TRY(auto &&length, section.length());  // length of the backing file
  if(bytes == 0u)
  {
    bytes = length - offset;
  }
  result<map_handle> ret{map_handle(&section, _flag)};
  native_handle_type &nativeh = ret.value()._v;
  OUTCOME_TRY(auto &&pagesize, detail::pagesize_from_flags(ret.value()._flag));
  OUTCOME_TRY(auto &&addr, do_mmap(nativeh, nullptr, 0, &section, pagesize, bytes, offset, ret.value()._flag));
  ret.value()._addr = static_cast<byte *>(addr);
  ret.value()._offset = offset;
  ret.value()._reservation = utils::round_up_to_page_size(bytes, pagesize);
  ret.value()._length = (length - offset < bytes) ? (length - offset) : bytes;  // length of backing, not reservation
  ret.value()._pagesize = pagesize;
  // Make my handle borrow the native handle of my backing storage
  ret.value()._v.fd = section.native_handle().fd;
  nativeh.behaviour |= native_handle_type::disposition::allocation;
  LLFIO_LOG_FUNCTION_CALL(&ret);
  return ret;
}

// Change the address reservation for this map
result<map_handle::size_type> map_handle::truncate(size_type newsize, bool permit_relocation) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  extent_type length = _length;
  if(_section != nullptr)
  {
    OUTCOME_TRY(length, _section->length());  // length of the backing file
  }
  auto _newsize = utils::round_up_to_page_size(newsize, _pagesize);
  if(_newsize == _reservation)
  {
    return success();
  }
  if(_section == nullptr)
  {
    length = _newsize;  // newsize, but rounded up
  }
  // If wiping the map ...
  if(newsize == 0)
  {
    if(-1 == ::munmap(_addr, _reservation))
    {
      return posix_error();
    }
    _addr = nullptr;
    _reservation = 0;
    _length = 0;
    return 0;
  }
  // If not mapped yet ...
  if(_addr == nullptr)
  {
    OUTCOME_TRY(auto &&addr, do_mmap(_v, nullptr, 0, _section, _pagesize, newsize, _offset, _flag));
    _addr = static_cast<byte *>(addr);
    _reservation = _newsize;
    _length = (length - _offset < newsize) ? (length - _offset) : newsize;  // length of backing, not reservation
    return newsize;
  }
#ifdef __linux__
  // Dead easy on Linux
  void *newaddr = ::mremap(_addr, _reservation, newsize, permit_relocation ? MREMAP_MAYMOVE : 0);
  if(MAP_FAILED == newaddr)
  {
    return posix_error();
  }
  _addr = static_cast<byte *>(newaddr);
  _reservation = _newsize;
  _length = (length - _offset < newsize) ? (length - _offset) : newsize;  // length of backing, not reservation
  return newsize;
#else
  (void) permit_relocation;
  // Try to expand the reservation in place
  if(newsize > _reservation)
  {
#if defined(MAP_EXCL)  // BSD type systems
    byte *addrafter = _addr + _reservation;
    size_type bytes = newsize - _reservation;
    extent_type offset = _offset + _reservation;
    OUTCOME_TRY(auto &&addr, do_mmap(_v, addrafter, MAP_FIXED | MAP_EXCL, _section, _pagesize, bytes, offset, _flag));
    (void) addr;  // MAP_FIXED | MAP_EXCL guarantees addr == addrafter on success
    _reservation = _newsize;
    _length = (length - _offset < newsize) ? (length - _offset) : newsize;  // length of backing, not reservation
    return newsize;
#else  // generic POSIX, inefficient
    byte *addrafter = _addr + _reservation;
    size_type bytes = newsize - _reservation;
    extent_type offset = _offset + _reservation;
    OUTCOME_TRY(auto &&addr, do_mmap(_v, addrafter, 0, _section, _pagesize, bytes, offset, _flag));
    if(addr != addrafter)
    {
      ::munmap(addr, bytes);
      return errc::not_enough_memory;
    }
    _reservation = _newsize;
    _length = (length - _offset < newsize) ? (length - _offset) : newsize;  // length of backing, not reservation
    return newsize;
#endif
  }
  // Shrink the map
  if(-1 == ::munmap(_addr + newsize, _reservation - newsize))
  {
    return posix_error();
  }
  _reservation = newsize;
  _length = (length - _offset < newsize) ? (length - _offset) : newsize;
  return newsize;
#endif
}
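
/* Illustrative usage sketch (a comment only, not part of the implementation): commit()
   and decommit() below are typically paired with a large reservation made with
   flag::nocommit, e.g.

     map_handle mh = map_handle::map(1ULL << 30, false,
                                     section_handle::flag::readwrite | section_handle::flag::nocommit)
                       .value();
     mh.commit({mh.address(), 65536}).value();    // commit and use the first 64KiB only
     // ... use the committed pages ...
     mh.decommit({mh.address(), 65536}).value();  // return them to the nocommit state
*/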
result<map_handle::buffer_type> map_handle::commit(buffer_type region, section_handle::flag flag) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  if(region.data() == nullptr)
  {
    return errc::invalid_argument;
  }
  // Set permissions on the pages
  region = utils::round_to_page_size_larger(region, _pagesize);
  extent_type offset = _offset + (region.data() - _addr);
  size_type bytes = region.size();
  OUTCOME_TRYV(do_mmap(_v, region.data(), MAP_FIXED, _section, _pagesize, bytes, offset, flag));
  // Tell the kernel we will be using these pages soon
  if(-1 == ::madvise(region.data(), region.size(), MADV_WILLNEED))
  {
    return posix_error();
  }
  return region;
}

result<map_handle::buffer_type> map_handle::decommit(buffer_type region) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  if(region.data() == nullptr)
  {
    return errc::invalid_argument;
  }
  region = utils::round_to_page_size_larger(region, _pagesize);
  // If decommitting a mapped file, tell the kernel to kick these pages back to storage
  if(_section != nullptr && -1 == ::madvise(region.data(), region.size(), MADV_DONTNEED))
  {
    return posix_error();
  }
  // Remap these pages with ones having no access, and remove from commit charge
  extent_type offset = _offset + (region.data() - _addr);
  size_type bytes = region.size();
  OUTCOME_TRYV(do_mmap(_v, region.data(), MAP_FIXED, _section, _pagesize, bytes, offset, section_handle::flag::none | section_handle::flag::nocommit));
  return region;
}

result<void> map_handle::zero_memory(buffer_type region) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  if(region.data() == nullptr)
  {
    return errc::invalid_argument;
  }
#ifdef MADV_FREE_REUSABLE
  /* Mac OS will reduce both the commit charge and MADV_FREE the pages
  if we use this flag.
  */
  // Zero contents
  memset(region.data(), 0, region.size());
  buffer_type page_region{utils::round_up_to_page_size(region.data(), _pagesize), utils::round_down_to_page_size(region.size(), _pagesize)};
  if((page_region.size() != 0u) && _section == nullptr)
  {
    (void) ::madvise(page_region.data(), page_region.size(), MADV_FREE_REUSABLE);
  }
  return success();
#endif
#ifdef MADV_REMOVE
  buffer_type page_region{utils::round_up_to_page_size(region.data(), _pagesize), utils::round_down_to_page_size(region.size(), _pagesize)};
  // Zero contents and punch a hole in any backing storage
  if((page_region.size() != 0u) && -1 != ::madvise(page_region.data(), page_region.size(), MADV_REMOVE))
  {
    memset(region.data(), 0, page_region.data() - region.data());
    memset(page_region.data() + page_region.size(), 0, (region.data() + region.size()) - (page_region.data() + page_region.size()));
    return success();
  }
#endif
  //! Only Linux implements syscall zero(), and it's covered by MADV_REMOVE already
  memset(region.data(), 0, region.size());
  return success();
}

result<span<map_handle::buffer_type>> map_handle::prefetch(span<buffer_type> regions) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(0);
  for(auto &region : regions)
  {
    if(-1 == ::madvise(region.data(), region.size(), MADV_WILLNEED))
    {
      return posix_error();
    }
  }
  return regions;
}
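
/* Illustrative usage sketch (a comment only, not part of the implementation): prefetch()
   above is static and takes a span of regions, e.g.

     map_handle::buffer_type regions[] = {{mh.address(), mh.length()}};
     map_handle::prefetch(regions).value();
*/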
result<map_handle::buffer_type> map_handle::do_not_store(buffer_type region) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(0);
  region = utils::round_to_page_size_larger(region, _pagesize);
  if(region.data() == nullptr)
  {
    return errc::invalid_argument;
  }
#ifdef MADV_FREE
  // Lightweight unset of the dirty bit for these pages. Needs FreeBSD or a very recent Linux.
  if(-1 != ::madvise(region.data(), region.size(), MADV_FREE))
  {
    return region;
  }
#endif
#ifdef MADV_REMOVE
  // This is rather heavier weight in that it also punches a hole in any backing storage,
  // but it has worked on Linux for donkey's years
  if(-1 != ::madvise(region.data(), region.size(), MADV_REMOVE))
  {
    return region;
  }
#endif
  // No support on this platform
  region = {region.data(), 0};
  return region;
}

map_handle::io_result<map_handle::buffers_type> map_handle::_do_read(io_request<buffers_type> reqs, deadline /*d*/) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  byte *addr = _addr + reqs.offset;
  size_type togo = reqs.offset < _length ? static_cast<size_type>(_length - reqs.offset) : 0;
  for(size_t i = 0; i < reqs.buffers.size(); i++)
  {
    buffer_type &req = reqs.buffers[i];
    req = {addr, req.size()};
    if(req.size() > togo)
    {
      req = {req.data(), togo};
      reqs.buffers = {reqs.buffers.data(), i + 1};
      break;
    }
    addr += req.size();
    togo -= req.size();
  }
  return reqs.buffers;
}

map_handle::io_result<map_handle::const_buffers_type> map_handle::_do_write(io_request<const_buffers_type> reqs, deadline d) noexcept
{
  LLFIO_LOG_FUNCTION_CALL(this);
  // If configured to bypass the map for writes, issue a syscall write to the backing file instead
  if(!!(_flag & section_handle::flag::write_via_syscall) && _section != nullptr && _section->backing() != nullptr)
  {
    auto r = _section->backing()->write(reqs, d);
    if(!r)
    {
      return std::move(r).error();
    }
    if(reqs.offset + r.bytes_transferred() > _length)
    {
      OUTCOME_TRY(update_map());
    }
    return std::move(r).value();
  }
  byte *addr = _addr + reqs.offset;
  size_type togo = reqs.offset < _length ? static_cast<size_type>(_length - reqs.offset) : 0;
  // Trap SIGBUS etc so a write past the backing storage fails cleanly instead of killing the process
  if(QUICKCPPLIB_NAMESPACE::signal_guard::signal_guard(
     QUICKCPPLIB_NAMESPACE::signal_guard::signalc_set::undefined_memory_access,
     [&] {
       for(size_t i = 0; i < reqs.buffers.size(); i++)
       {
         const_buffer_type &req = reqs.buffers[i];
         if(req.size() > togo)
         {
           assert(req.data() != nullptr);
           memcpy(addr, req.data(), togo);
           req = {addr, togo};
           reqs.buffers = {reqs.buffers.data(), i + 1};
           return false;
         }
         else
         {
           assert(req.data() != nullptr);
           memcpy(addr, req.data(), req.size());
           req = {addr, req.size()};
           addr += req.size();
           togo -= req.size();
         }
       }
       return false;
     },
     [&](const QUICKCPPLIB_NAMESPACE::signal_guard::raised_signal_info *info) {
       auto *causingaddr = (byte *) info->addr;
       if(causingaddr < _addr || causingaddr >= (_addr + _reservation))
       {
         // Not caused by this map, so reraise the signal
         thrd_raise_signal(info->signo, info->raw_info, info->raw_context);
         abort();
       }
       return true;
     }))
  {
    return errc::no_space_on_device;
  }
  return reqs.buffers;
}

LLFIO_V2_NAMESPACE_END