From bd2e3bb7bd06bcbb2134e4853a72ab28f5f332b9 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Thu, 18 Nov 2021 00:41:04 +0100
Subject: Fix T93045: Cycles HIP not rendering OpenVDB volumes

Build HIP kernels with NanoVDB, and patch NanoVDB to work with HIP.
NanoVDB is a header-only library, so no rebuild is needed.

The changes are being submitted upstream to OpenVDB, so this patch
should be temporary.

Thanks to Thomas for help with testing.
---
 build_files/build_environment/cmake/nanovdb.cmake  |   1 +
 build_files/build_environment/patches/nanovdb.diff | 374 +++++++++++++++++++++
 intern/cycles/kernel/CMakeLists.txt                |   6 +
 3 files changed, 381 insertions(+)
 create mode 100644 build_files/build_environment/patches/nanovdb.diff

diff --git a/build_files/build_environment/cmake/nanovdb.cmake b/build_files/build_environment/cmake/nanovdb.cmake
index 0baaf80c254..66bbb9f10d7 100644
--- a/build_files/build_environment/cmake/nanovdb.cmake
+++ b/build_files/build_environment/cmake/nanovdb.cmake
@@ -42,6 +42,7 @@ ExternalProject_Add(nanovdb
   URL_HASH ${NANOVDB_HASH_TYPE}=${NANOVDB_HASH}
   PREFIX ${BUILD_DIR}/nanovdb
   SOURCE_SUBDIR nanovdb
+  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/nanovdb/src/nanovdb < ${PATCH_DIR}/nanovdb.diff
   CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/nanovdb ${DEFAULT_CMAKE_FLAGS} ${NANOVDB_EXTRA_ARGS}
   INSTALL_DIR ${LIBDIR}/nanovdb
 )
diff --git a/build_files/build_environment/patches/nanovdb.diff b/build_files/build_environment/patches/nanovdb.diff
new file mode 100644
index 00000000000..fd833e61336
--- /dev/null
+++ b/build_files/build_environment/patches/nanovdb.diff
@@ -0,0 +1,374 @@
+Index: nanovdb/nanovdb/NanoVDB.h
+===================================================================
+--- a/nanovdb/nanovdb/NanoVDB.h	(revision 62751)
++++ b/nanovdb/nanovdb/NanoVDB.h	(working copy)
+@@ -152,8 +152,8 @@
+
+ #endif // __CUDACC_RTC__
+
+-#ifdef __CUDACC__
+-// Only define __hostdev__ when using NVIDIA CUDA compiler
++#if defined(__CUDACC__) || defined(__HIP__)
++// Only define __hostdev__ when using NVIDIA CUDA or HIP compiler
+ #define __hostdev__ __host__ __device__
+ #else
+ #define __hostdev__
+@@ -461,7 +461,7 @@
+ /// Maximum floating-point values
+ template
+ struct Maximum;
+-#ifdef __CUDA_ARCH__
++#if defined(__CUDA_ARCH__) || defined(__HIP__)
+ template<>
+ struct Maximum
+ {
+@@ -1006,10 +1006,10 @@
+ using Vec3i = Vec3;
+
+ /// @brief Return a single precision floating-point vector of this coordinate
+-Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); }
++inline __hostdev__ Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); }
+
+ /// @brief Return a double precision floating-point vector of this coordinate
+-Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); }
++inline __hostdev__ Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); }
+
+ // ----------------------------> Vec4 <--------------------------------------
+
+@@ -1820,7 +1820,7 @@
+ }; // Map
+
+ template
+-void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper)
++__hostdev__ void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper)
+ {
+     float * mf = mMatF, *vf = mVecF;
+     float* mif = mInvMatF;
+@@ -2170,7 +2170,7 @@
+ }; // Class Grid
+
+ template
+-int Grid::findBlindDataForSemantic(GridBlindDataSemantic semantic) const
++__hostdev__ int Grid::findBlindDataForSemantic(GridBlindDataSemantic semantic) const
+ {
+     for (uint32_t i = 0, n = blindDataCount(); i < n; ++i)
+         if (blindMetaData(i).mSemantic == semantic)
+@@ -2328,7 +2328,7 @@
+ }; // Tree class
+
+ template
+-void Tree::extrema(ValueType& min, ValueType& max) const
++__hostdev__ void Tree::extrema(ValueType& min, ValueType& max) const
+ {
+     min = this->root().valueMin();
+     max = this->root().valueMax();
+@@ -2336,7 +2336,7 @@
+
+ template
+ template
+-const NodeT* Tree::getNode(uint32_t i) const
++__hostdev__ const NodeT* Tree::getNode(uint32_t i) const
+ {
+     static_assert(is_same, NodeT>::value, "Tree::getNode: unvalid node type");
+     NANOVDB_ASSERT(i < DataType::mCount[NodeT::LEVEL]);
+@@ -2345,7 +2345,7 @@
+
+ template
+ template
+-const typename TreeNode, LEVEL>::type* Tree::getNode(uint32_t i) const
++__hostdev__ const typename TreeNode, LEVEL>::type* Tree::getNode(uint32_t i) const
+ {
+     NANOVDB_ASSERT(i < DataType::mCount[LEVEL]);
+     return reinterpret_cast*>(reinterpret_cast(this) + DataType::mBytes[LEVEL]) + i;
+@@ -2353,7 +2353,7 @@
+
+ template
+ template
+-NodeT* Tree::getNode(uint32_t i)
++__hostdev__ NodeT* Tree::getNode(uint32_t i)
+ {
+     static_assert(is_same, NodeT>::value, "Tree::getNode: invalid node type");
+     NANOVDB_ASSERT(i < DataType::mCount[NodeT::LEVEL]);
+@@ -2362,7 +2362,7 @@
+
+ template
+ template
+-typename TreeNode, LEVEL>::type* Tree::getNode(uint32_t i)
++__hostdev__ typename TreeNode, LEVEL>::type* Tree::getNode(uint32_t i)
+ {
+     NANOVDB_ASSERT(i < DataType::mCount[LEVEL]);
+     return reinterpret_cast*>(reinterpret_cast(this) + DataType::mBytes[LEVEL]) + i;
+@@ -2370,7 +2370,7 @@
+
+ template
+ template
+-uint32_t Tree::getNodeID(const NodeT& node) const
++__hostdev__ uint32_t Tree::getNodeID(const NodeT& node) const
+ {
+     static_assert(is_same, NodeT>::value, "Tree::getNodeID: invalid node type");
+     const NodeT* first = reinterpret_cast(reinterpret_cast(this) + DataType::mBytes[NodeT::LEVEL]);
+@@ -2380,7 +2380,7 @@
+
+ template
+ template
+-uint32_t Tree::getLinearOffset(const NodeT& node) const
++__hostdev__ uint32_t Tree::getLinearOffset(const NodeT& node) const
+ {
+     return this->getNodeID(node) + DataType::mPFSum[NodeT::LEVEL];
+ }
+@@ -3366,7 +3366,7 @@
+ }; // LeafNode class
+
+ template class MaskT, uint32_t LOG2DIM>
+-inline void LeafNode::updateBBox()
++inline __hostdev__ void LeafNode::updateBBox()
+ {
+     static_assert(LOG2DIM == 3, "LeafNode::updateBBox: only supports LOGDIM = 3!");
+     if (!this->isActive()) return;
+Index: nanovdb/nanovdb/util/SampleFromVoxels.h
+===================================================================
+--- a/nanovdb/nanovdb/util/SampleFromVoxels.h	(revision 62751)
++++ b/nanovdb/nanovdb/util/SampleFromVoxels.h	(working copy)
+@@ -22,7 +22,7 @@
+ #define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED
+
+ // Only define __hostdev__ when compiling as NVIDIA CUDA
+-#ifdef __CUDACC__
++#if defined(__CUDACC__) || defined(__HIP__)
+ #define __hostdev__ __host__ __device__
+ #else
+ #include // for floor
+@@ -136,7 +136,7 @@
+
+ template
+ template
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const
+ {
+     const CoordT ijk = Round(xyz);
+     if (ijk != mPos) {
+@@ -147,7 +147,7 @@
+ }
+
+ template
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT& ijk) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT& ijk) const
+ {
+     if (ijk != mPos) {
+         mPos = ijk;
+@@ -158,7 +158,7 @@
+
+ template
+ template
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const
+ {
+     return mAcc.getValue(Round(xyz));
+ }
+@@ -195,7 +195,7 @@
+ }; // TrilinearSamplerBase
+
+ template
+-void TrilinearSampler::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const
++__hostdev__ void TrilinearSampler::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const
+ {
+     v[0][0][0] = mAcc.getValue(ijk); // i, j, k
+
+@@ -224,7 +224,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType TrilinearSampler::sample(const Vec3T &uvw, const ValueT (&v)[2][2][2])
++__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler::sample(const Vec3T &uvw, const ValueT (&v)[2][2][2])
+ {
+ #if 0
+     auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a
+@@ -239,7 +239,7 @@
+
+ template
+ template class Vec3T>
+-Vec3T TrilinearSampler::gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2])
++__hostdev__ Vec3T TrilinearSampler::gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2])
+ {
+     static_assert(std::is_floating_point::value, "TrilinearSampler::gradient requires a floating-point type");
+ #if 0
+@@ -270,7 +270,7 @@
+ }
+
+ template
+-bool TrilinearSampler::zeroCrossing(const ValueT (&v)[2][2][2])
++__hostdev__ bool TrilinearSampler::zeroCrossing(const ValueT (&v)[2][2][2])
+ {
+     static_assert(std::is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type");
+     const bool less = v[0][0][0] < ValueT(0);
+@@ -363,7 +363,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::sample(xyz, mVal);
+@@ -370,7 +370,7 @@
+ }
+
+ template
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const
+ {
+     return ijk == mPos ? mVal[0][0][0] : BaseT::mAcc.getValue(ijk);
+ }
+@@ -377,7 +377,7 @@
+
+ template
+ template class Vec3T>
+-Vec3T SampleFromVoxels::gradient(Vec3T xyz) const
++__hostdev__ Vec3T SampleFromVoxels::gradient(Vec3T xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::gradient(xyz, mVal);
+@@ -393,7 +393,7 @@
+
+ template
+ template class Vec3T>
+-void SampleFromVoxels::cache(Vec3T& xyz) const
++__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const
+ {
+     CoordT ijk = Floor(xyz);
+     if (ijk != mPos) {
+@@ -406,7 +406,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
+ {
+     ValueT val[2][2][2];
+     CoordT ijk = Floor(xyz);
+@@ -418,7 +418,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
+ {
+     auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); };
+@@ -463,7 +463,7 @@
+
+ template
+ template class Vec3T>
+-inline Vec3T SampleFromVoxels::gradient(Vec3T xyz) const
++inline __hostdev__ Vec3T SampleFromVoxels::gradient(Vec3T xyz) const
+ {
+     ValueT val[2][2][2];
+     CoordT ijk = Floor(xyz);
+@@ -473,7 +473,7 @@
+
+ template
+ template class Vec3T>
+-bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const
++__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const
+ {
+     ValueT val[2][2][2];
+     CoordT ijk = Floor(xyz);
+@@ -510,7 +510,7 @@
+ }; // TriquadraticSamplerBase
+
+ template
+-void TriquadraticSampler::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
++__hostdev__ void TriquadraticSampler::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
+ {
+     CoordT p(ijk[0] - 1, 0, 0);
+     for (int dx = 0; dx < 3; ++dx, ++p[0]) {
+@@ -526,7 +526,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType TriquadraticSampler::sample(const Vec3T &uvw, const ValueT (&v)[3][3][3])
++__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler::sample(const Vec3T &uvw, const ValueT (&v)[3][3][3])
+ {
+     auto kernel = [](const ValueT* value, double weight)->ValueT {
+         return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) +
+@@ -545,7 +545,7 @@
+ }
+
+ template
+-bool TriquadraticSampler::zeroCrossing(const ValueT (&v)[3][3][3])
++__hostdev__ bool TriquadraticSampler::zeroCrossing(const ValueT (&v)[3][3][3])
+ {
+     static_assert(std::is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type");
+     const bool less = v[0][0][0] < ValueT(0);
+@@ -624,7 +624,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::sample(xyz, mVal);
+@@ -631,7 +631,7 @@
+ }
+
+ template
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const
+ {
+     return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk);
+ }
+@@ -646,7 +646,7 @@
+
+ template
+ template class Vec3T>
+-void SampleFromVoxels::cache(Vec3T& xyz) const
++__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const
+ {
+     CoordT ijk = Floor(xyz);
+     if (ijk != mPos) {
+@@ -657,7 +657,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
+ {
+     ValueT val[3][3][3];
+     CoordT ijk = Floor(xyz);
+@@ -667,7 +667,7 @@
+
+ template
+ template class Vec3T>
+-bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const
++__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const
+ {
+     ValueT val[3][3][3];
+     CoordT ijk = Floor(xyz);
+@@ -710,7 +710,7 @@
+ }; // TricubicSampler
+
+ template
+-void TricubicSampler::stencil(const CoordT& ijk, ValueT (&C)[64]) const
++__hostdev__ void TricubicSampler::stencil(const CoordT& ijk, ValueT (&C)[64]) const
+ {
+     auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; };
+
+@@ -929,7 +929,7 @@
+
+ template
+ template class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
++__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::sample(xyz, mC);
+@@ -937,7 +937,7 @@
+
+ template
+ template class Vec3T>
+-void SampleFromVoxels::cache(Vec3T& xyz) const
++__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const
+ {
+     CoordT ijk = Floor(xyz);
+     if (ijk != mPos) {
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 29ff69df864..1a254f5eddc 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -565,6 +565,12 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
       set(name ${name}_experimental)
     endif()
 
+    if(WITH_NANOVDB)
+      set(hip_flags ${hip_flags}
+        -D WITH_NANOVDB
+        -I "${NANOVDB_INCLUDE_DIR}")
+    endif()
+
     if(WITH_CYCLES_DEBUG)
       set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
     endif()
-- 
cgit v1.2.3
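The NanoVDB patch above is mechanical: it extends the __hostdev__ macro to also cover HIP, and adds that qualifier (plus `inline` for definitions that live outside a class body in the header) to every function the GPU kernels may call. A minimal sketch of the idiom in isolation follows; the `lerp` function is a hypothetical stand-in, not part of NanoVDB or of this patch:

    // One macro covers plain C++, NVIDIA CUDA (nvcc defines __CUDACC__)
    // and AMD HIP (the HIP compiler defines __HIP__).
    #if defined(__CUDACC__) || defined(__HIP__)
    #define __hostdev__ __host__ __device__
    #else
    #define __hostdev__
    #endif

    // Out-of-class definitions in a header also need `inline` to avoid
    // duplicate-symbol errors across translation units, which is why the
    // patch turns definitions like `Vec3f Coord::asVec3s() const` into
    // `inline __hostdev__ Vec3f Coord::asVec3s() const`.
    inline __hostdev__ float lerp(float a, float b, float w)
    {
      return a + w * (b - a);
    }

On a host-only compiler the macro expands to nothing and the same header compiles unchanged, which is what makes a temporary header-only patch like this sufficient: no library rebuild is involved.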
From 3ad2bf1327cac5f036d763e1cc690b1b2da8e1c4 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin
Date: Thu, 18 Nov 2021 11:25:39 +0100
Subject: Cycles: Fix command line render overshooting time limit

The calculation based on preserving device occupancy conflicted with
the fact that the time limit requires rendering fewer samples in the
last round of render work. For example, rendering BMW27 for 30 seconds
on an i9-11900k was actually rendering for almost a minute.

Now the render time limit is respected much more closely.

Differential Revision: https://developer.blender.org/D13269
---
 intern/cycles/integrator/render_scheduler.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index f776d01ef67..276453f7aec 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -827,6 +827,26 @@ int RenderScheduler::get_num_samples_to_path_trace() const
     num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
   }
 
+  /* When time limit is used clamp the calculated number of samples to keep occupancy.
+   * This is because time limit causes the last render iteration to happen with less number of
+   * samples, which conflicts with the occupancy (lower number of samples causes lower
+   * occupancy, also the calculation is based on number of previously rendered samples).
+   *
+   * When time limit is not used the number of samples per render iteration is either increasing
+   * or stays the same, so there is no need to clamp number of samples calculated for occupancy.
+   */
+  if (time_limit_ && state_.start_render_time) {
+    const double remaining_render_time = max(
+        0.0, time_limit_ - (time_dt() - state_.start_render_time));
+    const double time_per_sample_average = path_trace_time_.get_average();
+    const double predicted_render_time = num_samples_to_occupy * time_per_sample_average;
+
+    if (predicted_render_time > remaining_render_time) {
+      num_samples_to_occupy = lround(num_samples_to_occupy *
+                                     (remaining_render_time / predicted_render_time));
+    }
+  }
+
   num_samples_to_render = max(num_samples_to_render,
                               min(num_samples_to_occupy, max_num_samples_to_render));
 }
-- 
cgit v1.2.3
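The clamp added above is small enough to illustrate with concrete numbers. Below is a sketch with a hypothetical free function standing in for the scheduler state; it is not Cycles API, it just mirrors the arithmetic of the hunk:

    #include <cmath>

    // Scale the occupancy-driven sample count down so that the predicted
    // render time fits into what remains of the time limit.
    int clamp_samples_to_time_limit(int num_samples_to_occupy,
                                    double remaining_render_time,
                                    double time_per_sample_average)
    {
      const double predicted_render_time = num_samples_to_occupy * time_per_sample_average;
      if (predicted_render_time > remaining_render_time) {
        num_samples_to_occupy = static_cast<int>(std::lround(
            num_samples_to_occupy * (remaining_render_time / predicted_render_time)));
      }
      return num_samples_to_occupy;
    }

For example, if occupancy asks for 32 samples, the average cost is 0.25 s per sample, and 2 s of the limit remain, the predicted 8 s overshoots, so the scheduler requests lround(32 * 2 / 8) = 8 samples instead. Without the clamp, that last round alone would run four times longer than the remaining budget, which is the overshoot the commit message describes.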
From f71813204c405821bb2efb8e4ad65d240d390eaf Mon Sep 17 00:00:00 2001
From: Sergey Sharybin
Date: Fri, 12 Nov 2021 16:12:05 +0100
Subject: Cycles: Don't tile if image area fits into tile area

Previously the check was based on the image dimensions: if either
dimension exceeded the tile size, tiling was used. This change makes it
so that if the image does not exceed the number of pixels in a tile, no
tiling is used.

This allows rendering widescreen images without tiling.

Differential Revision: https://developer.blender.org/D13206
---
 intern/cycles/session/session.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp
index 65c46d0dd3c..530baa8cafb 100644
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -367,14 +367,26 @@ void Session::draw()
 
 int2 Session::get_effective_tile_size() const
 {
+  const int image_width = buffer_params_.width;
+  const int image_height = buffer_params_.height;
+
   /* No support yet for baking with tiles. */
   if (!params.use_auto_tile || scene->bake_manager->get_baking()) {
-    return make_int2(buffer_params_.width, buffer_params_.height);
+    return make_int2(image_width, image_height);
   }
 
-  /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile
-   * and prefer optimal performance. */
+  const int64_t image_area = static_cast<int64_t>(image_width) * image_height;
+
+  /* TODO(sergey): Take available memory into account, and if there is enough memory do not
+   * tile and prefer optimal performance. */
+
   const int tile_size = tile_manager_.compute_render_tile_size(params.tile_size);
+  const int64_t actual_tile_area = static_cast<int64_t>(tile_size) * tile_size;
+
+  if (actual_tile_area >= image_area) {
+    return make_int2(image_width, image_height);
+  }
+
   return make_int2(tile_size, tile_size);
 }
-- 
cgit v1.2.3
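A worked example of the new area test, assuming a 2048-pixel tile size (the actual value comes from tile_manager_.compute_render_tile_size(), and the frame size here is purely illustrative):

    #include <cstdint>

    // A 3840x1080 widescreen frame is wider than a 2048x2048 tile, but it
    // holds fewer pixels, so the area-based check renders it in one pass.
    const std::int64_t image_area = std::int64_t{3840} * 1080;  // 4,147,200 pixels
    const std::int64_t tile_area = std::int64_t{2048} * 2048;   // 4,194,304 pixels
    const bool needs_tiling = tile_area < image_area;           // false

Under the previous dimension-based check, 3840 > 2048 would have forced tiling for this frame even though the whole image fits within a single tile's pixel budget.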