git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h           1
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp   22
-rw-r--r--  intern/cycles/device/device.cpp                  15
-rw-r--r--  intern/cycles/device/device.h                    11
-rw-r--r--  intern/cycles/device/device_cpu.cpp              33
-rw-r--r--  intern/cycles/device/device_multi.cpp            77
-rw-r--r--  intern/cycles/device/device_optix.cpp           463
7 files changed, 355 insertions(+), 267 deletions(-)
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index e5e3e24165d..c3271c3cfcf 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -71,6 +71,7 @@ class CUDADevice : public Device {
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
struct PixelMem {
GLuint cuPBO;
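The header change above adds a mutex guarding cuda_mem_map, since multiple render threads (and, with multi-device rendering, multiple sub-devices) can now allocate and free device memory concurrently. A minimal sketch of the pattern, using standard C++ in place of Cycles' thread_mutex/thread_scoped_lock wrappers (all names here are illustrative):

#include <cstddef>
#include <map>
#include <mutex>

// Illustrative stand-in for CUDAMemMap plus its new mutex: every access
// to the shared map takes the lock for the duration of the operation.
struct MemMap {
  std::map<const void *, size_t> entries;  // plays the role of cuda_mem_map
  std::mutex mutex;                        // plays the role of cuda_mem_map_mutex

  void alloc(const void *mem, size_t size) {
    std::lock_guard<std::mutex> lock(mutex);  // thread_scoped_lock equivalent
    entries[mem] = size;
  }

  void free(const void *mem) {
    std::lock_guard<std::mutex> lock(mutex);
    entries.erase(mem);
  }
};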
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 01adf10f252..6c26b247743 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -718,8 +718,10 @@ void CUDADevice::init_host_memory()
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
- texture_info.copy_to_device();
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
+ texture_info.copy_to_device();
}
}
@@ -988,6 +990,7 @@ void CUDADevice::mem_alloc(device_memory &mem)
assert(!"mem_alloc not supported for global memory.");
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_alloc(mem);
}
}
@@ -1006,10 +1009,10 @@ void CUDADevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
if (!mem.device_pointer) {
generic_alloc(mem);
}
-
generic_copy_to(mem);
}
}
@@ -1048,6 +1051,7 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
@@ -1069,6 +1073,7 @@ void CUDADevice::mem_free(device_memory &mem)
tex_free((device_texture &)mem);
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_free(mem);
}
}
@@ -1092,6 +1097,7 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
void CUDADevice::global_alloc(device_memory &mem)
{
if (mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_alloc(mem);
generic_copy_to(mem);
}
@@ -1102,6 +1108,7 @@ void CUDADevice::global_alloc(device_memory &mem)
void CUDADevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_free(mem);
}
}
@@ -1170,6 +1177,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+
if (!mem.is_resident(this)) {
cmem = &cuda_mem_map[&mem];
cmem->texobject = 0;
@@ -1257,6 +1266,9 @@ void CUDADevice::tex_alloc(device_texture &mem)
cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
}
+ /* Unlock mutex before resizing texture info, since that may attempt to lock it again. */
+ lock.unlock();
+
/* Resize once */
const uint slot = mem.slot;
if (slot >= texture_info.size()) {
@@ -1305,6 +1317,11 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+ /* Lock again and refresh the data pointer (in case another thread modified the map in the
+ * meantime). */
+ lock.lock();
+ cmem = &cuda_mem_map[&mem];
+
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
texture_info[slot].data = (uint64_t)cmem->texobject;
@@ -1318,6 +1335,7 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
const CUDAMem &cmem = cuda_mem_map[&mem];
if (cmem.texobject) {
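Two details in this file are easy to miss. First, load_texture_info() now clears need_texture_info before copying, because copy_to_device() can trigger move_textures_to_host(), which calls load_texture_info() again; clearing the flag first turns that nested call into a no-op. Second, tex_alloc() drops the lock while resizing texture_info and re-fetches its CUDAMem pointer after relocking, since another thread may have changed the map in the meantime. A hedged sketch of the flag pattern (names are illustrative, not the real Cycles types):

// Clear the dirty flag *before* doing the work: if the work itself
// re-enters this function, the nested call sees a clean flag and
// returns instead of recursing indefinitely.
struct TextureInfo {
  bool need_update = false;

  void flush() {
    if (need_update) {
      need_update = false;  // unset first ...
      copy_to_device();     // ... because this may call flush() again
    }
  }

  void copy_to_device() { /* may indirectly call flush() again */ }
};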
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index eb8fb8040e3..1efd628b79b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,6 +17,8 @@
#include <stdlib.h>
#include <string.h>
+#include "bvh/bvh2.h"
+
#include "device/device.h"
#include "device/device_intern.h"
@@ -364,6 +366,19 @@ void Device::draw_pixels(device_memory &rgba,
}
}
+void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_BVH2);
+
+ BVH2 *const bvh2 = static_cast<BVH2 *>(bvh);
+ if (refit) {
+ bvh2->refit(progress);
+ }
+ else {
+ bvh2->build(progress, &stats);
+ }
+}
+
Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
#ifdef WITH_MULTI
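device.cpp now provides the shared BVH2 path as the base implementation of the new virtual Device::build_bvh(), and devices with their own acceleration structures override it, falling back to the base for the BVH2 case. A compilable sketch of that dispatch shape (types are illustrative, not the real Cycles classes):

#include <cassert>

enum BVHLayout { LAYOUT_BVH2, LAYOUT_EMBREE, LAYOUT_OPTIX };

struct BVH {
  BVHLayout layout;
};

struct Device {
  virtual ~Device() = default;

  // Base implementation: the software BVH2 build shared by all devices.
  virtual void build_bvh(BVH *bvh, bool refit) {
    assert(bvh->layout == LAYOUT_BVH2);
    (void)refit;  // build or refit the BVH2 here
  }
};

struct CPUDevice : public Device {
  void build_bvh(BVH *bvh, bool refit) override {
    if (bvh->layout == LAYOUT_EMBREE) {
      // device-specific Embree build/refit
      return;
    }
    Device::build_bvh(bvh, refit);  // shared fallback
  }
};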
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2006db02ce7..e9b7cde7a16 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -373,12 +373,6 @@ class Device {
return NULL;
}
- /* Device specific pointer for BVH creation. Currently only used by Embree. */
- virtual void *bvh_device() const
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
{
@@ -427,10 +421,7 @@ class Device {
const DeviceDrawParams &draw_params);
/* acceleration structure building */
- virtual bool build_optix_bvh(BVH *)
- {
- return false;
- }
+ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
#ifdef WITH_NETWORK
/* networking */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 6912ac1e638..fea4fc53d1f 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -47,6 +47,8 @@
#include "kernel/osl/osl_globals.h"
// clang-format on
+#include "bvh/bvh_embree.h"
+
#include "render/buffers.h"
#include "render/coverage.h"
@@ -188,6 +190,7 @@ class CPUDevice : public Device {
#endif
thread_spin_lock oidn_task_lock;
#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
RTCDevice embree_device;
#endif
@@ -472,6 +475,15 @@ class CPUDevice : public Device {
virtual void const_copy_to(const char *name, void *host, size_t size) override
{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Update scene handle (since it is different for each device on multi devices)
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
kernel_const_copy(&kernel_globals, name, host, size);
}
@@ -537,13 +549,26 @@ class CPUDevice : public Device {
#endif
}
- void *bvh_device() const override
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
#ifdef WITH_EMBREE
- return embree_device;
-#else
- return NULL;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
#endif
+ Device::build_bvh(bvh, progress, refit);
}
void thread_run(DeviceTask &task)
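The const_copy_to() hook above patches the per-device RTCScene handle into KernelData just before upload, because on a multi device every sub-device builds and owns its own Embree scene. A small sketch of that patch-before-upload idea, with hypothetical stand-in types:

#include <cstring>

struct KernelDataStub {
  void *bvh_scene;  // stands in for KernelData::bvh.scene
};

// Before forwarding the constant block to the kernel, rewrite the scene
// handle so each device ends up with the pointer it owns.
void patch_and_upload(const char *name, void *host, void *device_scene) {
  if (std::strcmp(name, "__data") == 0) {
    static_cast<KernelDataStub *>(host)->bvh_scene = device_scene;
  }
  // ... then hand 'host' to the device's constant-copy routine
}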
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 2e72a0b4393..ed6e764b995 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,11 +17,14 @@
#include <sstream>
#include <stdlib.h>
+#include "bvh/bvh_multi.h"
+
#include "device/device.h"
#include "device/device_intern.h"
#include "device/device_network.h"
#include "render/buffers.h"
+#include "render/geometry.h"
#include "util/util_foreach.h"
#include "util/util_list.h"
@@ -164,9 +167,24 @@ class MultiDevice : public Device {
virtual BVHLayoutMask get_bvh_layout_mask() const
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
foreach (const SubDevice &sub_device, devices) {
- bvh_layout_mask &= sub_device.device->get_bvh_layout_mask();
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
}
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
return bvh_layout_mask;
}
@@ -227,21 +245,58 @@ class MultiDevice : public Device {
return result;
}
- bool build_optix_bvh(BVH *bvh)
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
/* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
foreach (SubDevice &sub, devices) {
- if (!sub.device->build_optix_bvh(bvh))
- return false;
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since they are put into the top level directly, see bvh_embree.cpp) */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
}
- return true;
- }
- virtual void *bvh_device() const
- {
- /* CPU devices will always be at the back, so simply choose the last one.
- There should only ever be one CPU device anyway and we need the Embree device for it. */
- return devices.back().device->bvh_device();
+ /* Change geometry BVH pointers back to the multi BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
}
virtual void *osl_memory()
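The layout negotiation in get_bvh_layout_mask() computes both the intersection (layouts every sub-device supports) and the union (layouts at least one sub-device supports): multiple OptiX devices each need a private acceleration structure, and a mixed OptiX/Embree device set falls back to one BVH per device. A sketch of that logic with plain bit masks (values illustrative, not the real BVHLayout enum):

#include <cstdint>
#include <vector>

enum : uint32_t {
  LAYOUT_NONE = 0,
  LAYOUT_BVH2 = 1 << 0,
  LAYOUT_EMBREE = 1 << 1,
  LAYOUT_OPTIX = 1 << 2,
  LAYOUT_MULTI_OPTIX = 1 << 3,
  LAYOUT_MULTI_OPTIX_EMBREE = 1 << 4,
};

uint32_t negotiate_layout(const std::vector<uint32_t> &device_masks) {
  uint32_t common = ~0u;  // layouts supported by every device
  uint32_t any = 0u;      // layouts supported by at least one device
  for (const uint32_t mask : device_masks) {
    common &= mask;
    any |= mask;
  }
  if (common == LAYOUT_OPTIX)
    return LAYOUT_MULTI_OPTIX;  // every OptiX device needs its own AS
  if ((any & (LAYOUT_OPTIX | LAYOUT_EMBREE)) == (LAYOUT_OPTIX | LAYOUT_EMBREE))
    return LAYOUT_MULTI_OPTIX_EMBREE;  // no common layout: one BVH per device
  return common;
}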
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index c6276c1e955..a721f426dfe 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -31,6 +31,7 @@
# include "util/util_logging.h"
# include "util/util_md5.h"
# include "util/util_path.h"
+# include "util/util_progress.h"
# include "util/util_time.h"
# ifdef WITH_CUDA_DYNLOAD
@@ -186,7 +187,6 @@ class OptiXDevice : public CUDADevice {
bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
device_only_memory<KernelParams> launch_params;
- vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
OptixDenoiser denoiser = NULL;
@@ -258,11 +258,6 @@ class OptiXDevice : public CUDADevice {
// Make CUDA context current
const CUDAContextScope scope(cuContext);
- // Free all acceleration structures
- for (CUdeviceptr mem : as_mem) {
- cuMemFree(mem);
- }
-
sbt_data.free();
texture_info.free();
launch_params.free();
@@ -1136,11 +1131,10 @@ class OptiXDevice : public CUDADevice {
}
}
- bool build_optix_bvh(const OptixBuildInput &build_input,
- uint16_t num_motion_steps,
- OptixTraversableHandle &out_handle,
- CUdeviceptr &out_data,
- OptixBuildOperation operation)
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
{
const CUDAContextScope scope(cuContext);
@@ -1166,24 +1160,21 @@ class OptiXDevice : public CUDADevice {
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
// Allocate required output buffers
- device_only_memory<char> temp_mem(this, "temp_build_mem");
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
if (!temp_mem.device_pointer)
return false; // Make sure temporary memory allocation succeeded
- // Move textures to host memory if there is not enough room
- size_t size = 0, free = 0;
- cuMemGetInfo(&free, &size);
- size = sizes.outputSizeInBytes + device_working_headroom;
- if (size >= free && can_map_host) {
- move_textures_to_host(size - free, false);
- }
-
+ device_only_memory<char> &out_data = bvh->as_data;
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer)
+ return false;
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
}
-
- as_mem.push_back(out_data);
// Finally build the acceleration structure
OptixAccelEmitDesc compacted_size_prop;
@@ -1192,6 +1183,7 @@ class OptiXDevice : public CUDADevice {
// Make sure this pointer is 8-byte aligned
compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+ OptixTraversableHandle out_handle = 0;
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
@@ -1199,11 +1191,12 @@ class OptiXDevice : public CUDADevice {
1,
temp_mem.device_pointer,
sizes.tempSizeInBytes,
- out_data,
+ out_data.device_pointer,
sizes.outputSizeInBytes,
&out_handle,
background ? &compacted_size_prop : NULL,
background ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
@@ -1219,81 +1212,60 @@ class OptiXDevice : public CUDADevice {
// There is no point compacting if the size does not change
if (compacted_size < sizes.outputSizeInBytes) {
- CUdeviceptr compacted_data = 0;
- if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+ if (!compacted_data.device_pointer)
// Do not compact if memory allocation for compacted acceleration structure fails
// Can just use the uncompacted one then, so succeed here regardless
return true;
- as_mem.push_back(compacted_data);
- check_result_optix_ret(optixAccelCompact(
- context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+ check_result_optix_ret(optixAccelCompact(context,
+ NULL,
+ out_handle,
+ compacted_data.device_pointer,
+ compacted_size,
+ &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for compaction to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
- // Free uncompacted acceleration structure
- cuMemFree(out_data);
- as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
}
}
return true;
}
- bool build_optix_bvh(BVH *bvh) override
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
- assert(bvh->params.top_level);
-
- unsigned int num_instances = 0;
- unordered_map<Geometry *, OptixTraversableHandle> geometry;
- geometry.reserve(bvh->geometry.size());
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
- // Free all previous acceleration structures which can not be refit
- std::set<CUdeviceptr> refit_mem;
+ progress.set_substatus("Building OptiX acceleration structure");
- for (Geometry *geom : bvh->geometry) {
- if (static_cast<BVHOptiX *>(geom->bvh)->do_refit) {
- refit_mem.insert(static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle);
- }
- }
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
- for (CUdeviceptr mem : as_mem) {
- if (refit_mem.find(mem) == refit_mem.end()) {
- cuMemFree(mem);
- }
- }
-
- as_mem.clear();
-
- // Build bottom level acceleration structures (BLAS)
- // Note: Always keep this logic in sync with bvh_optix.cpp!
- for (Object *ob : bvh->objects) {
- // Skip geometry for which acceleration structure already exists
- Geometry *geom = ob->get_geometry();
- if (geometry.find(geom) != geometry.end())
- continue;
-
- OptixTraversableHandle handle;
- OptixBuildOperation operation;
- CUdeviceptr out_data;
- // Refit is only possible in viewport for now.
- if (static_cast<BVHOptiX *>(geom->bvh)->do_refit && !background) {
- out_data = static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle;
- handle = static_cast<BVHOptiX *>(geom->bvh)->optix_handle;
+ // Refit is only possible in viewport for now (because AS is built with
+ // OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above)
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !background) {
+ assert(bvh_optix->traversable_handle != 0);
operation = OPTIX_BUILD_OPERATION_UPDATE;
}
else {
- out_data = 0;
- handle = 0;
- operation = OPTIX_BUILD_OPERATION_BUILD;
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
}
+ // Build bottom level acceleration structures (BLAS)
+ Geometry *const geom = bvh->geometry[0];
if (geom->geometry_type == Geometry::HAIR) {
// Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(ob->get_geometry());
+ Hair *const hair = static_cast<Hair *const>(geom);
if (hair->num_curves() == 0) {
- continue;
+ return;
}
const size_t num_segments = hair->num_segments();
@@ -1304,10 +1276,10 @@ class OptiXDevice : public CUDADevice {
num_motion_steps = hair->get_motion_steps();
}
- device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
// Four control points for each curve segment
const size_t num_vertices = num_segments * 4;
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
@@ -1325,7 +1297,7 @@ class OptiXDevice : public CUDADevice {
size_t center_step = (num_motion_steps - 1) / 2;
if (step != center_step) {
size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
+ // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
}
@@ -1452,22 +1424,15 @@ class OptiXDevice : public CUDADevice {
# endif
}
- // Allocate memory for new BLAS and build it
- if (build_optix_bvh(build_input, num_motion_steps, handle, out_data, operation)) {
- geometry.insert({ob->get_geometry(), handle});
- static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle = out_data;
- static_cast<BVHOptiX *>(geom->bvh)->optix_handle = handle;
- static_cast<BVHOptiX *>(geom->bvh)->do_refit = false;
- }
- else {
- return false;
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
}
}
else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
// Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(ob->get_geometry());
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
if (mesh->num_triangles() == 0) {
- continue;
+ return;
}
const size_t num_verts = mesh->get_verts().size();
@@ -1478,12 +1443,12 @@ class OptiXDevice : public CUDADevice {
num_motion_steps = mesh->get_motion_steps();
}
- device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
index_data.alloc(mesh->get_triangles().size());
memcpy(index_data.data(),
mesh->get_triangles().data(),
mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
vertex_data.alloc(num_verts * num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
@@ -1528,190 +1493,208 @@ class OptiXDevice : public CUDADevice {
build_input.triangleArray.numSbtRecords = 1;
build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
- // Allocate memory for new BLAS and build it
- if (build_optix_bvh(build_input, num_motion_steps, handle, out_data, operation)) {
- geometry.insert({ob->get_geometry(), handle});
- static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle = out_data;
- static_cast<BVHOptiX *>(geom->bvh)->optix_handle = handle;
- static_cast<BVHOptiX *>(geom->bvh)->do_refit = false;
- }
- else {
- return false;
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
}
}
}
+ else {
+ unsigned int num_instances = 0;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
- // Fill instance descriptions
+ // Fill instance descriptions
# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
+ device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
+ aabbs.alloc(bvh->objects.size());
# endif
- device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- // Create separate instance for triangle/curve meshes of an object
- const auto handle_it = geometry.find(ob->get_geometry());
- if (handle_it == geometry.end()) {
- continue;
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ // Calculate total motion transform size and allocate memory for them
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
}
- OptixTraversableHandle handle = handle_it->second;
+
+ for (Object *ob : bvh->objects) {
+ // Skip non-traceable objects
+ if (!ob->is_traceable())
+ continue;
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
+ OptixAabb &aabb = aabbs[num_instances];
+ aabb.minX = ob->bounds.min.x;
+ aabb.minY = ob->bounds.min.y;
+ aabb.minZ = ob->bounds.min.z;
+ aabb.maxX = ob->bounds.max.x;
+ aabb.maxY = ob->bounds.max.y;
+ aabb.maxZ = ob->bounds.max.z;
# endif
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
+ // Clear transform to identity matrix
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
- // Set user instance ID to object index
- instance.instanceId = ob->get_device_index();
+ // Set user instance ID to object index
+ instance.instanceId = ob->get_device_index();
- // Have to have at least one bit in the mask, or else instance would always be culled
- instance.visibilityMask = 1;
+ // Have to have at least one bit in the mask, or else instance would always be culled
+ instance.visibilityMask = 1;
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
+ if (ob->get_geometry()->has_volume) {
+ // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+ instance.visibilityMask |= 2;
+ }
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ // Same applies to curves (so they can be skipped in local trace calls)
+ instance.visibilityMask |= 4;
# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ DebugFlags().optix.curves_api &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ // Select between motion blur and non-motion blur built-in intersection module
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- CUdeviceptr motion_transform_gpu = 0;
- check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
- as_mem.push_back(motion_transform_gpu);
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
}
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+ // Insert motion traversable if object has motion
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(cuContext);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ // Allocate host side memory for motion transform and fill it with transform data
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ // Scale
+ srt_data[i].sx = decomp[i].y.w; // scale.x.x
+ srt_data[i].sy = decomp[i].z.w; // scale.y.y
+ srt_data[i].sz = decomp[i].w.w; // scale.z.z
+
+ // Shear
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+ assert(decomp[i].z.z == 0.0f); // scale.y.x
+ assert(decomp[i].w.y == 0.0f); // scale.z.x
+ assert(decomp[i].w.z == 0.0f); // scale.z.y
+
+ // Pivot point
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ // Rotation
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ // Translation
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Upload motion transform to GPU
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
+ // Disable instance transform if object uses motion transform already
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ // Get traversable handle to motion transform
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
}
else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from prim_object, so
- // distinguish them from instanced objects with high bit set
- instance.instanceId |= 0x800000;
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ // Set transform matrix
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ // Disable instance transform if geometry already has it applied to vertex data
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Non-instanced objects read ID from prim_object, so
+ // distinguish them from instanced objects with high bit set
+ instance.instanceId |= 0x800000;
+ }
}
}
- }
- // Upload instance descriptions
+ // Upload instance descriptions
# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
+ aabbs.resize(num_instances);
+ aabbs.copy_to_device();
# endif
- instances.resize(num_instances);
- instances.copy_to_device();
+ instances.resize(num_instances);
+ instances.copy_to_device();
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ // Build top-level acceleration structure (TLAS)
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
+ build_input.instanceArray.aabbs = aabbs.device_pointer;
+ build_input.instanceArray.numAabbs = num_instances;
# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
- CUdeviceptr out_data = 0;
- tlas_handle = 0;
- return build_optix_bvh(build_input, 0, tlas_handle, out_data, OPTIX_BUILD_OPERATION_BUILD);
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
}
void const_copy_to(const char *name, void *host, size_t size) override
@@ -1724,7 +1707,7 @@ class OptiXDevice : public CUDADevice {
if (strcmp(name, "__data") == 0) {
assert(size <= sizeof(KernelData));
- // Fix traversable handle on multi devices
+ // Update traversable handle (since it is different for each device on multi devices)
KernelData *const data = (KernelData *)host;
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
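The OptiX refactor moves acceleration-structure storage out of the device-wide as_mem list and into device_only_memory owned by each BVHOptiX, so its lifetime follows the BVH object; motion transforms are likewise packed into a single aligned allocation per BVH instead of one cuMemAlloc each. After compaction, the smaller buffer is swapped into place and the full-size build output is freed when the temporary goes out of scope. A minimal sketch of that swap-based ownership handoff (Buffer is a hypothetical stand-in for device_only_memory):

#include <cstddef>
#include <utility>

struct Buffer {
  void *pointer = nullptr;  // stands in for device_pointer
  size_t size = 0;          // stands in for device_size
  ~Buffer() { /* release the device allocation here */ }
};

// Swap the compacted buffer into the BVH-owned slot; the old full-size
// allocation now lives in 'compacted' and is released when it is
// destroyed at the end of the enclosing scope.
void adopt_compacted(Buffer &as_data, Buffer &compacted) {
  std::swap(as_data.pointer, compacted.pointer);
  std::swap(as_data.size, compacted.size);
}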