From a2b52dc5716a97e5413acbd6eefc9ce3788b6456 Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Thu, 12 Sep 2019 14:50:06 +0200 Subject: Cycles: add Optix device backend This uses hardware-accelerated raytracing on NVIDIA RTX graphics cards. It is still currently experimental. Most features are supported, but a few are still missing like baking, branched path tracing and using CPU memory. https://wiki.blender.org/wiki/Reference/Release_Notes/2.81/Cycles#NVIDIA_RTX For building with Optix support, the Optix SDK must be installed. See here for build instructions: https://wiki.blender.org/wiki/Building_Blender/CUDA Differential Revision: https://developer.blender.org/D5363 --- intern/cycles/CMakeLists.txt | 18 + intern/cycles/blender/addon/properties.py | 46 +- intern/cycles/blender/addon/ui.py | 23 +- intern/cycles/blender/blender_device.cpp | 7 +- intern/cycles/blender/blender_python.cpp | 10 +- intern/cycles/blender/blender_sync.cpp | 2 +- intern/cycles/bvh/CMakeLists.txt | 2 + intern/cycles/bvh/bvh.cpp | 11 + intern/cycles/bvh/bvh_optix.cpp | 215 ++++ intern/cycles/bvh/bvh_optix.h | 53 + intern/cycles/device/CMakeLists.txt | 4 + intern/cycles/device/device.cpp | 30 + intern/cycles/device/device.h | 18 +- intern/cycles/device/device_cuda.cpp | 5 - intern/cycles/device/device_intern.h | 4 + intern/cycles/device/device_multi.cpp | 18 + intern/cycles/device/device_optix.cpp | 1969 +++++++++++++++++++++++++++++ intern/cycles/kernel/CMakeLists.txt | 76 +- intern/cycles/render/mesh.cpp | 4 +- intern/cycles/util/util_debug.cpp | 16 +- intern/cycles/util/util_debug.h | 14 + 21 files changed, 2511 insertions(+), 34 deletions(-) create mode 100644 intern/cycles/bvh/bvh_optix.cpp create mode 100644 intern/cycles/bvh/bvh_optix.h create mode 100644 intern/cycles/device/device_optix.cpp (limited to 'intern') diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 6a3ebd85378..25e8e124885 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -219,6 +219,24 @@ if(WITH_CYCLES_OSL) ) endif() +if(WITH_CYCLES_DEVICE_OPTIX) + find_package(OptiX) + + if(OPTIX_FOUND) + add_definitions(-DWITH_OPTIX) + include_directories( + SYSTEM + ${OPTIX_INCLUDE_DIR} + ) + + # Need pre-compiled CUDA binaries in the OptiX device + set(WITH_CYCLES_CUDA_BINARIES ON) + else() + message(STATUS "Optix not found, disabling it from Cycles") + set(WITH_CYCLES_DEVICE_OPTIX OFF) + endif() +endif() + if(WITH_CYCLES_EMBREE) add_definitions(-DWITH_EMBREE) add_definitions(-DEMBREE_STATIC_LIB) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 93f8f76cd6a..8623b38a271 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -137,6 +137,7 @@ enum_world_mis = ( enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), + ('OPTIX', "OptiX", "OptiX", 3), ('OPENCL', "OpenCL", "OpenCL", 2) ) @@ -740,6 +741,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False) debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False) + debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1) + debug_opencl_kernel_type: EnumProperty( name="OpenCL Kernel Type", default='DEFAULT', @@ -1400,10 +1403,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_opencl = _cycles.get_device_types() + has_cuda, 
has_optix, has_opencl = _cycles.get_device_types() list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) + if has_optix: + list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) if has_opencl: list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2)) return list @@ -1424,7 +1429,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPENCL', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1439,8 +1444,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): # Update name in case it changed entry.name = device[0] - # Gets all devices types by default. - def get_devices(self, compute_device_type=''): + # Gets all devices types for a compute device type. + def get_devices_for_type(self, compute_device_type): import _cycles # Layout of the device tuples: (Name, Type, Persistent ID) device_list = _cycles.available_devices(compute_device_type) @@ -1449,20 +1454,23 @@ class CyclesPreferences(bpy.types.AddonPreferences): # hold pointers to a resized array. self.update_device_entries(device_list) # Sort entries into lists - cuda_devices = [] - opencl_devices = [] + devices = [] cpu_devices = [] for device in device_list: entry = self.find_existing_device_entry(device) - if entry.type == 'CUDA': - cuda_devices.append(entry) - elif entry.type == 'OPENCL': - opencl_devices.append(entry) + if entry.type == compute_device_type: + devices.append(entry) elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. - cuda_devices.extend(cpu_devices) - opencl_devices.extend(cpu_devices) + if compute_device_type in ('CUDA', 'OPENCL'): + devices.extend(cpu_devices) + return devices + + # For backwards compatibility, only has CUDA and OpenCL. 
+ def get_devices(self, compute_device_type=''): + cuda_devices = self.get_devices_for_type('CUDA') + opencl_devices = self.get_devices_for_type('OPENCL') return cuda_devices, opencl_devices def get_num_gpu_devices(self): @@ -1498,16 +1506,24 @@ class CyclesPreferences(bpy.types.AddonPreferences): for device in devices: box.prop(device, "use", text=device.name) + if device_type == 'OPTIX': + col = box.column(align=True) + col.label(text="OptiX support is experimental", icon='INFO') + col.label(text="Not all Cycles features are supported yet", icon='BLANK1') + + def draw_impl(self, layout, context): row = layout.row() row.prop(self, "compute_device_type", expand=True) - cuda_devices, opencl_devices = self.get_devices(self.compute_device_type) + devices = self.get_devices_for_type(self.compute_device_type) row = layout.row() if self.compute_device_type == 'CUDA': - self._draw_devices(row, 'CUDA', cuda_devices) + self._draw_devices(row, 'CUDA', devices) + elif self.compute_device_type == 'OPTIX': + self._draw_devices(row, 'OPTIX', devices) elif self.compute_device_type == 'OPENCL': - self._draw_devices(row, 'OPENCL', opencl_devices) + self._draw_devices(row, 'OPENCL', devices) def draw(self, context): self.draw_impl(self.layout, context) diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 200b08f93cb..44ed28e9e02 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -88,10 +88,16 @@ def use_cuda(context): return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU') +def use_optix(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU') + + def use_branched_path(context): cscene = context.scene.cycles - return (cscene.progressive == 'BRANCHED_PATH') + return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context)) def use_sample_all_lights(context): @@ -168,7 +174,8 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): layout.use_property_split = True layout.use_property_decorate = False - layout.prop(cscene, "progressive") + if not use_optix(context): + layout.prop(cscene, "progressive") if cscene.progressive == 'PATH' or use_branched_path(context) is False: col = layout.column(align=True) @@ -1763,6 +1770,10 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel): bl_options = {'DEFAULT_CLOSED'} COMPAT_ENGINES = {'CYCLES'} + @classmethod + def poll(cls, context): + return not use_optix(context) + def draw(self, context): layout = self.layout layout.use_property_split = True @@ -1947,7 +1958,13 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): col.separator() col = layout.column() - col.label(text='OpenCL Flags:') + col.label(text="OptiX Flags:") + col.prop(cscene, "debug_optix_cuda_streams") + + col.separator() + + col = layout.column() + col.label(text="OpenCL Flags:") col.prop(cscene, "debug_opencl_device_type", text="Device") col.prop(cscene, "debug_use_opencl_debug", text="Debug") col.prop(cscene, "debug_opencl_mem_limit") diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index 98fc0c6dec4..111fc8d5192 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -61,7 +61,8 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, COMPUTE_DEVICE_OPENCL = 2, - COMPUTE_DEVICE_NUM = 3, + COMPUTE_DEVICE_OPTIX = 3, + COMPUTE_DEVICE_NUM = 4, }; ComputeDevice 
compute_device = (ComputeDevice)get_enum( @@ -73,6 +74,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen if (compute_device == COMPUTE_DEVICE_CUDA) { mask |= DEVICE_MASK_CUDA; } + else if (compute_device == COMPUTE_DEVICE_OPTIX) { + /* Cannot use CPU and OptiX device at the same time right now, so replace mask. */ + mask = DEVICE_MASK_OPTIX; + } else if (compute_device == COMPUTE_DEVICE_OPENCL) { mask |= DEVICE_MASK_OPENCL; } diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 2bea6b34772..335d4daf09c 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -81,6 +81,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); + /* Synchronize OptiX flags. */ + flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams"); /* Synchronize OpenCL device type. */ switch (get_enum(cscene, "debug_opencl_device_type")) { case 0: @@ -960,14 +962,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector device_types = Device::available_types(); - bool has_cuda = false, has_opencl = false; + bool has_cuda = false, has_optix = false, has_opencl = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); + has_optix |= (device_type == DEVICE_OPTIX); has_opencl |= (device_type == DEVICE_OPENCL); } - PyObject *list = PyTuple_New(2); + PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); - PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_opencl)); + PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); + PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl)); return list; } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 8b7c66363d9..1a166d171bc 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -758,7 +758,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, preview_samples = preview_samples * preview_samples; } - if (get_enum(cscene, "progressive") == 0) { + if (get_enum(cscene, "progressive") == 0 && (params.device.type != DEVICE_OPTIX)) { if (background) { params.samples = aa_samples; } diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 36bbd937e1a..27a7f604e1c 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC bvh_build.cpp bvh_embree.cpp bvh_node.cpp + bvh_optix.cpp bvh_sort.cpp bvh_split.cpp bvh_unaligned.cpp @@ -29,6 +30,7 @@ set(SRC_HEADERS bvh_build.h bvh_embree.h bvh_node.h + bvh_optix.h bvh_params.h bvh_sort.h bvh_split.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index b6a4aba74b5..16c721da06a 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -26,6 +26,9 @@ #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" +#ifdef WITH_OPTIX +# include "bvh/bvh_optix.h" +#endif #ifdef WITH_EMBREE # include "bvh/bvh_embree.h" #endif @@ -51,6 +54,8 @@ const char *bvh_layout_name(BVHLayout layout) return "NONE"; case BVH_LAYOUT_EMBREE: return "EMBREE"; + case BVH_LAYOUT_OPTIX: + return "OPTIX"; case BVH_LAYOUT_ALL: return 
"ALL"; } @@ -115,6 +120,12 @@ BVH *BVH::create(const BVHParams ¶ms, return new BVHEmbree(params, meshes, objects); #else break; +#endif + case BVH_LAYOUT_OPTIX: +#ifdef WITH_OPTIX + return new BVHOptiX(params, meshes, objects); +#else + break; #endif case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp new file mode 100644 index 00000000000..b3a9aab3266 --- /dev/null +++ b/intern/cycles/bvh/bvh_optix.cpp @@ -0,0 +1,215 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "bvh/bvh_optix.h" +# include "render/mesh.h" +# include "render/object.h" +# include "util/util_logging.h" +# include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +BVHOptiX::BVHOptiX(const BVHParams ¶ms_, + const vector &meshes_, + const vector &objects_) + : BVH(params_, meshes_, objects_) +{ +} + +BVHOptiX::~BVHOptiX() +{ +} + +void BVHOptiX::build(Progress &, Stats *) +{ + if (params.top_level) + pack_tlas(); + else + pack_blas(); +} + +void BVHOptiX::copy_to_device(Progress &progress, DeviceScene *dscene) +{ + progress.set_status("Updating Scene BVH", "Building OptiX acceleration structure"); + + Device *const device = dscene->bvh_nodes.device; + if (!device->build_optix_bvh(this, dscene->bvh_nodes)) + progress.set_error("Failed to build OptiX acceleration structure"); +} + +void BVHOptiX::pack_blas() +{ + // Bottom-level BVH can contain multiple primitive types, so merge them: + assert(meshes.size() == 1 && objects.size() == 1); // These are build per-mesh + Mesh *const mesh = meshes[0]; + + if (params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) { + const size_t num_curves = mesh->num_curves(); + const size_t num_segments = mesh->num_segments(); + pack.prim_type.reserve(pack.prim_type.size() + num_segments); + pack.prim_index.reserve(pack.prim_index.size() + num_segments); + pack.prim_object.reserve(pack.prim_object.size() + num_segments); + // 'pack.prim_time' is only used in geom_curve_intersect.h + // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH + + uint type = PRIMITIVE_CURVE; + if (mesh->use_motion_blur && mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) + type = PRIMITIVE_MOTION_CURVE; + + for (size_t j = 0; j < num_curves; ++j) { + const Mesh::Curve curve = mesh->get_curve(j); + for (size_t k = 0; k < curve.num_segments(); ++k) { + pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(type, k)); + // Each curve segment points back to its curve index + pack.prim_index.push_back_reserved(j); + pack.prim_object.push_back_reserved(0); + } + } + } + + if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) { + const size_t num_triangles = mesh->num_triangles(); + pack.prim_type.reserve(pack.prim_type.size() + num_triangles); + pack.prim_index.reserve(pack.prim_index.size() + num_triangles); + 
pack.prim_object.reserve(pack.prim_object.size() + num_triangles); + + uint type = PRIMITIVE_TRIANGLE; + if (mesh->use_motion_blur && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) + type = PRIMITIVE_MOTION_TRIANGLE; + + for (size_t k = 0; k < num_triangles; ++k) { + pack.prim_type.push_back_reserved(type); + pack.prim_index.push_back_reserved(k); + pack.prim_object.push_back_reserved(0); + } + } + + // Initialize visibility to zero and later update it during top-level build + uint prev_visibility = objects[0]->visibility; + objects[0]->visibility = 0; + + // Update 'pack.prim_tri_index', 'pack.prim_tri_verts' and 'pack.prim_visibility' + pack_primitives(); + + // Reset visibility after packing + objects[0]->visibility = prev_visibility; +} + +void BVHOptiX::pack_tlas() +{ + // Calculate total packed size + size_t prim_index_size = 0; + size_t prim_tri_verts_size = 0; + foreach (Mesh *mesh, meshes) { + BVH *const bvh = mesh->bvh; + prim_index_size += bvh->pack.prim_index.size(); + prim_tri_verts_size += bvh->pack.prim_tri_verts.size(); + } + + if (prim_index_size == 0) + return; // Abort right away if this is an empty BVH + + size_t pack_offset = 0; + size_t pack_verts_offset = 0; + + pack.prim_type.resize(prim_index_size); + int *pack_prim_type = pack.prim_type.data(); + pack.prim_index.resize(prim_index_size); + int *pack_prim_index = pack.prim_index.data(); + pack.prim_object.resize(prim_index_size); + int *pack_prim_object = pack.prim_object.data(); + pack.prim_visibility.resize(prim_index_size); + uint *pack_prim_visibility = pack.prim_visibility.data(); + pack.prim_tri_index.resize(prim_index_size); + uint *pack_prim_tri_index = pack.prim_tri_index.data(); + pack.prim_tri_verts.resize(prim_tri_verts_size); + float4 *pack_prim_tri_verts = pack.prim_tri_verts.data(); + + // Top-level BVH should only contain instances, see 'Mesh::need_build_bvh' + // Iterate over scene mesh list instead of objects, since the 'prim_offset' is calculated based + // on that list, which may be ordered differently from the object list. 
+ foreach (Mesh *mesh, meshes) { + PackedBVH &bvh_pack = mesh->bvh->pack; + int mesh_tri_offset = mesh->tri_offset; + int mesh_curve_offset = mesh->curve_offset; + + // Merge primitive, object and triangle indexes + if (!bvh_pack.prim_index.empty()) { + int *bvh_prim_type = &bvh_pack.prim_type[0]; + int *bvh_prim_index = &bvh_pack.prim_index[0]; + uint *bvh_prim_tri_index = &bvh_pack.prim_tri_index[0]; + uint *bvh_prim_visibility = &bvh_pack.prim_visibility[0]; + + for (size_t i = 0; i < bvh_pack.prim_index.size(); i++, pack_offset++) { + if (bvh_pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { + pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_curve_offset; + pack_prim_tri_index[pack_offset] = -1; + } + else { + pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_tri_offset; + pack_prim_tri_index[pack_offset] = bvh_prim_tri_index[i] + pack_verts_offset; + } + + pack_prim_type[pack_offset] = bvh_prim_type[i]; + pack_prim_object[pack_offset] = 0; // Unused for instanced meshes + pack_prim_visibility[pack_offset] = bvh_prim_visibility[i]; + } + } + + // Merge triangle vertex data + if (!bvh_pack.prim_tri_verts.empty()) { + const size_t prim_tri_size = bvh_pack.prim_tri_verts.size(); + memcpy(pack_prim_tri_verts + pack_verts_offset, + bvh_pack.prim_tri_verts.data(), + prim_tri_size * sizeof(float4)); + pack_verts_offset += prim_tri_size; + } + } + + // Merge visibility flags of all objects and fix object indices for non-instanced meshes + foreach (Object *ob, objects) { + Mesh *const mesh = ob->mesh; + for (size_t i = 0; i < mesh->num_primitives(); ++i) { + if (!ob->mesh->is_instanced()) { + assert(pack.prim_object[mesh->prim_offset + i] == 0); + pack.prim_object[mesh->prim_offset + i] = ob->get_device_index(); + } + pack.prim_visibility[mesh->prim_offset + i] |= ob->visibility_for_tracing(); + } + } +} + +void BVHOptiX::pack_nodes(const BVHNode *) +{ +} + +void BVHOptiX::refit_nodes() +{ + // TODO(pmours): Implement? + VLOG(1) << "Refit is not yet implemented for OptiX BVH."; +} + +BVHNode *BVHOptiX::widen_children_nodes(const BVHNode *) +{ + return NULL; +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/bvh/bvh_optix.h b/intern/cycles/bvh/bvh_optix.h new file mode 100644 index 00000000000..35033fe635f --- /dev/null +++ b/intern/cycles/bvh/bvh_optix.h @@ -0,0 +1,53 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __BVH_OPTIX_H__ +#define __BVH_OPTIX_H__ + +#ifdef WITH_OPTIX + +# include "bvh/bvh.h" +# include "bvh/bvh_params.h" +# include "device/device_memory.h" + +CCL_NAMESPACE_BEGIN + +class BVHOptiX : public BVH { + friend class BVH; + + public: + BVHOptiX(const BVHParams ¶ms, const vector &meshes, const vector &objects); + virtual ~BVHOptiX(); + + virtual void build(Progress &progress, Stats *) override; + virtual void copy_to_device(Progress &progress, DeviceScene *dscene) override; + + private: + void pack_blas(); + void pack_tlas(); + + virtual void pack_nodes(const BVHNode *) override; + virtual void refit_nodes() override; + + virtual BVHNode *widen_children_nodes(const BVHNode *) override; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ + +#endif /* __BVH_OPTIX_H__ */ diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 3e14480e2ad..a8c4949ad07 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -29,6 +29,7 @@ set(SRC device_memory.cpp device_multi.cpp device_opencl.cpp + device_optix.cpp device_split_kernel.cpp device_task.cpp ) @@ -85,6 +86,9 @@ endif() if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() +if(WITH_CYCLES_DEVICE_OPTIX) + add_definitions(-DWITH_OPTIX) +endif() if(WITH_CYCLES_DEVICE_MULTI) add_definitions(-DWITH_MULTI) endif() diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 47d111802cd..fe8a814cd14 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -38,6 +38,7 @@ bool Device::need_devices_update = true; thread_mutex Device::device_mutex; vector Device::opencl_devices; vector Device::cuda_devices; +vector Device::optix_devices; vector Device::cpu_devices; vector Device::network_devices; uint Device::devices_initialized_mask = 0; @@ -379,6 +380,14 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool device = NULL; break; #endif +#ifdef WITH_OPTIX + case DEVICE_OPTIX: + if (device_optix_init()) + device = device_optix_create(info, stats, profiler, background); + else + device = NULL; + break; +#endif #ifdef WITH_MULTI case DEVICE_MULTI: device = device_multi_create(info, stats, profiler, background); @@ -410,6 +419,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_CPU; else if (strcmp(name, "CUDA") == 0) return DEVICE_CUDA; + else if (strcmp(name, "OPTIX") == 0) + return DEVICE_OPTIX; else if (strcmp(name, "OPENCL") == 0) return DEVICE_OPENCL; else if (strcmp(name, "NETWORK") == 0) @@ -426,6 +437,8 @@ string Device::string_from_type(DeviceType type) return "CPU"; else if (type == DEVICE_CUDA) return "CUDA"; + else if (type == DEVICE_OPTIX) + return "OPTIX"; else if (type == DEVICE_OPENCL) return "OPENCL"; else if (type == DEVICE_NETWORK) @@ -443,6 +456,9 @@ vector Device::available_types() #ifdef WITH_CUDA types.push_back(DEVICE_CUDA); #endif +#ifdef WITH_OPTIX + types.push_back(DEVICE_OPTIX); +#endif #ifdef WITH_OPENCL types.push_back(DEVICE_OPENCL); #endif @@ -488,6 +504,20 @@ vector Device::available_devices(uint mask) } #endif +#ifdef WITH_OPTIX + if (mask & DEVICE_MASK_OPTIX) { + if (!(devices_initialized_mask & DEVICE_MASK_OPTIX)) { + if (device_optix_init()) { + device_optix_info(optix_devices); + } + devices_initialized_mask |= DEVICE_MASK_OPTIX; + } + foreach (DeviceInfo &info, optix_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { 
device_cpu_info(cpu_devices); diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 15a0ceb4a19..672d93c2581 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -34,6 +34,7 @@ CCL_NAMESPACE_BEGIN +class BVH; class Progress; class RenderTile; @@ -45,13 +46,15 @@ enum DeviceType { DEVICE_OPENCL, DEVICE_CUDA, DEVICE_NETWORK, - DEVICE_MULTI + DEVICE_MULTI, + DEVICE_OPTIX, }; enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), + DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), DEVICE_MASK_ALL = ~0 }; @@ -380,7 +383,11 @@ class Device { } /* tasks */ - virtual int get_split_task_count(DeviceTask &task) = 0; + virtual int get_split_task_count(DeviceTask &) + { + return 1; + } + virtual void task_add(DeviceTask &task) = 0; virtual void task_wait() = 0; virtual void task_cancel() = 0; @@ -399,6 +406,12 @@ class Device { bool transparent, const DeviceDrawParams &draw_params); + /* acceleration structure building */ + virtual bool build_optix_bvh(BVH *, device_memory &) + { + return false; + } + #ifdef WITH_NETWORK /* networking */ void server_run(); @@ -456,6 +469,7 @@ class Device { static bool need_types_update, need_devices_update; static thread_mutex device_mutex; static vector cuda_devices; + static vector optix_devices; static vector opencl_devices; static vector cpu_devices; static vector network_devices; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 4d7d87828c2..00dd37f089c 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -2263,11 +2263,6 @@ class CUDADevice : public Device { } }; - int get_split_task_count(DeviceTask & /*task*/) - { - return 1; - } - void task_add(DeviceTask &task) { CUDAContextScope scope(this); diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index c393a3f9cda..5b8b86886c4 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -27,6 +27,9 @@ Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool device_opencl_compile_kernel(const vector ¶meters); bool device_cuda_init(); Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); +bool device_optix_init(); +Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + Device *device_network_create(DeviceInfo &info, Stats &stats, Profiler &profiler, @@ -36,6 +39,7 @@ Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, void device_cpu_info(vector &devices); void device_opencl_info(vector &devices); void device_cuda_info(vector &devices); +void device_optix_info(vector &devices); void device_network_info(vector &devices); string device_cpu_capabilities(); diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 4a40e106115..ac71be9dbea 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -153,6 +153,24 @@ class MultiDevice : public Device { return result; } + bool build_optix_bvh(BVH *bvh, device_memory &mem) + { + device_ptr key = unique_key++; + + // Broadcast acceleration structure build to all devices + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + if (!sub.device->build_optix_bvh(bvh, mem)) + return false; + sub.ptr_map[key] = 
mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + return true; + } + void mem_alloc(device_memory &mem) { device_ptr key = unique_key++; diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp new file mode 100644 index 00000000000..84d7ecf6934 --- /dev/null +++ b/intern/cycles/device/device_optix.cpp @@ -0,0 +1,1969 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "device/device.h" +# include "device/device_intern.h" +# include "device/device_denoising.h" +# include "bvh/bvh.h" +# include "render/scene.h" +# include "render/mesh.h" +# include "render/object.h" +# include "render/buffers.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_time.h" +# include "util/util_debug.h" +# include "util/util_logging.h" + +# undef _WIN32_WINNT // Need minimum API support for Windows 7 +# define _WIN32_WINNT _WIN32_WINNT_WIN7 + +# ifdef WITH_CUDA_DYNLOAD +# include +// Do not use CUDA SDK headers when using CUEW +# define OPTIX_DONT_INCLUDE_CUDA +# endif +# include +# include + +CCL_NAMESPACE_BEGIN + +/* Make sure this stays in sync with kernel_globals.h */ +struct ShaderParams { + uint4 *input; + float4 *output; + int type; + int filter; + int sx; + int offset; + int sample; +}; +struct KernelParams { + WorkTile tile; + KernelData data; + ShaderParams shader; +# define KERNEL_TEX(type, name) const type *name; +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX +}; + +# define check_result_cuda(stmt) \ + { \ + CUresult res = stmt; \ + if (res != CUDA_SUCCESS) { \ + const char *name; \ + cuGetErrorName(res, &name); \ + set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \ + return; \ + } \ + } \ + (void)0 +# define check_result_cuda_ret(stmt) \ + { \ + CUresult res = stmt; \ + if (res != CUDA_SUCCESS) { \ + const char *name; \ + cuGetErrorName(res, &name); \ + set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \ + return false; \ + } \ + } \ + (void)0 + +# define check_result_optix(stmt) \ + { \ + enum OptixResult res = stmt; \ + if (res != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(res); \ + set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \ + return; \ + } \ + } \ + (void)0 +# define check_result_optix_ret(stmt) \ + { \ + enum OptixResult res = stmt; \ + if (res != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(res); \ + set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \ + return false; \ + } \ + } \ + (void)0 + +class OptiXDevice : public Device { + + // List of OptiX program groups + enum { + PG_RGEN, + PG_MISS, + PG_HITD, // Default hit group + PG_HITL, // __BVH_LOCAL__ hit group + PG_HITS, // __SHADOW_RECORD_ALL__ hit group +# ifdef WITH_CYCLES_DEBUG + PG_EXCP, +# endif + 
PG_BAKE, // kernel_bake_evaluate + PG_DISP, // kernel_displace_evaluate + PG_BACK, // kernel_background_evaluate + NUM_PROGRAM_GROUPS + }; + + // List of OptiX pipelines + enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; + + // A single shader binding table entry + struct SbtRecord { + char header[OPTIX_SBT_RECORD_HEADER_SIZE]; + }; + + // Information stored about CUDA memory allocations + struct CUDAMem { + bool free_map_host = false; + CUarray array = NULL; + CUtexObject texobject = 0; + void *map_host_pointer = nullptr; + }; + + // Helper class to manage current CUDA context + struct CUDAContextScope { + CUDAContextScope(CUcontext ctx) + { + cuCtxPushCurrent(ctx); + } + ~CUDAContextScope() + { + cuCtxPopCurrent(NULL); + } + }; + + // Use a pool with multiple threads to support launches with multiple CUDA streams + TaskPool task_pool; + + // CUDA/OptiX context handles + CUdevice cuda_device = 0; + CUcontext cuda_context = NULL; + vector cuda_stream; + OptixDeviceContext context = NULL; + + // Need CUDA kernel module for some utility functions + CUmodule cuda_module = NULL; + CUmodule cuda_filter_module = NULL; + // All necessary OptiX kernels are in one module + OptixModule optix_module = NULL; + OptixPipeline pipelines[NUM_PIPELINES] = {}; + + bool need_texture_info = false; + device_vector sbt_data; + device_vector texture_info; + device_only_memory launch_params; + vector> blas; + OptixTraversableHandle tlas_handle = 0; + + map cuda_mem_map; + + public: + OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) + : Device(info_, stats_, profiler_, background_), + sbt_data(this, "__sbt", MEM_READ_ONLY), + texture_info(this, "__texture_info", MEM_TEXTURE), + launch_params(this, "__params") + { + // Store number of CUDA streams in device info + info.cpu_threads = DebugFlags().optix.cuda_streams; + + // Initialize CUDA driver API + check_result_cuda(cuInit(0)); + + // Retrieve the primary CUDA context for this device + check_result_cuda(cuDeviceGet(&cuda_device, info.num)); + check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device)); + + // Make that CUDA context current + const CUDAContextScope scope(cuda_context); + + // Create OptiX context for this device + OptixDeviceContextOptions options = {}; +# ifdef WITH_CYCLES_LOGGING + options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 + options.logCallbackFunction = + [](unsigned int level, const char *, const char *message, void *) { + switch (level) { + case 1: + LOG_IF(FATAL, VLOG_IS_ON(1)) << message; + break; + case 2: + LOG_IF(ERROR, VLOG_IS_ON(1)) << message; + break; + case 3: + LOG_IF(WARNING, VLOG_IS_ON(1)) << message; + break; + case 4: + LOG_IF(INFO, VLOG_IS_ON(1)) << message; + break; + } + }; +# endif + check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context)); +# ifdef WITH_CYCLES_LOGGING + check_result_optix(optixDeviceContextSetLogCallback( + context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); +# endif + + // Create launch streams + cuda_stream.resize(info.cpu_threads); + for (int i = 0; i < info.cpu_threads; ++i) + check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); + + // Fix weird compiler bug that assigns wrong size + launch_params.data_elements = sizeof(KernelParams); + // Allocate launch parameter buffer memory on device + launch_params.alloc_to_device(info.cpu_threads); + } + ~OptiXDevice() + { + // Stop processing any more tasks + task_pool.stop(); + + // 
Clean up all memory before destroying context + blas.clear(); + + sbt_data.free(); + texture_info.free(); + launch_params.free(); + + // Make CUDA context current + const CUDAContextScope scope(cuda_context); + + // Unload modules + if (cuda_module != NULL) + cuModuleUnload(cuda_module); + if (cuda_filter_module != NULL) + cuModuleUnload(cuda_filter_module); + if (optix_module != NULL) + optixModuleDestroy(optix_module); + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) + if (pipelines[i] != NULL) + optixPipelineDestroy(pipelines[i]); + + // Destroy launch streams + for (int i = 0; i < info.cpu_threads; ++i) + cuStreamDestroy(cuda_stream[i]); + + // Destroy OptiX and CUDA context + optixDeviceContextDestroy(context); + cuDevicePrimaryCtxRelease(cuda_device); + } + + private: + bool show_samples() const override + { + // Only show samples if not rendering multiple tiles in parallel + return info.cpu_threads == 1; + } + + BVHLayoutMask get_bvh_layout_mask() const override + { + // OptiX has its own internal acceleration structure format + return BVH_LAYOUT_OPTIX; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) override + { + if (have_error()) + return false; // Abort early if context creation failed already + + // Disable baking for now, since its kernel is not well-suited for inlining and is very slow + if (requested_features.use_baking) { + set_error("OptiX implementation does not support baking yet"); + return false; + } + // Disable shader raytracing support for now, since continuation callables are slow + if (requested_features.use_shader_raytrace) { + set_error("OptiX implementation does not support shader raytracing yet"); + return false; + } + + const CUDAContextScope scope(cuda_context); + + // Unload any existing modules first + if (cuda_module != NULL) + cuModuleUnload(cuda_module); + if (cuda_filter_module != NULL) + cuModuleUnload(cuda_filter_module); + if (optix_module != NULL) + optixModuleDestroy(optix_module); + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) + if (pipelines[i] != NULL) + optixPipelineDestroy(pipelines[i]); + + OptixModuleCompileOptions module_options; + module_options.maxRegisterCount = 0; // Do not set an explicit register limit +# ifdef WITH_CYCLES_DEBUG + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; +# else + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; +# endif + OptixPipelineCompileOptions pipeline_options; + // Default to no motion blur and two-level graph, since it is the fastest option + pipeline_options.usesMotionBlur = false; + pipeline_options.traversableGraphFlags = + OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + pipeline_options.numPayloadValues = 6; + pipeline_options.numAttributeValues = 2; // u, v +# ifdef WITH_CYCLES_DEBUG + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW | + OPTIX_EXCEPTION_FLAG_TRACE_DEPTH; +# else + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; +# endif + pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h + + if (requested_features.use_object_motion) { + pipeline_options.usesMotionBlur = true; + // Motion blur can insert motion transforms into the traversal graph + // It is no longer a two-level graph then, so need to set flags to allow any configuration + pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + } + + 
{ // Load and compile PTX module with OptiX kernels + string ptx_data; + const string ptx_filename = "lib/kernel_optix.ptx"; + if (!path_read_text(path_get(ptx_filename), ptx_data)) { + set_error("Failed loading OptiX kernel " + ptx_filename + "."); + return false; + } + + check_result_optix_ret(optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &optix_module)); + } + + { // Load CUDA modules because we need some of the utility kernels + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num); + + string cubin_data; + const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor); + if (!path_read_text(path_get(cubin_filename), cubin_data)) { + set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + "."); + return false; + } + + check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data())); + + if (requested_features.use_denoising) { + string filter_data; + const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor); + if (!path_read_text(path_get(filter_filename), filter_data)) { + set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + "."); + return false; + } + + check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data())); + } + } + + // Create program groups + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupOptions group_options = {}; // There are no options currently + group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN].raygen.module = optix_module; + // Ignore branched integrator for now (see "requested_features.use_integrator_branched") + group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; + group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; + group_descs[PG_MISS].miss.module = optix_module; + group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; + group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD].hitgroup.moduleCH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITD].hitgroup.moduleAH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; + group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS].hitgroup.moduleAH = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + + if (requested_features.use_hair) { + // Add curve intersection programs + group_descs[PG_HITD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve"; + group_descs[PG_HITS].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve"; + } + + if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { + // Add hit group for local intersections + group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITL].hitgroup.moduleAH = optix_module; + group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; + } + +# ifdef WITH_CYCLES_DEBUG + group_descs[PG_EXCP].kind = 
OPTIX_PROGRAM_GROUP_KIND_EXCEPTION; + group_descs[PG_EXCP].exception.module = optix_module; + group_descs[PG_EXCP].exception.entryFunctionName = "__exception__kernel_optix_exception"; +# endif + + if (requested_features.use_baking) { + group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_BAKE].raygen.module = optix_module; + group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; + } + + if (requested_features.use_true_displacement) { + group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_DISP].raygen.module = optix_module; + group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; + } + + if (requested_features.use_background_light) { + group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_BACK].raygen.module = optix_module; + group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; + } + + check_result_optix_ret(optixProgramGroupCreate( + context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); + + // Get program stack sizes + OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; + // Set up SBT, which in this case is used only to select between different programs + sbt_data.alloc(NUM_PROGRAM_GROUPS); + memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); + } + sbt_data.copy_to_device(); // Upload SBT to device + + // Calculate maximum trace continuation stack size + unsigned int trace_css = stack_size[PG_HITD].cssCH; + // This is based on the maximum of closest-hit and any-hit/intersection programs + trace_css = max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); + trace_css = max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + + OptixPipelineLinkOptions link_options; + link_options.maxTraceDepth = 1; +# ifdef WITH_CYCLES_DEBUG + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; +# else + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; +# endif + link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur; + + { // Create path tracing pipeline + OptixProgramGroup pipeline_groups[] = { + groups[PG_RGEN], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# ifdef WITH_CYCLES_DEBUG + groups[PG_EXCP], +# endif + }; + check_result_optix_ret( + optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups, + (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), + nullptr, + 0, + &pipelines[PIP_PATH_TRACE])); + + // Combine ray generation and trace continuation stack size + const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; + + // Set stack size depending on pipeline options + check_result_optix_ret(optixPipelineSetStackSize( + pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 
3 : 2))); + } + + // Only need to create shader evaluation pipeline if one of these features is used: + const bool use_shader_eval_pipeline = requested_features.use_baking || + requested_features.use_background_light || + requested_features.use_true_displacement; + + if (use_shader_eval_pipeline) { // Create shader evaluation pipeline + OptixProgramGroup pipeline_groups[] = { + groups[PG_BAKE], + groups[PG_DISP], + groups[PG_BACK], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# ifdef WITH_CYCLES_DEBUG + groups[PG_EXCP], +# endif + }; + check_result_optix_ret( + optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups, + (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), + nullptr, + 0, + &pipelines[PIP_SHADER_EVAL])); + + // Calculate continuation stack size based on the maximum of all ray generation stack sizes + const unsigned int css = max(stack_size[PG_BAKE].cssRG, + max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) + + link_options.maxTraceDepth * trace_css; + + check_result_optix_ret(optixPipelineSetStackSize( + pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2))); + } + + // Clean up program group objects + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optixProgramGroupDestroy(groups[i]); + } + + return true; + } + + void thread_run(DeviceTask &task, int thread_index) // Main task entry point + { + if (have_error()) + return; // Abort early if there was an error previously + + if (task.type == DeviceTask::RENDER) { + RenderTile tile; + while (task.acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) + launch_render(task, tile, thread_index); + else if (tile.task == RenderTile::DENOISE) + launch_denoise(task, tile, thread_index); + task.release_tile(tile); + if (task.get_cancel() && !task.need_finish_queue) + break; // User requested cancellation + else if (have_error()) + break; // Abort rendering when encountering an error + } + } + else if (task.type == DeviceTask::SHADER) { + launch_shader_eval(task, thread_index); + } + else if (task.type == DeviceTask::FILM_CONVERT) { + launch_film_convert(task, thread_index); + } + } + + void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) + { + assert(thread_index < launch_params.data_size); + + // Keep track of total render time of this tile + const scoped_timer timer(&rtile.buffers->render_time); + + WorkTile wtile; + wtile.x = rtile.x; + wtile.y = rtile.y; + wtile.w = rtile.w; + wtile.h = rtile.h; + wtile.offset = rtile.offset; + wtile.stride = rtile.stride; + wtile.buffer = (float *)rtile.buffer; + + const int end_sample = rtile.start_sample + rtile.num_samples; + // Keep this number reasonable to avoid running into TDRs + const int step_samples = (info.display_device ? 
8 : 32); + // Offset into launch params buffer so that streams use separate data + device_ptr launch_params_ptr = launch_params.device_pointer + + thread_index * launch_params.data_elements; + + const CUDAContextScope scope(cuda_context); + + for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) { + // Copy work tile information to device + wtile.num_samples = min(step_samples, end_sample - sample); + wtile.start_sample = sample; + check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile), + &wtile, + sizeof(wtile), + cuda_stream[thread_index])); + + OptixShaderBindingTable sbt_params = {}; + sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); +# ifdef WITH_CYCLES_DEBUG + sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord); +# endif + sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = 1; + sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); + sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS + + // Launch the ray generation program + check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], + cuda_stream[thread_index], + launch_params_ptr, + launch_params.data_elements, + &sbt_params, + // Launch with samples close to each other for better locality + wtile.w * wtile.num_samples, + wtile.h, + 1)); + + // Wait for launch to finish + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + + // Update current sample, so it is displayed correctly + rtile.sample = wtile.start_sample + wtile.num_samples; + // Update task progress after the kernel completed rendering + task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); + + if (task.get_cancel() && !task.need_finish_queue) + return; // Cancel rendering + } + } + + void launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index) + { + const CUDAContextScope scope(cuda_context); + + // Run CUDA denoising kernels + DenoisingTask denoising(this, task); + denoising.functions.construct_transform = function_bind( + &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index); + denoising.functions.accumulate = function_bind( + &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index); + denoising.functions.solve = function_bind( + &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index); + denoising.functions.divide_shadow = function_bind( + &OptiXDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising, thread_index); + denoising.functions.non_local_means = function_bind( + &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index); + denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves, + this, + _1, + _2, + _3, + _4, + _5, + _6, + &denoising, + thread_index); + denoising.functions.get_feature = function_bind( + &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index); + denoising.functions.write_feature = function_bind( + &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index); + denoising.functions.detect_outliers = function_bind( + &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index); + + denoising.filter_area = make_int4(rtile.x, rtile.y, 
rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample = rtile.start_sample + rtile.num_samples; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); + + task.update_progress(&rtile, rtile.w * rtile.h); + } + + void launch_shader_eval(DeviceTask &task, int thread_index) + { + unsigned int rgen_index = PG_BACK; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) + rgen_index = PG_BAKE; + if (task.shader_eval_type == SHADER_EVAL_DISPLACE) + rgen_index = PG_DISP; + + const CUDAContextScope scope(cuda_context); + + device_ptr launch_params_ptr = launch_params.device_pointer + + thread_index * launch_params.data_elements; + + for (int sample = 0; sample < task.num_samples; ++sample) { + ShaderParams params; + params.input = (uint4 *)task.shader_input; + params.output = (float4 *)task.shader_output; + params.type = task.shader_eval_type; + params.filter = task.shader_filter; + params.sx = task.shader_x; + params.offset = task.offset; + params.sample = sample; + + check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), + ¶ms, + sizeof(params), + cuda_stream[thread_index])); + + OptixShaderBindingTable sbt_params = {}; + sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); +# ifdef WITH_CYCLES_DEBUG + sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord); +# endif + sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = 1; + sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); + sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS + + check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], + cuda_stream[thread_index], + launch_params_ptr, + launch_params.data_elements, + &sbt_params, + task.shader_w, + 1, + 1)); + + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + + task.update_progress(NULL); + } + } + + void launch_film_convert(DeviceTask &task, int thread_index) + { + const CUDAContextScope scope(cuda_context); + + CUfunction film_convert_func; + check_result_cuda(cuModuleGetFunction(&film_convert_func, + cuda_module, + task.rgba_byte ? "kernel_cuda_convert_to_byte" : + "kernel_cuda_convert_to_half_float")); + + float sample_scale = 1.0f / (task.sample + 1); + CUdeviceptr rgba = (task.rgba_byte ? 
task.rgba_byte : task.rgba_half); + + void *args[] = {&rgba, + &task.buffer, + &sample_scale, + &task.x, + &task.y, + &task.w, + &task.h, + &task.offset, + &task.stride}; + + int threads_per_block; + check_result_cuda(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func)); + + const int num_threads_x = (int)sqrt(threads_per_block); + const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x; + const int num_threads_y = (int)sqrt(threads_per_block); + const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y; + + check_result_cuda(cuLaunchKernel(film_convert_func, + num_blocks_x, + num_blocks_y, + 1, /* blocks */ + num_threads_x, + num_threads_y, + 1, /* threads */ + 0, + cuda_stream[thread_index], + args, + 0)); + + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + + task.update_progress(NULL); + } + + bool build_optix_bvh(const OptixBuildInput &build_input, + uint16_t num_motion_steps, + device_memory &out_data, + OptixTraversableHandle &out_handle) + { + out_handle = 0; + + const CUDAContextScope scope(cuda_context); + + // Compute memory usage + OptixAccelBufferSizes sizes = {}; + OptixAccelBuildOptions options; + options.operation = OPTIX_BUILD_OPERATION_BUILD; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE; + options.motionOptions.numKeys = num_motion_steps; + options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; + options.motionOptions.timeBegin = 0.0f; + options.motionOptions.timeEnd = 1.0f; + + check_result_optix_ret( + optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); + + // Allocate required output buffers + device_only_memory temp_mem(this, "temp_build_mem"); + temp_mem.alloc_to_device(sizes.tempSizeInBytes); + + out_data.data_type = TYPE_UNKNOWN; + out_data.data_elements = 1; + out_data.data_size = sizes.outputSizeInBytes; + mem_alloc(out_data); + + // Finally build the acceleration structure + check_result_optix_ret(optixAccelBuild(context, + NULL, + &options, + &build_input, + 1, + temp_mem.device_pointer, + sizes.tempSizeInBytes, + out_data.device_pointer, + sizes.outputSizeInBytes, + &out_handle, + NULL, + 0)); + + // Wait for all operations to finish + check_result_cuda_ret(cuStreamSynchronize(NULL)); + + return true; + } + + bool build_optix_bvh(BVH *bvh, device_memory &out_data) override + { + assert(bvh->params.top_level); + + unsigned int num_instances = 0; + unordered_map> meshes; + + // Clear all previous AS + blas.clear(); + + // Build bottom level acceleration structures (BLAS) + // Note: Always keep this logic in sync with bvh_optix.cpp! 
+ for (Object *ob : bvh->objects) { + // Skip meshes for which acceleration structure already exists + if (meshes.find(ob->mesh) != meshes.end()) + continue; + + Mesh *const mesh = ob->mesh; + vector handles; + + // Build BLAS for curve primitives + if (bvh->params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) { + const size_t num_curves = mesh->num_curves(); + const size_t num_segments = mesh->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (mesh->use_motion_blur && motion_keys) { + num_motion_steps = mesh->motion_steps; + } + + device_vector aabb_data(this, "temp_aabb_data", MEM_READ_ONLY); + aabb_data.alloc(num_segments * num_motion_steps); + + // Get AABBs for each motion step + for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *keys = mesh->curve_keys.data(); + + size_t center_step = (num_motion_steps - 1) / 2; + // The center step for motion vertices is not stored in the attribute + if (step != center_step) { + keys = motion_keys->data_float3() + + (step > center_step ? step - 1 : step) * num_segments; + } + + for (size_t i = step * num_segments, j = 0; j < num_curves; ++j) { + const Mesh::Curve c = mesh->get_curve(j); + for (size_t k = 0; k < c.num_segments(); ++i, ++k) { + BoundBox bounds = BoundBox::empty; + c.bounds_grow(k, keys, mesh->curve_radius.data(), bounds); + aabb_data[i].minX = bounds.min.x; + aabb_data[i].minY = bounds.min.y; + aabb_data[i].minZ = bounds.min.z; + aabb_data[i].maxX = bounds.max.x; + aabb_data[i].maxY = bounds.max.y; + aabb_data[i].maxZ = bounds.max.z; + } + } + } + + // Upload AABB data to GPU + aabb_data.copy_to_device(); + + vector aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); + } + + // Disable visibility test anyhit program, since it is already checked during intersection + // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; + build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.aabbArray.numPrimitives = num_segments; + build_input.aabbArray.strideInBytes = sizeof(OptixAabb); + build_input.aabbArray.flags = &build_flags; + build_input.aabbArray.numSbtRecords = 1; + build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset; + + // Allocate memory for new BLAS and build it + blas.emplace_back(this, "blas"); + handles.emplace_back(); + if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back())) + return false; + } + + // Build BLAS for triangle primitives + if (bvh->params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) { + const size_t num_verts = mesh->verts.size(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (mesh->use_motion_blur && motion_keys) { + num_motion_steps = mesh->motion_steps; + } + + device_vector index_data(this, "temp_index_data", MEM_READ_ONLY); + index_data.alloc(mesh->triangles.size()); + memcpy(index_data.data(), mesh->triangles.data(), mesh->triangles.size() * sizeof(int)); + device_vector vertex_data(this, "temp_vertex_data", MEM_READ_ONLY); + vertex_data.alloc(num_verts * num_motion_steps); + 
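+        // The motion attribute stores only the non-center steps (the center step is the regular
+        // vertex array), so indices past the center step are shifted down by one when reading
+        // from the attribute below.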
+ for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *verts = mesh->verts.data(); + + size_t center_step = (num_motion_steps - 1) / 2; + // The center step for motion vertices is not stored in the attribute + if (step != center_step) { + verts = motion_keys->data_float3() + + (step > center_step ? step - 1 : step) * num_verts; + } + + memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); + } + + // Upload triangle data to GPU + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); + } + + // No special build flags for triangle primitives + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE; + + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; + build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.triangleArray.numVertices = num_verts; + build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; + build_input.triangleArray.vertexStrideInBytes = sizeof(float3); + build_input.triangleArray.indexBuffer = index_data.device_pointer; + build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); + build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; + build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); + build_input.triangleArray.flags = &build_flags; + // The SBT does not store per primitive data since Cycles already allocates separate + // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in + // one and rely on that having the same meaning in this case. + build_input.triangleArray.numSbtRecords = 1; + // Triangle primitives are packed right after the curve primitives of this mesh + build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments(); + + // Allocate memory for new BLAS and build it + blas.emplace_back(this, "blas"); + handles.emplace_back(); + if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back())) + return false; + } + + meshes.insert({mesh, handles}); + } + + // Fill instance descriptions + device_vector aabbs(this, "tlas_aabbs", MEM_READ_ONLY); + aabbs.alloc(bvh->objects.size() * 2); + device_vector instances(this, "tlas_instances", MEM_READ_ONLY); + instances.alloc(bvh->objects.size() * 2); + + for (Object *ob : bvh->objects) { + // Skip non-traceable objects + if (!ob->is_traceable()) + continue; + // Create separate instance for triangle/curve meshes of an object + for (OptixTraversableHandle handle : meshes[ob->mesh]) { + OptixAabb &aabb = aabbs[num_instances]; + aabb.minX = ob->bounds.min.x; + aabb.minY = ob->bounds.min.y; + aabb.minZ = ob->bounds.min.z; + aabb.maxX = ob->bounds.max.x; + aabb.maxY = ob->bounds.max.y; + aabb.maxZ = ob->bounds.max.z; + + OptixInstance &instance = instances[num_instances++]; + memset(&instance, 0, sizeof(instance)); + + // Clear transform to identity matrix + instance.transform[0] = 1.0f; + instance.transform[5] = 1.0f; + instance.transform[10] = 1.0f; + + // Set user instance ID to object index + instance.instanceId = ob->get_device_index(); + + // Volumes have a special bit set in the visibility mask so a trace can mask only volumes + // See 'scene_intersect_volume' in bvh.h + instance.visibilityMask = (ob->mesh->has_volume ? 
3 : 1); + + // Insert motion traversable if object has motion + if (ob->use_motion()) { + blas.emplace_back(this, "motion_transform"); + device_only_memory &motion_transform_gpu = blas.back(); + motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) + + (max(ob->motion.size(), 2) - 2) * + sizeof(OptixSRTData)); + + // Allocate host side memory for motion transform and fill it with transform data + OptixSRTMotionTransform &motion_transform = *reinterpret_cast( + motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]); + motion_transform.child = handle; + motion_transform.motionOptions.numKeys = ob->motion.size(); + motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; + motion_transform.motionOptions.timeBegin = 0.0f; + motion_transform.motionOptions.timeEnd = 1.0f; + + OptixSRTData *const srt_data = motion_transform.srtData; + array decomp(ob->motion.size()); + transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size()); + + for (size_t i = 0; i < ob->motion.size(); ++i) { + // scaling + srt_data[i].a = decomp[i].z.x; // scale.x.y + srt_data[i].b = decomp[i].z.y; // scale.x.z + srt_data[i].c = decomp[i].w.x; // scale.y.z + srt_data[i].sx = decomp[i].y.w; // scale.x.x + srt_data[i].sy = decomp[i].z.w; // scale.y.y + srt_data[i].sz = decomp[i].w.w; // scale.z.z + srt_data[i].pvx = 0; + srt_data[i].pvy = 0; + srt_data[i].pvz = 0; + // rotation + srt_data[i].qx = decomp[i].x.x; + srt_data[i].qy = decomp[i].x.y; + srt_data[i].qz = decomp[i].x.z; + srt_data[i].qw = decomp[i].x.w; + // transform + srt_data[i].tx = decomp[i].y.x; + srt_data[i].ty = decomp[i].y.y; + srt_data[i].tz = decomp[i].y.z; + } + + // Upload motion transform to GPU + mem_copy_to(motion_transform_gpu); + delete[] reinterpret_cast(motion_transform_gpu.host_pointer); + motion_transform_gpu.host_pointer = 0; + + // Disable instance transform if object uses motion transform already + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + + // Get traversable handle to motion transform + optixConvertPointerToTraversableHandle(context, + motion_transform_gpu.device_pointer, + OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, + &instance.traversableHandle); + } + else { + instance.traversableHandle = handle; + + if (ob->mesh->is_instanced()) { + // Set transform matrix + memcpy(instance.transform, &ob->tfm, sizeof(instance.transform)); + } + else { + // Disable instance transform if mesh already has it applied to vertex data + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + // Non-instanced objects read ID from prim_object, so + // distinguish them from instanced objects with high bit set + instance.instanceId |= 0x800000; + } + } + } + } + + // Upload instance descriptions + aabbs.resize(num_instances); + aabbs.copy_to_device(); + instances.resize(num_instances); + instances.copy_to_device(); + + // Build top-level acceleration structure + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; + build_input.instanceArray.instances = instances.device_pointer; + build_input.instanceArray.numInstances = num_instances; + build_input.instanceArray.aabbs = aabbs.device_pointer; + build_input.instanceArray.numAabbs = num_instances; + + return build_optix_bvh(build_input, 0 /* TLAS has no motion itself */, out_data, tlas_handle); + } + + void update_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void update_launch_params(const char *name, size_t offset, 
void *data, size_t data_size) + { + const CUDAContextScope scope(cuda_context); + + for (int i = 0; i < info.cpu_threads; ++i) + check_result_cuda( + cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, + data, + data_size)); + + // Set constant memory for CUDA module + // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert'). + // Could be removed by moving those functions to filter CUDA module. + size_t bytes = 0; + CUdeviceptr mem = 0; + check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name)); + assert(mem != NULL && bytes == data_size); + check_result_cuda(cuMemcpyHtoD(mem, data, data_size)); + } + + void mem_alloc(device_memory &mem) override + { + const CUDAContextScope scope(cuda_context); + + mem.device_size = mem.memory_size(); + + if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { + CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + + CUDA_TEXTURE_DESC tex_desc = {}; + tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES; + CUDA_RESOURCE_DESC res_desc = {}; + + switch (mem.extension) { + default: + assert(0); + case EXTENSION_REPEAT: + tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = + CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = + CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = + CU_TR_ADDRESS_MODE_BORDER; + break; + } + + switch (mem.interpolation) { + default: // Default to linear for unsupported interpolation types + case INTERPOLATION_LINEAR: + tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR; + break; + case INTERPOLATION_CLOSEST: + tex_desc.filterMode = CU_TR_FILTER_MODE_POINT; + break; + } + + CUarray_format format; + switch (mem.data_type) { + default: + assert(0); + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + } + + if (mem.data_depth > 1) { /* 3D texture using array. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + check_result_cuda(cuArray3DCreate(&cmem.array, &desc)); + mem.device_pointer = (device_ptr)cmem.array; + + res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + res_desc.res.array.hArray = cmem.array; + } + else if (mem.data_height > 0) { /* 2D texture using array. 
*/ + CUDA_ARRAY_DESCRIPTOR desc; + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Format = format; + desc.NumChannels = mem.data_elements; + + check_result_cuda(cuArrayCreate(&cmem.array, &desc)); + mem.device_pointer = (device_ptr)cmem.array; + + res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + res_desc.res.array.hArray = cmem.array; + } + else { + check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); + + res_desc.resType = CU_RESOURCE_TYPE_LINEAR; + res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer; + res_desc.res.linear.format = format; + res_desc.res.linear.numChannels = mem.data_elements; + res_desc.res.linear.sizeInBytes = mem.device_size; + } + + check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL)); + + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1); + } + + if (flat_slot >= texture_info.size()) + texture_info.resize(flat_slot + 128); + + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem.texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + // Texture information has changed and needs an update, delay this to next launch + need_texture_info = true; + } + else { + // This is not a texture but simple linear memory + check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); + + // Update data storage pointers in launch parameters +# define KERNEL_TEX(data_type, tex_name) \ + if (strcmp(mem.name, #tex_name) == 0) \ + update_launch_params( \ + mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr)); +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX + } + + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) override + { + if (!mem.host_pointer || mem.host_pointer == mem.shared_pointer) + return; + if (!mem.device_pointer) + mem_alloc(mem); // Need to allocate memory first if it does not exist yet + + const CUDAContextScope scope(cuda_context); + + if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { + const CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + + size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements; + + if (mem.data_depth > 1) { + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = cmem.array; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + check_result_cuda(cuMemcpy3D(&param)); + } + else if (mem.data_height > 0) { + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = cmem.array; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + check_result_cuda(cuMemcpy2D(&param)); + } + else { + check_result_cuda( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size)); + } + } + else { + // This is not a texture but simple linear memory + check_result_cuda( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer,
mem.host_pointer, mem.device_size)); + } + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + // Calculate linear memory offset and size + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.host_pointer && mem.device_pointer) { + const CUDAContextScope scope(cuda_context); + check_result_cuda(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else if (mem.host_pointer) { + memset((char *)mem.host_pointer + offset, 0, size); + } + } + + void mem_zero(device_memory &mem) override + { + if (mem.host_pointer) + memset(mem.host_pointer, 0, mem.memory_size()); + if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) + return; // This is shared host memory, so no device memory to update + + if (!mem.device_pointer) + mem_alloc(mem); // Need to allocate memory first if it does not exist yet + + const CUDAContextScope scope(cuda_context); + check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); + } + + void mem_free(device_memory &mem) override + { + assert(mem.device_pointer); + + const CUDAContextScope scope(cuda_context); + + if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { + CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + + if (cmem.array) + cuArrayDestroy(cmem.array); + else + cuMemFree((CUdeviceptr)mem.device_pointer); + + if (cmem.texobject) + cuTexObjectDestroy(cmem.texobject); + } + else { + // This is not a texture but simple linear memory + cuMemFree((CUdeviceptr)mem.device_pointer); + } + + stats.mem_free(mem.device_size); + + mem.device_size = 0; + mem.device_pointer = 0; + } + + void const_copy_to(const char *name, void *host, size_t size) override + { + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Fix traversable handle on multi devices + KernelData *const data = (KernelData *)host; + *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + + update_launch_params(name, offsetof(KernelParams, data), host, size); + } + } + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void task_add(DeviceTask &task) override + { + // Upload texture information to device if it has changed since last launch + update_texture_info(); + + // Split task into smaller ones + list tasks; + task.split(tasks, info.cpu_threads); + + // Queue tasks in internal task pool + struct OptiXDeviceTask : public DeviceTask { + OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task) + { + // Using task index parameter instead of thread index, since number of CUDA streams may + // differ from number of threads + run = function_bind(&OptiXDevice::thread_run, device, *this, task_index); + } + }; + + int task_index = 0; + for (DeviceTask &task : tasks) + task_pool.push(new OptiXDeviceTask(this, task, task_index++)); + } + + void task_wait() override + { + // Wait for all queued tasks to finish + task_pool.wait_work(); + } + + void task_cancel() override + { + // Cancel any remaining tasks in the internal pool + task_pool.cancel(); + } + +# define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads; \ + check_result_cuda_ret( \ + cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + threads = (int)sqrt((float)threads); \ + int xblocks = ((w) + threads - 1) / threads; \ + 
int yblocks = ((h) + threads - 1) / threads; + +# define CUDA_LAUNCH_KERNEL(func, args) \ + check_result_cuda_ret(cuLaunchKernel( \ + func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0)); + + /* Similar as above, but for 1-dimensional blocks. */ +# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads; \ + check_result_cuda_ret( \ + cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = h; + +# define CUDA_LAUNCH_KERNEL_1D(func, args) \ + check_result_cuda_ret(cuLaunchKernel( \ + func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; + int frame_offset = 0; + + CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; + CUdeviceptr scale_ptr = 0; + + check_result_cuda_ret( + cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index])); + check_result_cuda_ret( + cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index])); + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference")); + check_result_cuda_ret( + cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output")); + + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, + &variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &channel_offset, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, + &image_ptr, + &out_ptr, + &weightAccum, + &w, + &h, + &stride, + &pass_stride, + &channel_offset, + &r, + &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + { + CUfunction cuNLMNormalize; + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + } + + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task, int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterConstructTransform; + check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform, + cuda_filter_module, + "kernel_cuda_filter_construct_transform")); + check_result_cuda_ret( + cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + check_result_cuda_ret(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference")); + check_result_cuda_ret( + cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian")); + + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret( + cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * + task->reconstruction_state.source_h, + num_shifts); + + void 
*calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &pass_stride, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, + &h, + &stride, + &pass_stride, + &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + check_result_cuda_ret(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFinalize; + check_result_cuda_ret( + cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE( + cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterCombineHalves; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterDivideShadow; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &a_ptr, + 
&b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterGetFeature; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &scale, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterWriteFeature; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterDetectOutliers; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } +}; + +bool device_optix_init() +{ + if (g_optixFunctionTable.optixDeviceContextCreate != NULL) + return true; // Already initialized function table + + // Need to initialize CUDA as well + if (!device_cuda_init()) + return false; + +# ifdef WITH_CUDA_DYNLOAD + // Load NVRTC function pointers for adaptive kernel compilation + if (DebugFlags().cuda.adaptive_compile && cuewInit(CUEW_INIT_NVRTC) != CUEW_SUCCESS) { + VLOG(1) + << "CUEW initialization failed for NVRTC. 
Adaptive kernel compilation won't be available."; + } +# endif + + const OptixResult result = optixInit(); + + if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { + VLOG(1) + << "OptiX initialization failed because the installed driver does not support ABI version " + << OPTIX_ABI_VERSION; + return false; + } + else if (result != OPTIX_SUCCESS) { + VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + return false; + } + + // Loaded OptiX successfully! + return true; +} + +void device_optix_info(vector &devices) +{ + // Simply add all supported CUDA devices as OptiX devices again + vector cuda_devices; + device_cuda_info(cuda_devices); + + for (auto it = cuda_devices.begin(); it != cuda_devices.end();) { + DeviceInfo &info = *it; + assert(info.type == DEVICE_CUDA); + info.type = DEVICE_OPTIX; + info.id += "_OptiX"; + + // Figure out RTX support + CUdevice cuda_device = 0; + CUcontext cuda_context = NULL; + unsigned int rtcore_version = 0; + if (cuDeviceGet(&cuda_device, info.num) == CUDA_SUCCESS && + cuDevicePrimaryCtxRetain(&cuda_context, cuda_device) == CUDA_SUCCESS) { + OptixDeviceContext optix_context = NULL; + if (optixDeviceContextCreate(cuda_context, nullptr, &optix_context) == OPTIX_SUCCESS) { + optixDeviceContextGetProperty(optix_context, + OPTIX_DEVICE_PROPERTY_RTCORE_VERSION, + &rtcore_version, + sizeof(rtcore_version)); + optixDeviceContextDestroy(optix_context); + } + cuDevicePrimaryCtxRelease(cuda_device); + } + + // Only add devices with RTX support + if (rtcore_version == 0) + it = cuda_devices.erase(it); + else + ++it; + } + + devices.insert(devices.end(), cuda_devices.begin(), cuda_devices.end()); +} + +Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +{ + return new OptiXDevice(info, stats, profiler, background); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 41e57bb3e43..ea8aa197b6f 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -64,6 +64,10 @@ set(SRC_OPENCL_KERNELS kernels/opencl/filter.cl ) +set(SRC_OPTIX_KERNELS + kernels/optix/kernel_optix.cu +) + set(SRC_BVH_HEADERS bvh/bvh.h bvh/bvh_nodes.h @@ -95,6 +99,7 @@ set(SRC_HEADERS kernel_color.h kernel_compat_cpu.h kernel_compat_cuda.h + kernel_compat_optix.h kernel_compat_opencl.h kernel_differential.h kernel_emission.h @@ -140,6 +145,9 @@ set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_cuda_image.h ) +set(SRC_KERNELS_OPTIX_HEADERS +) + set(SRC_KERNELS_OPENCL_HEADERS kernels/opencl/kernel_split_function.h kernels/opencl/kernel_opencl_image.h @@ -168,7 +176,7 @@ set(SRC_CLOSURE_HEADERS closure/volume.h closure/bsdf_principled_diffuse.h closure/bsdf_principled_sheen.h - closure/bsdf_hair_principled.h + closure/bsdf_hair_principled.h ) set(SRC_SVM_HEADERS @@ -476,6 +484,53 @@ if(WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_cuda) endif() +# OptiX PTX modules + +if(WITH_CYCLES_DEVICE_OPTIX) + foreach(input ${SRC_OPTIX_KERNELS}) + get_filename_component(input_we ${input} NAME_WE) + + set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx") + set(cuda_flags + -I "${OPTIX_INCLUDE_DIR}" + -I "${CMAKE_CURRENT_SOURCE_DIR}/.." 
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda" + -arch=sm_30 + --use_fast_math + -o ${output}) + + if(WITH_CYCLES_DEBUG) + set(cuda_flags ${cuda_flags} + -D __KERNEL_DEBUG__) + endif() + + add_custom_command( + OUTPUT + ${output} + DEPENDS + ${input} + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPTIX_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + COMMAND + ${CUDA_NVCC_EXECUTABLE} --ptx ${cuda_flags} ${input} + WORKING_DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}") + + list(APPEND optix_ptx ${output}) + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) + endforeach() + + add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) + cycles_set_solution_folder(cycles_kernel_optix) +endif() + # OSL module if(WITH_CYCLES_OSL) @@ -535,10 +590,12 @@ endif() cycles_add_library(cycles_kernel "${LIB}" ${SRC_CPU_KERNELS} ${SRC_CUDA_KERNELS} + ${SRC_OPTIX_KERNELS} ${SRC_OPENCL_KERNELS} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPTIX_HEADERS} ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} @@ -548,9 +605,24 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_SPLIT_HEADERS} ) +source_group("bvh" FILES ${SRC_BVH_HEADERS}) +source_group("closure" FILES ${SRC_CLOSURE_HEADERS}) +source_group("filter" FILES ${SRC_FILTER_HEADERS}) +source_group("geom" FILES ${SRC_GEOM_HEADERS}) +source_group("kernel" FILES ${SRC_HEADERS}) +source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS}) +source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS}) +source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS}) +source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS}) +source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS}) +source_group("svm" FILES ${SRC_SVM_HEADERS}) + if(WITH_CYCLES_CUDA) add_dependencies(cycles_kernel cycles_kernel_cuda) endif() +if(WITH_CYCLES_DEVICE_OPTIX) + add_dependencies(cycles_kernel cycles_kernel_optix) +endif() # OpenCL kernel @@ -564,9 +636,11 @@ endif() delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 9be078b6fca..cffe2bfa70a 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ 
-1139,9 +1139,9 @@ int Mesh::motion_step(float time) const return -1; } -bool Mesh::need_build_bvh(BVHLayout) const +bool Mesh::need_build_bvh(BVHLayout layout) const { - return !transform_applied || has_surface_bssrdf; + return !transform_applied || has_surface_bssrdf || layout == BVH_LAYOUT_OPTIX; } bool Mesh::is_instanced() const diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index e52fda26845..3ce65802cff 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -86,6 +86,16 @@ void DebugFlags::CUDA::reset() split_kernel = false; } +DebugFlags::OptiX::OptiX() +{ + reset(); +} + +void DebugFlags::OptiX::reset() +{ + cuda_streams = 1; +} + DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false) { reset(); @@ -130,6 +140,7 @@ void DebugFlags::reset() viewport_static_bvh = false; cpu.reset(); cuda.reset(); + optix.reset(); opencl.reset(); } @@ -145,7 +156,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" - << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; + << " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; + + os << "OptiX flags:\n" + << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n"; const char *opencl_device_type; switch (debug_flags.opencl.device_type) { diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 5b0004ea768..cf6b442b878 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -99,6 +99,17 @@ class DebugFlags { bool split_kernel; }; + /* Descriptor of OptiX feature-set to be used. */ + struct OptiX { + OptiX(); + + /* Reset flags to their defaults. */ + void reset(); + + /* Number of CUDA streams to launch kernels concurrently from. */ + int cuda_streams; + }; + /* Descriptor of OpenCL feature-set to be used. */ struct OpenCL { OpenCL(); @@ -165,6 +176,9 @@ class DebugFlags { /* Requested CUDA flags. */ CUDA cuda; + /* Requested OptiX flags. */ + OptiX optix; + /* Requested OpenCL flags. */ OpenCL opencl; -- cgit v1.2.3
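For reference, a minimal host-side sketch of how the entry points added in device_optix.cpp fit together. This is illustrative only and not part of the patch; it assumes Stats and Profiler can be default-constructed and that DebugFlags() returns the writable debug-flags singleton, as it is used elsewhere in Cycles:

  if (device_optix_init()) {
    /* Lists only those CUDA devices that report a non-zero RTX core version. */
    vector<DeviceInfo> devices;
    device_optix_info(devices);

    if (!devices.empty()) {
      /* Optional: request that kernels be launched from more than one CUDA stream. */
      DebugFlags().optix.cuda_streams = 2;

      Stats stats;
      Profiler profiler;
      Device *device = device_optix_create(devices[0], stats, profiler, true /* background */);
      /* ... render, then ... */
      delete device;
    }
  }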