git.blender.org/blender.git
-rw-r--r--  CMakeLists.txt                                  |    1
-rw-r--r--  build_files/buildbot/slave_compile.py           |    3
-rw-r--r--  build_files/cmake/Modules/FindOptiX.cmake       |   57
-rw-r--r--  build_files/cmake/config/blender_lite.cmake     |    1
-rw-r--r--  build_files/cmake/config/blender_release.cmake  |    1
-rw-r--r--  intern/cycles/CMakeLists.txt                    |   18
-rw-r--r--  intern/cycles/blender/addon/properties.py       |   46
-rw-r--r--  intern/cycles/blender/addon/ui.py               |   23
-rw-r--r--  intern/cycles/blender/blender_device.cpp        |    7
-rw-r--r--  intern/cycles/blender/blender_python.cpp        |   10
-rw-r--r--  intern/cycles/blender/blender_sync.cpp          |    2
-rw-r--r--  intern/cycles/bvh/CMakeLists.txt                |    2
-rw-r--r--  intern/cycles/bvh/bvh.cpp                       |   11
-rw-r--r--  intern/cycles/bvh/bvh_optix.cpp                 |  215
-rw-r--r--  intern/cycles/bvh/bvh_optix.h                   |   53
-rw-r--r--  intern/cycles/device/CMakeLists.txt             |    4
-rw-r--r--  intern/cycles/device/device.cpp                 |   30
-rw-r--r--  intern/cycles/device/device.h                   |   18
-rw-r--r--  intern/cycles/device/device_cuda.cpp            |    5
-rw-r--r--  intern/cycles/device/device_intern.h            |    4
-rw-r--r--  intern/cycles/device/device_multi.cpp           |   18
-rw-r--r--  intern/cycles/device/device_optix.cpp           | 1969
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt             |   76
-rw-r--r--  intern/cycles/render/mesh.cpp                   |    4
-rw-r--r--  intern/cycles/util/util_debug.cpp               |   16
-rw-r--r--  intern/cycles/util/util_debug.h                 |   14
26 files changed, 2574 insertions(+), 34 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad705821596..258e79b7d4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -426,6 +426,7 @@ mark_as_advanced(WITH_CYCLES_DEBUG)
mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
+option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" OFF)
option(WITH_CYCLES_DEVICE_OPENCL "Enable Cycles OpenCL compute support" ON)
option(WITH_CYCLES_NETWORK "Enable Cycles compute over network support (EXPERIMENTAL and unfinished)" OFF)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
diff --git a/build_files/buildbot/slave_compile.py b/build_files/buildbot/slave_compile.py
index 84667e663f6..0da0ead819f 100644
--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -34,6 +34,9 @@ def get_cmake_options(builder):
elif builder.platform == 'linux':
config_file = "build_files/buildbot/config/blender_linux.cmake"
+ optix_sdk_dir = os.path.join(builder.blender_dir, '..', '..', 'NVIDIA-Optix-SDK')
+ options.append('-DOPTIX_ROOT_DIR:PATH=' + optix_sdk_dir)
+
options.append("-C" + os.path.join(builder.blender_dir, config_file))
options.append("-DCMAKE_INSTALL_PREFIX=%s" % (builder.install_dir))
diff --git a/build_files/cmake/Modules/FindOptiX.cmake b/build_files/cmake/Modules/FindOptiX.cmake
new file mode 100644
index 00000000000..56fd2fd1396
--- /dev/null
+++ b/build_files/cmake/Modules/FindOptiX.cmake
@@ -0,0 +1,57 @@
+# - Find OptiX library
+# Find the native OptiX includes and library
+# This module defines
+# OPTIX_INCLUDE_DIRS, where to find optix.h, Set when
+# OPTIX_INCLUDE_DIR is found.
+# OPTIX_ROOT_DIR, The base directory to search for OptiX.
+# This can also be an environment variable.
+# OPTIX_FOUND, If false, do not try to use OptiX.
+
+#=============================================================================
+# Copyright 2019 Blender Foundation.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+
+# If OPTIX_ROOT_DIR was defined in the environment, use it.
+IF(NOT OPTIX_ROOT_DIR AND NOT $ENV{OPTIX_ROOT_DIR} STREQUAL "")
+ SET(OPTIX_ROOT_DIR $ENV{OPTIX_ROOT_DIR})
+ENDIF()
+
+SET(_optix_SEARCH_DIRS
+ ${OPTIX_ROOT_DIR}
+ "$ENV{PROGRAMDATA}/NVIDIA Corporation/OptiX SDK 7.0.0"
+ /usr/local
+ /sw # Fink
+ /opt/local # DarwinPorts
+)
+
+FIND_PATH(OPTIX_INCLUDE_DIR
+ NAMES
+ optix.h
+ HINTS
+ ${_optix_SEARCH_DIRS}
+ PATH_SUFFIXES
+ include
+)
+
+# handle the QUIETLY and REQUIRED arguments and set OPTIX_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OptiX DEFAULT_MSG
+ OPTIX_INCLUDE_DIR)
+
+IF(OPTIX_FOUND)
+ SET(OPTIX_INCLUDE_DIRS ${OPTIX_INCLUDE_DIR})
+ENDIF(OPTIX_FOUND)
+
+MARK_AS_ADVANCED(
+ OPTIX_INCLUDE_DIR
+)
+
+UNSET(_optix_SEARCH_DIRS)
diff --git a/build_files/cmake/config/blender_lite.cmake b/build_files/cmake/config/blender_lite.cmake
index e98f4f098bb..37cbfa27972 100644
--- a/build_files/cmake/config/blender_lite.cmake
+++ b/build_files/cmake/config/blender_lite.cmake
@@ -17,6 +17,7 @@ set(WITH_CODEC_FFMPEG OFF CACHE BOOL "" FORCE)
set(WITH_CODEC_SNDFILE OFF CACHE BOOL "" FORCE)
set(WITH_CYCLES OFF CACHE BOOL "" FORCE)
set(WITH_CYCLES_OSL OFF CACHE BOOL "" FORCE)
+set(WITH_CYCLES_DEVICE_OPTIX OFF CACHE BOOL "" FORCE)
set(WITH_DRACO OFF CACHE BOOL "" FORCE)
set(WITH_FFTW3 OFF CACHE BOOL "" FORCE)
set(WITH_LIBMV OFF CACHE BOOL "" FORCE)
diff --git a/build_files/cmake/config/blender_release.cmake b/build_files/cmake/config/blender_release.cmake
index 2d7b167764b..cb338f40a7b 100644
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -57,6 +57,7 @@ set(WITH_MEM_JEMALLOC ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
set(CYCLES_CUDA_BINARIES_ARCH sm_30;sm_35;sm_37;sm_50;sm_52;sm_60;sm_61;sm_70;sm_75 CACHE STRING "" FORCE)
+set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE)
# platform dependent options
if(UNIX AND NOT APPLE)
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 6a3ebd85378..25e8e124885 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -219,6 +219,24 @@ if(WITH_CYCLES_OSL)
)
endif()
+if(WITH_CYCLES_DEVICE_OPTIX)
+ find_package(OptiX)
+
+ if(OPTIX_FOUND)
+ add_definitions(-DWITH_OPTIX)
+ include_directories(
+ SYSTEM
+ ${OPTIX_INCLUDE_DIR}
+ )
+
+ # Need pre-compiled CUDA binaries in the OptiX device
+ set(WITH_CYCLES_CUDA_BINARIES ON)
+ else()
+ message(STATUS "Optix not found, disabling it from Cycles")
+ set(WITH_CYCLES_DEVICE_OPTIX OFF)
+ endif()
+endif()
+
if(WITH_CYCLES_EMBREE)
add_definitions(-DWITH_EMBREE)
add_definitions(-DEMBREE_STATIC_LIB)
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 93f8f76cd6a..8623b38a271 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -137,6 +137,7 @@ enum_world_mis = (
enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
+ ('OPTIX', "OptiX", "OptiX", 3),
('OPENCL', "OpenCL", "OpenCL", 2)
)
@@ -740,6 +741,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False)
debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
+ debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
+
debug_opencl_kernel_type: EnumProperty(
name="OpenCL Kernel Type",
default='DEFAULT',
@@ -1400,10 +1403,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_opencl = _cycles.get_device_types()
+ has_cuda, has_optix, has_opencl = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
+ if has_optix:
+ list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
if has_opencl:
list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2))
return list
@@ -1424,7 +1429,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPENCL', 'CPU'}:
+ if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1439,8 +1444,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
# Update name in case it changed
entry.name = device[0]
- # Gets all devices types by default.
- def get_devices(self, compute_device_type=''):
+ # Gets all devices types for a compute device type.
+ def get_devices_for_type(self, compute_device_type):
import _cycles
# Layout of the device tuples: (Name, Type, Persistent ID)
device_list = _cycles.available_devices(compute_device_type)
@@ -1449,20 +1454,23 @@ class CyclesPreferences(bpy.types.AddonPreferences):
# hold pointers to a resized array.
self.update_device_entries(device_list)
# Sort entries into lists
- cuda_devices = []
- opencl_devices = []
+ devices = []
cpu_devices = []
for device in device_list:
entry = self.find_existing_device_entry(device)
- if entry.type == 'CUDA':
- cuda_devices.append(entry)
- elif entry.type == 'OPENCL':
- opencl_devices.append(entry)
+ if entry.type == compute_device_type:
+ devices.append(entry)
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
- cuda_devices.extend(cpu_devices)
- opencl_devices.extend(cpu_devices)
+ if compute_device_type in ('CUDA', 'OPENCL'):
+ devices.extend(cpu_devices)
+ return devices
+
+ # For backwards compatibility, only has CUDA and OpenCL.
+ def get_devices(self, compute_device_type=''):
+ cuda_devices = self.get_devices_for_type('CUDA')
+ opencl_devices = self.get_devices_for_type('OPENCL')
return cuda_devices, opencl_devices
def get_num_gpu_devices(self):
@@ -1498,16 +1506,24 @@ class CyclesPreferences(bpy.types.AddonPreferences):
for device in devices:
box.prop(device, "use", text=device.name)
+ if device_type == 'OPTIX':
+ col = box.column(align=True)
+ col.label(text="OptiX support is experimental", icon='INFO')
+ col.label(text="Not all Cycles features are supported yet", icon='BLANK1')
+
+
def draw_impl(self, layout, context):
row = layout.row()
row.prop(self, "compute_device_type", expand=True)
- cuda_devices, opencl_devices = self.get_devices(self.compute_device_type)
+ devices = self.get_devices_for_type(self.compute_device_type)
row = layout.row()
if self.compute_device_type == 'CUDA':
- self._draw_devices(row, 'CUDA', cuda_devices)
+ self._draw_devices(row, 'CUDA', devices)
+ elif self.compute_device_type == 'OPTIX':
+ self._draw_devices(row, 'OPTIX', devices)
elif self.compute_device_type == 'OPENCL':
- self._draw_devices(row, 'OPENCL', opencl_devices)
+ self._draw_devices(row, 'OPENCL', devices)
def draw(self, context):
self.draw_impl(self.layout, context)
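Usage note: a minimal sketch of querying the new per-type device list from a script; not part of this commit, and the 'cycles' add-on key is an assumption about how the add-on is registered:

    import bpy

    prefs = bpy.context.preferences.addons['cycles'].preferences
    # get_devices_for_type() returns only entries of the requested type;
    # CPU entries are appended for CUDA/OpenCL but not for OptiX (see above).
    for entry in prefs.get_devices_for_type('OPTIX'):
        print(entry.name, entry.type, entry.use)
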
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 200b08f93cb..44ed28e9e02 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -88,10 +88,16 @@ def use_cuda(context):
return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
+def use_optix(context):
+ cscene = context.scene.cycles
+
+ return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
+
+
def use_branched_path(context):
cscene = context.scene.cycles
- return (cscene.progressive == 'BRANCHED_PATH')
+ return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context))
def use_sample_all_lights(context):
@@ -168,7 +174,8 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
- layout.prop(cscene, "progressive")
+ if not use_optix(context):
+ layout.prop(cscene, "progressive")
if cscene.progressive == 'PATH' or use_branched_path(context) is False:
col = layout.column(align=True)
@@ -1763,6 +1770,10 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
bl_options = {'DEFAULT_CLOSED'}
COMPAT_ENGINES = {'CYCLES'}
+ @classmethod
+ def poll(cls, context):
+ return not use_optix(context)
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -1947,7 +1958,13 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
col.separator()
col = layout.column()
- col.label(text='OpenCL Flags:')
+ col.label(text="OptiX Flags:")
+ col.prop(cscene, "debug_optix_cuda_streams")
+
+ col.separator()
+
+ col = layout.column()
+ col.label(text="OpenCL Flags:")
col.prop(cscene, "debug_opencl_device_type", text="Device")
col.prop(cscene, "debug_use_opencl_debug", text="Debug")
col.prop(cscene, "debug_opencl_mem_limit")
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 98fc0c6dec4..111fc8d5192 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -61,7 +61,8 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
COMPUTE_DEVICE_CPU = 0,
COMPUTE_DEVICE_CUDA = 1,
COMPUTE_DEVICE_OPENCL = 2,
- COMPUTE_DEVICE_NUM = 3,
+ COMPUTE_DEVICE_OPTIX = 3,
+ COMPUTE_DEVICE_NUM = 4,
};
ComputeDevice compute_device = (ComputeDevice)get_enum(
@@ -73,6 +74,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
if (compute_device == COMPUTE_DEVICE_CUDA) {
mask |= DEVICE_MASK_CUDA;
}
+ else if (compute_device == COMPUTE_DEVICE_OPTIX) {
+ /* Cannot use CPU and OptiX device at the same time right now, so replace mask. */
+ mask = DEVICE_MASK_OPTIX;
+ }
else if (compute_device == COMPUTE_DEVICE_OPENCL) {
mask |= DEVICE_MASK_OPENCL;
}
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 2bea6b34772..335d4daf09c 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -81,6 +81,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
+ /* Synchronize OptiX flags. */
+ flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
/* Synchronize OpenCL device type. */
switch (get_enum(cscene, "debug_opencl_device_type")) {
case 0:
@@ -960,14 +962,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_opencl = false;
+ bool has_cuda = false, has_optix = false, has_opencl = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
+ has_optix |= (device_type == DEVICE_OPTIX);
has_opencl |= (device_type == DEVICE_OPENCL);
}
- PyObject *list = PyTuple_New(2);
+ PyObject *list = PyTuple_New(3);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
- PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_opencl));
+ PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
+ PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl));
return list;
}
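Usage note: callers of _cycles.get_device_types() now receive a three-element tuple; a minimal sketch of the updated unpacking (illustrative only):

    import _cycles

    # Tuple layout matches get_device_types_func() above: (CUDA, OptiX, OpenCL).
    has_cuda, has_optix, has_opencl = _cycles.get_device_types()
    if has_optix:
        print("OptiX device type is available")
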
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 8b7c66363d9..1a166d171bc 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -758,7 +758,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
preview_samples = preview_samples * preview_samples;
}
- if (get_enum(cscene, "progressive") == 0) {
+ if (get_enum(cscene, "progressive") == 0 && (params.device.type != DEVICE_OPTIX)) {
if (background) {
params.samples = aa_samples;
}
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 36bbd937e1a..27a7f604e1c 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRC
bvh_build.cpp
bvh_embree.cpp
bvh_node.cpp
+ bvh_optix.cpp
bvh_sort.cpp
bvh_split.cpp
bvh_unaligned.cpp
@@ -29,6 +30,7 @@ set(SRC_HEADERS
bvh_build.h
bvh_embree.h
bvh_node.h
+ bvh_optix.h
bvh_params.h
bvh_sort.h
bvh_split.h
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index b6a4aba74b5..16c721da06a 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -26,6 +26,9 @@
#include "bvh/bvh_build.h"
#include "bvh/bvh_node.h"
+#ifdef WITH_OPTIX
+# include "bvh/bvh_optix.h"
+#endif
#ifdef WITH_EMBREE
# include "bvh/bvh_embree.h"
#endif
@@ -51,6 +54,8 @@ const char *bvh_layout_name(BVHLayout layout)
return "NONE";
case BVH_LAYOUT_EMBREE:
return "EMBREE";
+ case BVH_LAYOUT_OPTIX:
+ return "OPTIX";
case BVH_LAYOUT_ALL:
return "ALL";
}
@@ -116,6 +121,12 @@ BVH *BVH::create(const BVHParams &params,
#else
break;
#endif
+ case BVH_LAYOUT_OPTIX:
+#ifdef WITH_OPTIX
+ return new BVHOptiX(params, meshes, objects);
+#else
+ break;
+#endif
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;
diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp
new file mode 100644
index 00000000000..b3a9aab3266
--- /dev/null
+++ b/intern/cycles/bvh/bvh_optix.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "bvh/bvh_optix.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "util/util_logging.h"
+# include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVHOptiX::BVHOptiX(const BVHParams &params_,
+ const vector<Mesh *> &meshes_,
+ const vector<Object *> &objects_)
+ : BVH(params_, meshes_, objects_)
+{
+}
+
+BVHOptiX::~BVHOptiX()
+{
+}
+
+void BVHOptiX::build(Progress &, Stats *)
+{
+ if (params.top_level)
+ pack_tlas();
+ else
+ pack_blas();
+}
+
+void BVHOptiX::copy_to_device(Progress &progress, DeviceScene *dscene)
+{
+ progress.set_status("Updating Scene BVH", "Building OptiX acceleration structure");
+
+ Device *const device = dscene->bvh_nodes.device;
+ if (!device->build_optix_bvh(this, dscene->bvh_nodes))
+ progress.set_error("Failed to build OptiX acceleration structure");
+}
+
+void BVHOptiX::pack_blas()
+{
+ // Bottom-level BVH can contain multiple primitive types, so merge them:
+ assert(meshes.size() == 1 && objects.size() == 1); // These are build per-mesh
+ Mesh *const mesh = meshes[0];
+
+ if (params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
+ const size_t num_curves = mesh->num_curves();
+ const size_t num_segments = mesh->num_segments();
+ pack.prim_type.reserve(pack.prim_type.size() + num_segments);
+ pack.prim_index.reserve(pack.prim_index.size() + num_segments);
+ pack.prim_object.reserve(pack.prim_object.size() + num_segments);
+ // 'pack.prim_time' is only used in geom_curve_intersect.h
+ // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH
+
+ uint type = PRIMITIVE_CURVE;
+ if (mesh->use_motion_blur && mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
+ type = PRIMITIVE_MOTION_CURVE;
+
+ for (size_t j = 0; j < num_curves; ++j) {
+ const Mesh::Curve curve = mesh->get_curve(j);
+ for (size_t k = 0; k < curve.num_segments(); ++k) {
+ pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(type, k));
+ // Each curve segment points back to its curve index
+ pack.prim_index.push_back_reserved(j);
+ pack.prim_object.push_back_reserved(0);
+ }
+ }
+ }
+
+ if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
+ const size_t num_triangles = mesh->num_triangles();
+ pack.prim_type.reserve(pack.prim_type.size() + num_triangles);
+ pack.prim_index.reserve(pack.prim_index.size() + num_triangles);
+ pack.prim_object.reserve(pack.prim_object.size() + num_triangles);
+
+ uint type = PRIMITIVE_TRIANGLE;
+ if (mesh->use_motion_blur && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
+ type = PRIMITIVE_MOTION_TRIANGLE;
+
+ for (size_t k = 0; k < num_triangles; ++k) {
+ pack.prim_type.push_back_reserved(type);
+ pack.prim_index.push_back_reserved(k);
+ pack.prim_object.push_back_reserved(0);
+ }
+ }
+
+ // Initialize visibility to zero and later update it during top-level build
+ uint prev_visibility = objects[0]->visibility;
+ objects[0]->visibility = 0;
+
+ // Update 'pack.prim_tri_index', 'pack.prim_tri_verts' and 'pack.prim_visibility'
+ pack_primitives();
+
+ // Reset visibility after packing
+ objects[0]->visibility = prev_visibility;
+}
+
+void BVHOptiX::pack_tlas()
+{
+ // Calculate total packed size
+ size_t prim_index_size = 0;
+ size_t prim_tri_verts_size = 0;
+ foreach (Mesh *mesh, meshes) {
+ BVH *const bvh = mesh->bvh;
+ prim_index_size += bvh->pack.prim_index.size();
+ prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
+ }
+
+ if (prim_index_size == 0)
+ return; // Abort right away if this is an empty BVH
+
+ size_t pack_offset = 0;
+ size_t pack_verts_offset = 0;
+
+ pack.prim_type.resize(prim_index_size);
+ int *pack_prim_type = pack.prim_type.data();
+ pack.prim_index.resize(prim_index_size);
+ int *pack_prim_index = pack.prim_index.data();
+ pack.prim_object.resize(prim_index_size);
+ int *pack_prim_object = pack.prim_object.data();
+ pack.prim_visibility.resize(prim_index_size);
+ uint *pack_prim_visibility = pack.prim_visibility.data();
+ pack.prim_tri_index.resize(prim_index_size);
+ uint *pack_prim_tri_index = pack.prim_tri_index.data();
+ pack.prim_tri_verts.resize(prim_tri_verts_size);
+ float4 *pack_prim_tri_verts = pack.prim_tri_verts.data();
+
+ // Top-level BVH should only contain instances, see 'Mesh::need_build_bvh'
+ // Iterate over scene mesh list instead of objects, since the 'prim_offset' is calculated based
+ // on that list, which may be ordered differently from the object list.
+ foreach (Mesh *mesh, meshes) {
+ PackedBVH &bvh_pack = mesh->bvh->pack;
+ int mesh_tri_offset = mesh->tri_offset;
+ int mesh_curve_offset = mesh->curve_offset;
+
+ // Merge primitive, object and triangle indexes
+ if (!bvh_pack.prim_index.empty()) {
+ int *bvh_prim_type = &bvh_pack.prim_type[0];
+ int *bvh_prim_index = &bvh_pack.prim_index[0];
+ uint *bvh_prim_tri_index = &bvh_pack.prim_tri_index[0];
+ uint *bvh_prim_visibility = &bvh_pack.prim_visibility[0];
+
+ for (size_t i = 0; i < bvh_pack.prim_index.size(); i++, pack_offset++) {
+ if (bvh_pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
+ pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_curve_offset;
+ pack_prim_tri_index[pack_offset] = -1;
+ }
+ else {
+ pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_tri_offset;
+ pack_prim_tri_index[pack_offset] = bvh_prim_tri_index[i] + pack_verts_offset;
+ }
+
+ pack_prim_type[pack_offset] = bvh_prim_type[i];
+ pack_prim_object[pack_offset] = 0; // Unused for instanced meshes
+ pack_prim_visibility[pack_offset] = bvh_prim_visibility[i];
+ }
+ }
+
+ // Merge triangle vertex data
+ if (!bvh_pack.prim_tri_verts.empty()) {
+ const size_t prim_tri_size = bvh_pack.prim_tri_verts.size();
+ memcpy(pack_prim_tri_verts + pack_verts_offset,
+ bvh_pack.prim_tri_verts.data(),
+ prim_tri_size * sizeof(float4));
+ pack_verts_offset += prim_tri_size;
+ }
+ }
+
+ // Merge visibility flags of all objects and fix object indices for non-instanced meshes
+ foreach (Object *ob, objects) {
+ Mesh *const mesh = ob->mesh;
+ for (size_t i = 0; i < mesh->num_primitives(); ++i) {
+ if (!ob->mesh->is_instanced()) {
+ assert(pack.prim_object[mesh->prim_offset + i] == 0);
+ pack.prim_object[mesh->prim_offset + i] = ob->get_device_index();
+ }
+ pack.prim_visibility[mesh->prim_offset + i] |= ob->visibility_for_tracing();
+ }
+ }
+}
+
+void BVHOptiX::pack_nodes(const BVHNode *)
+{
+}
+
+void BVHOptiX::refit_nodes()
+{
+ // TODO(pmours): Implement?
+ VLOG(1) << "Refit is not yet implemented for OptiX BVH.";
+}
+
+BVHNode *BVHOptiX::widen_children_nodes(const BVHNode *)
+{
+ return NULL;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/bvh/bvh_optix.h b/intern/cycles/bvh/bvh_optix.h
new file mode 100644
index 00000000000..35033fe635f
--- /dev/null
+++ b/intern/cycles/bvh/bvh_optix.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_OPTIX_H__
+#define __BVH_OPTIX_H__
+
+#ifdef WITH_OPTIX
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_params.h"
+# include "device/device_memory.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHOptiX : public BVH {
+ friend class BVH;
+
+ public:
+ BVHOptiX(const BVHParams &params, const vector<Mesh *> &meshes, const vector<Object *> &objects);
+ virtual ~BVHOptiX();
+
+ virtual void build(Progress &progress, Stats *) override;
+ virtual void copy_to_device(Progress &progress, DeviceScene *dscene) override;
+
+ private:
+ void pack_blas();
+ void pack_tlas();
+
+ virtual void pack_nodes(const BVHNode *) override;
+ virtual void refit_nodes() override;
+
+ virtual BVHNode *widen_children_nodes(const BVHNode *) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
+
+#endif /* __BVH_OPTIX_H__ */
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 3e14480e2ad..a8c4949ad07 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -29,6 +29,7 @@ set(SRC
device_memory.cpp
device_multi.cpp
device_opencl.cpp
+ device_optix.cpp
device_split_kernel.cpp
device_task.cpp
)
@@ -85,6 +86,9 @@ endif()
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
+if(WITH_CYCLES_DEVICE_OPTIX)
+ add_definitions(-DWITH_OPTIX)
+endif()
if(WITH_CYCLES_DEVICE_MULTI)
add_definitions(-DWITH_MULTI)
endif()
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 47d111802cd..fe8a814cd14 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -38,6 +38,7 @@ bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
+vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
@@ -379,6 +380,14 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
device = NULL;
break;
#endif
+#ifdef WITH_OPTIX
+ case DEVICE_OPTIX:
+ if (device_optix_init())
+ device = device_optix_create(info, stats, profiler, background);
+ else
+ device = NULL;
+ break;
+#endif
#ifdef WITH_MULTI
case DEVICE_MULTI:
device = device_multi_create(info, stats, profiler, background);
@@ -410,6 +419,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CPU;
else if (strcmp(name, "CUDA") == 0)
return DEVICE_CUDA;
+ else if (strcmp(name, "OPTIX") == 0)
+ return DEVICE_OPTIX;
else if (strcmp(name, "OPENCL") == 0)
return DEVICE_OPENCL;
else if (strcmp(name, "NETWORK") == 0)
@@ -426,6 +437,8 @@ string Device::string_from_type(DeviceType type)
return "CPU";
else if (type == DEVICE_CUDA)
return "CUDA";
+ else if (type == DEVICE_OPTIX)
+ return "OPTIX";
else if (type == DEVICE_OPENCL)
return "OPENCL";
else if (type == DEVICE_NETWORK)
@@ -443,6 +456,9 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_CUDA
types.push_back(DEVICE_CUDA);
#endif
+#ifdef WITH_OPTIX
+ types.push_back(DEVICE_OPTIX);
+#endif
#ifdef WITH_OPENCL
types.push_back(DEVICE_OPENCL);
#endif
@@ -488,6 +504,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
#endif
+#ifdef WITH_OPTIX
+ if (mask & DEVICE_MASK_OPTIX) {
+ if (!(devices_initialized_mask & DEVICE_MASK_OPTIX)) {
+ if (device_optix_init()) {
+ device_optix_info(optix_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_OPTIX;
+ }
+ foreach (DeviceInfo &info, optix_devices) {
+ devices.push_back(info);
+ }
+ }
+#endif
+
if (mask & DEVICE_MASK_CPU) {
if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
device_cpu_info(cpu_devices);
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 15a0ceb4a19..672d93c2581 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -34,6 +34,7 @@
CCL_NAMESPACE_BEGIN
+class BVH;
class Progress;
class RenderTile;
@@ -45,13 +46,15 @@ enum DeviceType {
DEVICE_OPENCL,
DEVICE_CUDA,
DEVICE_NETWORK,
- DEVICE_MULTI
+ DEVICE_MULTI,
+ DEVICE_OPTIX,
};
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
+ DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
@@ -380,7 +383,11 @@ class Device {
}
/* tasks */
- virtual int get_split_task_count(DeviceTask &task) = 0;
+ virtual int get_split_task_count(DeviceTask &)
+ {
+ return 1;
+ }
+
virtual void task_add(DeviceTask &task) = 0;
virtual void task_wait() = 0;
virtual void task_cancel() = 0;
@@ -399,6 +406,12 @@ class Device {
bool transparent,
const DeviceDrawParams &draw_params);
+ /* acceleration structure building */
+ virtual bool build_optix_bvh(BVH *, device_memory &)
+ {
+ return false;
+ }
+
#ifdef WITH_NETWORK
/* networking */
void server_run();
@@ -456,6 +469,7 @@ class Device {
static bool need_types_update, need_devices_update;
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
+ static vector<DeviceInfo> optix_devices;
static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
static vector<DeviceInfo> network_devices;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 4d7d87828c2..00dd37f089c 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -2263,11 +2263,6 @@ class CUDADevice : public Device {
}
};
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
void task_add(DeviceTask &task)
{
CUDAContextScope scope(this);
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index c393a3f9cda..5b8b86886c4 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -27,6 +27,9 @@ Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler,
bool device_opencl_compile_kernel(const vector<string> &parameters);
bool device_cuda_init();
Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+bool device_optix_init();
+Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+
Device *device_network_create(DeviceInfo &info,
Stats &stats,
Profiler &profiler,
@@ -36,6 +39,7 @@ Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler,
void device_cpu_info(vector<DeviceInfo> &devices);
void device_opencl_info(vector<DeviceInfo> &devices);
void device_cuda_info(vector<DeviceInfo> &devices);
+void device_optix_info(vector<DeviceInfo> &devices);
void device_network_info(vector<DeviceInfo> &devices);
string device_cpu_capabilities();
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 4a40e106115..ac71be9dbea 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -153,6 +153,24 @@ class MultiDevice : public Device {
return result;
}
+ bool build_optix_bvh(BVH *bvh, device_memory &mem)
+ {
+ device_ptr key = unique_key++;
+
+ // Broadcast acceleration structure build to all devices
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ if (!sub.device->build_optix_bvh(bvh, mem))
+ return false;
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ return true;
+ }
+
void mem_alloc(device_memory &mem)
{
device_ptr key = unique_key++;
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
new file mode 100644
index 00000000000..84d7ecf6934
--- /dev/null
+++ b/intern/cycles/device/device_optix.cpp
@@ -0,0 +1,1969 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/device.h"
+# include "device/device_intern.h"
+# include "device/device_denoising.h"
+# include "bvh/bvh.h"
+# include "render/scene.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/buffers.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_time.h"
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+
+# undef _WIN32_WINNT // Need minimum API support for Windows 7
+# define _WIN32_WINNT _WIN32_WINNT_WIN7
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+// Do not use CUDA SDK headers when using CUEW
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+# include <optix_stubs.h>
+# include <optix_function_table_definition.h>
+
+CCL_NAMESPACE_BEGIN
+
+/* Make sure this stays in sync with kernel_globals.h */
+struct ShaderParams {
+ uint4 *input;
+ float4 *output;
+ int type;
+ int filter;
+ int sx;
+ int offset;
+ int sample;
+};
+struct KernelParams {
+ WorkTile tile;
+ KernelData data;
+ ShaderParams shader;
+# define KERNEL_TEX(type, name) const type *name;
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+};
+
+# define check_result_cuda(stmt) \
+ { \
+ CUresult res = stmt; \
+ if (res != CUDA_SUCCESS) { \
+ const char *name; \
+ cuGetErrorName(res, &name); \
+ set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
+ return; \
+ } \
+ } \
+ (void)0
+# define check_result_cuda_ret(stmt) \
+ { \
+ CUresult res = stmt; \
+ if (res != CUDA_SUCCESS) { \
+ const char *name; \
+ cuGetErrorName(res, &name); \
+ set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
+ return false; \
+ } \
+ } \
+ (void)0
+
+# define check_result_optix(stmt) \
+ { \
+ enum OptixResult res = stmt; \
+ if (res != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(res); \
+ set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
+ return; \
+ } \
+ } \
+ (void)0
+# define check_result_optix_ret(stmt) \
+ { \
+ enum OptixResult res = stmt; \
+ if (res != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(res); \
+ set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
+ return false; \
+ } \
+ } \
+ (void)0
+
+class OptiXDevice : public Device {
+
+ // List of OptiX program groups
+ enum {
+ PG_RGEN,
+ PG_MISS,
+ PG_HITD, // Default hit group
+ PG_HITL, // __BVH_LOCAL__ hit group
+ PG_HITS, // __SHADOW_RECORD_ALL__ hit group
+# ifdef WITH_CYCLES_DEBUG
+ PG_EXCP,
+# endif
+ PG_BAKE, // kernel_bake_evaluate
+ PG_DISP, // kernel_displace_evaluate
+ PG_BACK, // kernel_background_evaluate
+ NUM_PROGRAM_GROUPS
+ };
+
+ // List of OptiX pipelines
+ enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
+
+ // A single shader binding table entry
+ struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+ };
+
+ // Information stored about CUDA memory allocations
+ struct CUDAMem {
+ bool free_map_host = false;
+ CUarray array = NULL;
+ CUtexObject texobject = 0;
+ void *map_host_pointer = nullptr;
+ };
+
+ // Helper class to manage current CUDA context
+ struct CUDAContextScope {
+ CUDAContextScope(CUcontext ctx)
+ {
+ cuCtxPushCurrent(ctx);
+ }
+ ~CUDAContextScope()
+ {
+ cuCtxPopCurrent(NULL);
+ }
+ };
+
+ // Use a pool with multiple threads to support launches with multiple CUDA streams
+ TaskPool task_pool;
+
+ // CUDA/OptiX context handles
+ CUdevice cuda_device = 0;
+ CUcontext cuda_context = NULL;
+ vector<CUstream> cuda_stream;
+ OptixDeviceContext context = NULL;
+
+ // Need CUDA kernel module for some utility functions
+ CUmodule cuda_module = NULL;
+ CUmodule cuda_filter_module = NULL;
+ // All necessary OptiX kernels are in one module
+ OptixModule optix_module = NULL;
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool need_texture_info = false;
+ device_vector<SbtRecord> sbt_data;
+ device_vector<TextureInfo> texture_info;
+ device_only_memory<KernelParams> launch_params;
+ vector<device_only_memory<uint8_t>> blas;
+ OptixTraversableHandle tlas_handle = 0;
+
+ map<device_memory *, CUDAMem> cuda_mem_map;
+
+ public:
+ OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
+ : Device(info_, stats_, profiler_, background_),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ texture_info(this, "__texture_info", MEM_TEXTURE),
+ launch_params(this, "__params")
+ {
+ // Store number of CUDA streams in device info
+ info.cpu_threads = DebugFlags().optix.cuda_streams;
+
+ // Initialize CUDA driver API
+ check_result_cuda(cuInit(0));
+
+ // Retrieve the primary CUDA context for this device
+ check_result_cuda(cuDeviceGet(&cuda_device, info.num));
+ check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device));
+
+ // Make that CUDA context current
+ const CUDAContextScope scope(cuda_context);
+
+ // Create OptiX context for this device
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
+ options.logCallbackFunction =
+ [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ check_result_optix(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ // Create launch streams
+ cuda_stream.resize(info.cpu_threads);
+ for (int i = 0; i < info.cpu_threads; ++i)
+ check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
+
+ // Fix weird compiler bug that assigns wrong size
+ launch_params.data_elements = sizeof(KernelParams);
+ // Allocate launch parameter buffer memory on device
+ launch_params.alloc_to_device(info.cpu_threads);
+ }
+ ~OptiXDevice()
+ {
+ // Stop processing any more tasks
+ task_pool.stop();
+
+ // Clean up all memory before destroying context
+ blas.clear();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ // Make CUDA context current
+ const CUDAContextScope scope(cuda_context);
+
+ // Unload modules
+ if (cuda_module != NULL)
+ cuModuleUnload(cuda_module);
+ if (cuda_filter_module != NULL)
+ cuModuleUnload(cuda_filter_module);
+ if (optix_module != NULL)
+ optixModuleDestroy(optix_module);
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
+ if (pipelines[i] != NULL)
+ optixPipelineDestroy(pipelines[i]);
+
+ // Destroy launch streams
+ for (int i = 0; i < info.cpu_threads; ++i)
+ cuStreamDestroy(cuda_stream[i]);
+
+ // Destroy OptiX and CUDA context
+ optixDeviceContextDestroy(context);
+ cuDevicePrimaryCtxRelease(cuda_device);
+ }
+
+ private:
+ bool show_samples() const override
+ {
+ // Only show samples if not rendering multiple tiles in parallel
+ return info.cpu_threads == 1;
+ }
+
+ BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ // OptiX has its own internal acceleration structure format
+ return BVH_LAYOUT_OPTIX;
+ }
+
+ bool load_kernels(const DeviceRequestedFeatures &requested_features) override
+ {
+ if (have_error())
+ return false; // Abort early if context creation failed already
+
+ // Disable baking for now, since its kernel is not well-suited for inlining and is very slow
+ if (requested_features.use_baking) {
+ set_error("OptiX implementation does not support baking yet");
+ return false;
+ }
+ // Disable shader raytracing support for now, since continuation callables are slow
+ if (requested_features.use_shader_raytrace) {
+ set_error("OptiX implementation does not support shader raytracing yet");
+ return false;
+ }
+
+ const CUDAContextScope scope(cuda_context);
+
+ // Unload any existing modules first
+ if (cuda_module != NULL)
+ cuModuleUnload(cuda_module);
+ if (cuda_filter_module != NULL)
+ cuModuleUnload(cuda_filter_module);
+ if (optix_module != NULL)
+ optixModuleDestroy(optix_module);
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
+ if (pipelines[i] != NULL)
+ optixPipelineDestroy(pipelines[i]);
+
+ OptixModuleCompileOptions module_options;
+ module_options.maxRegisterCount = 0; // Do not set an explicit register limit
+# ifdef WITH_CYCLES_DEBUG
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+# else
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+# endif
+ OptixPipelineCompileOptions pipeline_options;
+ // Default to no motion blur and two-level graph, since it is the fastest option
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; // u, v
+# ifdef WITH_CYCLES_DEBUG
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW |
+ OPTIX_EXCEPTION_FLAG_TRACE_DEPTH;
+# else
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+# endif
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
+
+ if (requested_features.use_object_motion) {
+ pipeline_options.usesMotionBlur = true;
+ // Motion blur can insert motion transforms into the traversal graph
+ // It is no longer a two-level graph then, so need to set flags to allow any configuration
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { // Load and compile PTX module with OptiX kernels
+ string ptx_data;
+ const string ptx_filename = "lib/kernel_optix.ptx";
+ if (!path_read_text(path_get(ptx_filename), ptx_data)) {
+ set_error("Failed loading OptiX kernel " + ptx_filename + ".");
+ return false;
+ }
+
+ check_result_optix_ret(optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module));
+ }
+
+ { // Load CUDA modules because we need some of the utility kernels
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num);
+
+ string cubin_data;
+ const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
+ if (!path_read_text(path_get(cubin_filename), cubin_data)) {
+ set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + ".");
+ return false;
+ }
+
+ check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data()));
+
+ if (requested_features.use_denoising) {
+ string filter_data;
+ const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor);
+ if (!path_read_text(path_get(filter_filename), filter_data)) {
+ set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + ".");
+ return false;
+ }
+
+ check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data()));
+ }
+ }
+
+ // Create program groups
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; // There are no options currently
+ group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN].raygen.module = optix_module;
+ // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
+ group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (requested_features.use_hair) {
+ // Add curve intersection programs
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve";
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve";
+ }
+
+ if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
+ // Add hit group for local intersections
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+# ifdef WITH_CYCLES_DEBUG
+ group_descs[PG_EXCP].kind = OPTIX_PROGRAM_GROUP_KIND_EXCEPTION;
+ group_descs[PG_EXCP].exception.module = optix_module;
+ group_descs[PG_EXCP].exception.entryFunctionName = "__exception__kernel_optix_exception";
+# endif
+
+ if (requested_features.use_baking) {
+ group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_BAKE].raygen.module = optix_module;
+ group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
+ }
+
+ if (requested_features.use_true_displacement) {
+ group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_DISP].raygen.module = optix_module;
+ group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
+ }
+
+ if (requested_features.use_background_light) {
+ group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_BACK].raygen.module = optix_module;
+ group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
+ }
+
+ check_result_optix_ret(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ // Get program stack sizes
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ // Set up SBT, which in this case is used only to select between different programs
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); // Upload SBT to device
+
+ // Calculate maximum trace continuation stack size
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ // This is based on the maximum of closest-hit and any-hit/intersection programs
+ trace_css = max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+
+ OptixPipelineLinkOptions link_options;
+ link_options.maxTraceDepth = 1;
+# ifdef WITH_CYCLES_DEBUG
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+# else
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+# endif
+ link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;
+
+ { // Create path tracing pipeline
+ OptixProgramGroup pipeline_groups[] = {
+ groups[PG_RGEN],
+ groups[PG_MISS],
+ groups[PG_HITD],
+ groups[PG_HITS],
+ groups[PG_HITL],
+# ifdef WITH_CYCLES_DEBUG
+ groups[PG_EXCP],
+# endif
+ };
+ check_result_optix_ret(
+ optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups,
+ (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
+ nullptr,
+ 0,
+ &pipelines[PIP_PATH_TRACE]));
+
+ // Combine ray generation and trace continuation stack size
+ const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
+
+ // Set stack size depending on pipeline options
+ check_result_optix_ret(optixPipelineSetStackSize(
+ pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+ }
+
+ // Only need to create shader evaluation pipeline if one of these features is used:
+ const bool use_shader_eval_pipeline = requested_features.use_baking ||
+ requested_features.use_background_light ||
+ requested_features.use_true_displacement;
+
+ if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
+ OptixProgramGroup pipeline_groups[] = {
+ groups[PG_BAKE],
+ groups[PG_DISP],
+ groups[PG_BACK],
+ groups[PG_MISS],
+ groups[PG_HITD],
+ groups[PG_HITS],
+ groups[PG_HITL],
+# ifdef WITH_CYCLES_DEBUG
+ groups[PG_EXCP],
+# endif
+ };
+ check_result_optix_ret(
+ optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups,
+ (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADER_EVAL]));
+
+ // Calculate continuation stack size based on the maximum of all ray generation stack sizes
+ const unsigned int css = max(stack_size[PG_BAKE].cssRG,
+ max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) +
+ link_options.maxTraceDepth * trace_css;
+
+ check_result_optix_ret(optixPipelineSetStackSize(
+ pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+ }
+
+ // Clean up program group objects
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+ }
+
+ void thread_run(DeviceTask &task, int thread_index) // Main task entry point
+ {
+ if (have_error())
+ return; // Abort early if there was an error previously
+
+ if (task.type == DeviceTask::RENDER) {
+ RenderTile tile;
+ while (task.acquire_tile(this, tile)) {
+ if (tile.task == RenderTile::PATH_TRACE)
+ launch_render(task, tile, thread_index);
+ else if (tile.task == RenderTile::DENOISE)
+ launch_denoise(task, tile, thread_index);
+ task.release_tile(tile);
+ if (task.get_cancel() && !task.need_finish_queue)
+ break; // User requested cancellation
+ else if (have_error())
+ break; // Abort rendering when encountering an error
+ }
+ }
+ else if (task.type == DeviceTask::SHADER) {
+ launch_shader_eval(task, thread_index);
+ }
+ else if (task.type == DeviceTask::FILM_CONVERT) {
+ launch_film_convert(task, thread_index);
+ }
+ }
+
+ void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
+ {
+ assert(thread_index < launch_params.data_size);
+
+ // Keep track of total render time of this tile
+ const scoped_timer timer(&rtile.buffers->render_time);
+
+ WorkTile wtile;
+ wtile.x = rtile.x;
+ wtile.y = rtile.y;
+ wtile.w = rtile.w;
+ wtile.h = rtile.h;
+ wtile.offset = rtile.offset;
+ wtile.stride = rtile.stride;
+ wtile.buffer = (float *)rtile.buffer;
+
+ const int end_sample = rtile.start_sample + rtile.num_samples;
+ // Keep this number reasonable to avoid running into TDRs
+ const int step_samples = (info.display_device ? 8 : 32);
+ // Offset into launch params buffer so that streams use separate data
+ device_ptr launch_params_ptr = launch_params.device_pointer +
+ thread_index * launch_params.data_elements;
+
+ const CUDAContextScope scope(cuda_context);
+
+ for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
+ // Copy work tile information to device
+ wtile.num_samples = min(step_samples, end_sample - sample);
+ wtile.start_sample = sample;
+ check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile),
+ &wtile,
+ sizeof(wtile),
+ cuda_stream[thread_index]));
+
+ OptixShaderBindingTable sbt_params = {};
+ sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
+# ifdef WITH_CYCLES_DEBUG
+ sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
+# endif
+ sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = 1;
+ sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS
+
+ // Launch the ray generation program
+ check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
+ cuda_stream[thread_index],
+ launch_params_ptr,
+ launch_params.data_elements,
+ &sbt_params,
+ // Launch with samples close to each other for better locality
+ wtile.w * wtile.num_samples,
+ wtile.h,
+ 1));
+
+ // Wait for launch to finish
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ // Update current sample, so it is displayed correctly
+ rtile.sample = wtile.start_sample + wtile.num_samples;
+ // Update task progress after the kernel completed rendering
+ task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
+
+ if (task.get_cancel() && !task.need_finish_queue)
+ return; // Cancel rendering
+ }
+ }
+
+ void launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index)
+ {
+ const CUDAContextScope scope(cuda_context);
+
+ // Run CUDA denoising kernels
+ DenoisingTask denoising(this, task);
+ denoising.functions.construct_transform = function_bind(
+ &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
+ denoising.functions.accumulate = function_bind(
+ &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
+ denoising.functions.solve = function_bind(
+ &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
+ denoising.functions.divide_shadow = function_bind(
+ &OptiXDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising, thread_index);
+ denoising.functions.non_local_means = function_bind(
+ &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
+ denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
+ this,
+ _1,
+ _2,
+ _3,
+ _4,
+ _5,
+ _6,
+ &denoising,
+ thread_index);
+ denoising.functions.get_feature = function_bind(
+ &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
+ denoising.functions.write_feature = function_bind(
+ &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
+ denoising.functions.detect_outliers = function_bind(
+ &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample = rtile.start_sample + rtile.num_samples;
+ denoising.buffer.gpu_temporary_mem = true;
+
+ denoising.run_denoising(&rtile);
+
+ task.update_progress(&rtile, rtile.w * rtile.h);
+ }
+
+ void launch_shader_eval(DeviceTask &task, int thread_index)
+ {
+ unsigned int rgen_index = PG_BACK;
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE)
+ rgen_index = PG_BAKE;
+ if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
+ rgen_index = PG_DISP;
+
+ const CUDAContextScope scope(cuda_context);
+
+ device_ptr launch_params_ptr = launch_params.device_pointer +
+ thread_index * launch_params.data_elements;
+
+ for (int sample = 0; sample < task.num_samples; ++sample) {
+ ShaderParams params;
+ params.input = (uint4 *)task.shader_input;
+ params.output = (float4 *)task.shader_output;
+ params.type = task.shader_eval_type;
+ params.filter = task.shader_filter;
+ params.sx = task.shader_x;
+ params.offset = task.offset;
+ params.sample = sample;
+
+ check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
+ &params,
+ sizeof(params),
+ cuda_stream[thread_index]));
+
+ OptixShaderBindingTable sbt_params = {};
+ sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
+# ifdef WITH_CYCLES_DEBUG
+ sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
+# endif
+ sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = 1;
+ sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS
+
+ check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
+ cuda_stream[thread_index],
+ launch_params_ptr,
+ launch_params.data_elements,
+ &sbt_params,
+ task.shader_w,
+ 1,
+ 1));
+
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ task.update_progress(NULL);
+ }
+ }
+
+ void launch_film_convert(DeviceTask &task, int thread_index)
+ {
+ const CUDAContextScope scope(cuda_context);
+
+ CUfunction film_convert_func;
+ check_result_cuda(cuModuleGetFunction(&film_convert_func,
+ cuda_module,
+ task.rgba_byte ? "kernel_cuda_convert_to_byte" :
+ "kernel_cuda_convert_to_half_float"));
+
+ float sample_scale = 1.0f / (task.sample + 1);
+ CUdeviceptr rgba = (task.rgba_byte ? task.rgba_byte : task.rgba_half);
+
+ void *args[] = {&rgba,
+ &task.buffer,
+ &sample_scale,
+ &task.x,
+ &task.y,
+ &task.w,
+ &task.h,
+ &task.offset,
+ &task.stride};
+
+ int threads_per_block;
+ check_result_cuda(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func));
+
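+ // Use a square 2D block sized from the kernel's maximum threads per block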
+ const int num_threads_x = (int)sqrt(threads_per_block);
+ const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x;
+ const int num_threads_y = (int)sqrt(threads_per_block);
+ const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y;
+
+ check_result_cuda(cuLaunchKernel(film_convert_func,
+ num_blocks_x,
+ num_blocks_y,
+ 1, /* blocks */
+ num_threads_x,
+ num_threads_y,
+ 1, /* threads */
+ 0,
+ cuda_stream[thread_index],
+ args,
+ 0));
+
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ task.update_progress(NULL);
+ }
+
+ bool build_optix_bvh(const OptixBuildInput &build_input,
+ uint16_t num_motion_steps,
+ device_memory &out_data,
+ OptixTraversableHandle &out_handle)
+ {
+ out_handle = 0;
+
+ const CUDAContextScope scope(cuda_context);
+
+ // Compute memory usage
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options;
+ options.operation = OPTIX_BUILD_OPERATION_BUILD;
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ check_result_optix_ret(
+ optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ // Allocate required output buffers
+ device_only_memory<char> temp_mem(this, "temp_build_mem");
+ temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+
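+ // The acceleration structure output is opaque data, so treat it as untyped bytes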
+ out_data.data_type = TYPE_UNKNOWN;
+ out_data.data_elements = 1;
+ out_data.data_size = sizes.outputSizeInBytes;
+ mem_alloc(out_data);
+
+ // Finally build the acceleration structure
+ check_result_optix_ret(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ NULL,
+ 0));
+
+ // Wait for all operations to finish
+ check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+ return true;
+ }
+
+ bool build_optix_bvh(BVH *bvh, device_memory &out_data) override
+ {
+ assert(bvh->params.top_level);
+
+ unsigned int num_instances = 0;
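+ // Map each mesh to the BLAS handles built for it, so instanced meshes share a single BLAS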
+ unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
+
+ // Clear all previous AS
+ blas.clear();
+
+ // Build bottom level acceleration structures (BLAS)
+ // Note: Always keep this logic in sync with bvh_optix.cpp!
+ for (Object *ob : bvh->objects) {
+ // Skip meshes for which acceleration structure already exists
+ if (meshes.find(ob->mesh) != meshes.end())
+ continue;
+
+ Mesh *const mesh = ob->mesh;
+ vector<OptixTraversableHandle> handles;
+
+ // Build BLAS for curve primitives
+ if (bvh->params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
+ const size_t num_curves = mesh->num_curves();
+ const size_t num_segments = mesh->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (mesh->use_motion_blur && motion_keys) {
+ num_motion_steps = mesh->motion_steps;
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
+ aabb_data.alloc(num_segments * num_motion_steps);
+
+ // Get AABBs for each motion step
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *keys = mesh->curve_keys.data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ // The center step for motion vertices is not stored in the attribute
+ if (step != center_step) {
+ keys = motion_keys->data_float3() +
+ (step > center_step ? step - 1 : step) * num_segments;
+ }
+
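+ // Compute one axis-aligned bounding box per curve segment for this motion step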
+ for (size_t i = step * num_segments, j = 0; j < num_curves; ++j) {
+ const Mesh::Curve c = mesh->get_curve(j);
+ for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
+ BoundBox bounds = BoundBox::empty;
+ c.bounds_grow(k, keys, mesh->curve_radius.data(), bounds);
+ aabb_data[i].minX = bounds.min.x;
+ aabb_data[i].minY = bounds.min.y;
+ aabb_data[i].minZ = bounds.min.z;
+ aabb_data[i].maxX = bounds.max.x;
+ aabb_data[i].maxY = bounds.max.y;
+ aabb_data[i].maxZ = bounds.max.z;
+ }
+ }
+ }
+
+ // Upload AABB data to GPU
+ aabb_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ }
+
+ // Disable visibility test anyhit program, since it is already checked during intersection
+ // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.aabbArray.numPrimitives = num_segments;
+ build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+ build_input.aabbArray.flags = &build_flags;
+ build_input.aabbArray.numSbtRecords = 1;
+ build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
+
+ // Allocate memory for new BLAS and build it
+ blas.emplace_back(this, "blas");
+ handles.emplace_back();
+ if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back()))
+ return false;
+ }
+
+ // Build BLAS for triangle primitives
+ if (bvh->params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
+ const size_t num_verts = mesh->verts.size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (mesh->use_motion_blur && motion_keys) {
+ num_motion_steps = mesh->motion_steps;
+ }
+
+ device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+ index_data.alloc(mesh->triangles.size());
+ memcpy(index_data.data(), mesh->triangles.data(), mesh->triangles.size() * sizeof(int));
+ device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->verts.data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ // The center step for motion vertices is not stored in the attribute
+ if (step != center_step) {
+ verts = motion_keys->data_float3() +
+ (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ // Upload triangle data to GPU
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ // No special build flags for triangle primitives
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
+
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ // The SBT does not store per-primitive data, since Cycles already allocates separate
+ // buffers for that purpose. OptiX does not allow the record count to be zero though, so
+ // just pass in one and rely on that having the same meaning in this case.
+ build_input.triangleArray.numSbtRecords = 1;
+ // Triangle primitives are packed right after the curve primitives of this mesh
+ build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
+
+ // Allocate memory for new BLAS and build it
+ blas.emplace_back(this, "blas");
+ handles.emplace_back();
+ if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back()))
+ return false;
+ }
+
+ meshes.insert({mesh, handles});
+ }
+
+ // Fill instance descriptions
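+ // Each object may reference up to two BLAS handles (curves and triangles), hence the factor of two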
+ device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
+ aabbs.alloc(bvh->objects.size() * 2);
+ device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size() * 2);
+
+ for (Object *ob : bvh->objects) {
+ // Skip non-traceable objects
+ if (!ob->is_traceable())
+ continue;
+ // Create separate instance for triangle/curve meshes of an object
+ for (OptixTraversableHandle handle : meshes[ob->mesh]) {
+ OptixAabb &aabb = aabbs[num_instances];
+ aabb.minX = ob->bounds.min.x;
+ aabb.minY = ob->bounds.min.y;
+ aabb.minZ = ob->bounds.min.z;
+ aabb.maxX = ob->bounds.max.x;
+ aabb.maxY = ob->bounds.max.y;
+ aabb.maxZ = ob->bounds.max.z;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ // Clear transform to identity matrix
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ // Set user instance ID to object index
+ instance.instanceId = ob->get_device_index();
+
+ // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+ // See 'scene_intersect_volume' in bvh.h
+ instance.visibilityMask = (ob->mesh->has_volume ? 3 : 1);
+
+ // Insert motion traversable if object has motion
+ if (ob->use_motion()) {
+ blas.emplace_back(this, "motion_transform");
+ device_only_memory<uint8_t> &motion_transform_gpu = blas.back();
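+ // OptixSRTMotionTransform has inline storage for two motion keys, so only allocate extra
+ // space for any additional keys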
+ motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
+ (max(ob->motion.size(), 2) - 2) *
+ sizeof(OptixSRTData));
+
+ // Allocate host side memory for motion transform and fill it with transform data
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->motion.size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->motion.size());
+ transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
+
+ for (size_t i = 0; i < ob->motion.size(); ++i) {
+ // scaling
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+ srt_data[i].sx = decomp[i].y.w; // scale.x.x
+ srt_data[i].sy = decomp[i].z.w; // scale.y.y
+ srt_data[i].sz = decomp[i].w.w; // scale.z.z
+ srt_data[i].pvx = 0;
+ srt_data[i].pvy = 0;
+ srt_data[i].pvz = 0;
+ // rotation
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+ // transform
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ // Upload motion transform to GPU
+ mem_copy_to(motion_transform_gpu);
+ delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer);
+ motion_transform_gpu.host_pointer = 0;
+
+ // Disable instance transform if object uses motion transform already
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ // Get traversable handle to motion transform
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu.device_pointer,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->mesh->is_instanced()) {
+ // Set transform matrix
+ memcpy(instance.transform, &ob->tfm, sizeof(instance.transform));
+ }
+ else {
+ // Disable instance transform if mesh already has it applied to vertex data
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Non-instanced objects read their ID from prim_object, so
+ // distinguish them from instanced objects by setting the high bit
+ instance.instanceId |= 0x800000;
+ }
+ }
+ }
+ }
+
+ // Upload instance descriptions
+ aabbs.resize(num_instances);
+ aabbs.copy_to_device();
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ // Build top-level acceleration structure
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+ build_input.instanceArray.aabbs = aabbs.device_pointer;
+ build_input.instanceArray.numAabbs = num_instances;
+
+ return build_optix_bvh(build_input, 0 /* TLAS has no motion itself */, out_data, tlas_handle);
+ }
+
+ void update_texture_info()
+ {
+ if (need_texture_info) {
+ texture_info.copy_to_device();
+ need_texture_info = false;
+ }
+ }
+
+ void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
+ {
+ const CUDAContextScope scope(cuda_context);
+
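+ // Every thread has its own copy of the launch parameters, so update each copy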
+ for (int i = 0; i < info.cpu_threads; ++i)
+ check_result_cuda(
+ cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
+ data,
+ data_size));
+
+ // Set constant memory for CUDA module
+ // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert').
+ // Could be removed by moving those functions to filter CUDA module.
+ size_t bytes = 0;
+ CUdeviceptr mem = 0;
+ check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name));
+ assert(mem != NULL && bytes == data_size);
+ check_result_cuda(cuMemcpyHtoD(mem, data, data_size));
+ }
+
+ void mem_alloc(device_memory &mem) override
+ {
+ const CUDAContextScope scope(cuda_context);
+
+ mem.device_size = mem.memory_size();
+
+ if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
+ CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information
+
+ CUDA_TEXTURE_DESC tex_desc = {};
+ tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+ CUDA_RESOURCE_DESC res_desc = {};
+
+ switch (mem.extension) {
+ default:
+ assert(0);
+ case EXTENSION_REPEAT:
+ tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
+ CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
+ CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
+ CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ }
+
+ switch (mem.interpolation) {
+ default: // Default to linear for unsupported interpolation types
+ case INTERPOLATION_LINEAR:
+ tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+ break;
+ case INTERPOLATION_CLOSEST:
+ tex_desc.filterMode = CU_TR_FILTER_MODE_POINT;
+ break;
+ }
+
+ CUarray_format format;
+ switch (mem.data_type) {
+ default:
+ assert(0);
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ }
+
+ if (mem.data_depth > 1) { /* 3D texture using array. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ check_result_cuda(cuArray3DCreate(&cmem.array, &desc));
+ mem.device_pointer = (device_ptr)cmem.array;
+
+ res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
+ res_desc.res.array.hArray = cmem.array;
+ }
+ else if (mem.data_height > 0) { /* 2D texture using array. */
+ CUDA_ARRAY_DESCRIPTOR desc;
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+
+ check_result_cuda(cuArrayCreate(&cmem.array, &desc));
+ mem.device_pointer = (device_ptr)cmem.array;
+
+ res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
+ res_desc.res.array.hArray = cmem.array;
+ }
+ else {
+ check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size));
+
+ res_desc.resType = CU_RESOURCE_TYPE_LINEAR;
+ res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer;
+ res_desc.res.linear.format = format;
+ res_desc.res.linear.numChannels = mem.data_elements;
+ res_desc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL));
+
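+ // Image texture names end in the flat slot index, so parse it from the suffix after the
+ // last underscore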
+ int flat_slot = 0;
+ if (string_startswith(mem.name, "__tex_image")) {
+ flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1);
+ }
+
+ if (flat_slot >= texture_info.size())
+ texture_info.resize(flat_slot + 128);
+
+ TextureInfo &info = texture_info[flat_slot];
+ info.data = (uint64_t)cmem.texobject;
+ info.cl_buffer = 0;
+ info.interpolation = mem.interpolation;
+ info.extension = mem.extension;
+ info.width = mem.data_width;
+ info.height = mem.data_height;
+ info.depth = mem.data_depth;
+
+ // Texture information has changed and needs an update; delay this until the next launch
+ need_texture_info = true;
+ }
+ else {
+ // This is not a texture but simple linear memory
+ check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size));
+
+ // Update data storage pointers in launch parameters
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(mem.name, #tex_name) == 0) \
+ update_launch_params( \
+ mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr));
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+ }
+
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem) override
+ {
+ if (!mem.host_pointer || mem.host_pointer == mem.shared_pointer)
+ return;
+ if (!mem.device_pointer)
+ mem_alloc(mem); // Need to allocate memory first if it does not exist yet
+
+ const CUDAContextScope scope(cuda_context);
+
+ if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
+ const CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information
+
+ size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
+
+ if (mem.data_depth > 1) {
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = cmem.array;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ check_result_cuda(cuMemcpy3D(&param));
+ }
+ else if (mem.data_height > 0) {
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = cmem.array;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ check_result_cuda(cuMemcpy2D(&param));
+ }
+ else {
+ check_result_cuda(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size));
+ }
+ }
+ else {
+ // This is not a texture but simple linear memory
+ check_result_cuda(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size));
+ }
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ // Calculate linear memory offset and size
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.host_pointer && mem.device_pointer) {
+ const CUDAContextScope scope(cuda_context);
+ check_result_cuda(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else if (mem.host_pointer) {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+
+ void mem_zero(device_memory &mem) override
+ {
+ if (mem.host_pointer)
+ memset(mem.host_pointer, 0, mem.memory_size());
+ if (mem.host_pointer && mem.host_pointer == mem.shared_pointer)
+ return; // This is shared host memory, so no device memory to update
+
+ if (!mem.device_pointer)
+ mem_alloc(mem); // Need to allocate memory first if it does not exist yet
+
+ const CUDAContextScope scope(cuda_context);
+ check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+
+ void mem_free(device_memory &mem) override
+ {
+ assert(mem.device_pointer);
+
+ const CUDAContextScope scope(cuda_context);
+
+ if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
+ CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information
+
+ if (cmem.array)
+ cuArrayDestroy(cmem.array);
+ else
+ cuMemFree((CUdeviceptr)mem.device_pointer);
+
+ if (cmem.texobject)
+ cuTexObjectDestroy(cmem.texobject);
+ }
+ else {
+ // This is not a texture but simple linear memory
+ cuMemFree((CUdeviceptr)mem.device_pointer);
+ }
+
+ stats.mem_free(mem.device_size);
+
+ mem.device_size = 0;
+ mem.device_pointer = 0;
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Fix traversable handle on multi devices
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(name, offsetof(KernelParams, data), host, size);
+ }
+ }
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
+ {
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
+ void task_add(DeviceTask &task) override
+ {
+ // Upload texture information to device if it has changed since last launch
+ update_texture_info();
+
+ // Split task into smaller ones
+ list<DeviceTask> tasks;
+ task.split(tasks, info.cpu_threads);
+
+ // Queue tasks in internal task pool
+ struct OptiXDeviceTask : public DeviceTask {
+ OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
+ {
+ // Use the task index parameter instead of the thread index, since the number of CUDA
+ // streams may differ from the number of threads
+ run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
+ }
+ };
+
+ int task_index = 0;
+ for (DeviceTask &task : tasks)
+ task_pool.push(new OptiXDeviceTask(this, task, task_index++));
+ }
+
+ void task_wait() override
+ {
+ // Wait for all queued tasks to finish
+ task_pool.wait_work();
+ }
+
+ void task_cancel() override
+ {
+ // Cancel any remaining tasks in the internal pool
+ task_pool.cancel();
+ }
+
+# define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads; \
+ check_result_cuda_ret( \
+ cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ threads = (int)sqrt((float)threads); \
+ int xblocks = ((w) + threads - 1) / threads; \
+ int yblocks = ((h) + threads - 1) / threads;
+
+# define CUDA_LAUNCH_KERNEL(func, args) \
+ check_result_cuda_ret(cuLaunchKernel( \
+ func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));
+
+ /* Similar to the above, but for 1-dimensional blocks. */
+# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+ int threads; \
+ check_result_cuda_ret( \
+ cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int xblocks = ((w) + threads - 1) / threads; \
+ int yblocks = h;
+
+# define CUDA_LAUNCH_KERNEL_1D(func, args) \
+ check_result_cuda_ret(cuLaunchKernel( \
+ func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));
+
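+ // The following functions wrap the CUDA filter kernels, so that DenoisingTask can run
+ // them on this device's CUDA streams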
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int pass_stride = task->buffer.pass_stride;
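+ // The NLM search window covers (2r+1) x (2r+1) pixel shifts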
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+ int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
+
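+ // Carve the preallocated temporary buffer into difference, blurred difference and
+ // weight accumulation regions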
+ CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr scale_ptr = 0;
+
+ check_result_cuda_ret(
+ cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
+ check_result_cuda_ret(
+ cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
+
+ {
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
+ check_result_cuda_ret(
+ cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output"));
+
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
+
+ void *calc_difference_args[] = {&guide_ptr,
+ &variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &channel_offset,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {
+ &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference,
+ &image_ptr,
+ &out_ptr,
+ &weightAccum,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &channel_offset,
+ &r,
+ &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+ }
+
+ {
+ CUfunction cuNLMNormalize;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+ CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+ }
+
+ return !have_error();
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task, int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFilterConstructTransform;
+ check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform,
+ cuda_filter_module,
+ "kernel_cuda_filter_construct_transform"));
+ check_result_cuda_ret(
+ cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ check_result_cuda_ret(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = task->nlm_k_2;
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+ CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
+ check_result_cuda_ret(
+ cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ check_result_cuda_ret(
+ cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w *
+ task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &pass_stride,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {
+ &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ check_result_cuda_ret(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFinalize;
+ check_result_cuda_ret(
+ cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(
+ cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ return !have_error();
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFilterCombineHalves;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFilterDivideShadow;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFilterGetFeature;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &scale,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ return !have_error();
+ }
+
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFilterWriteFeature;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ return !have_error();
+ }
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task,
+ int thread_index)
+ {
+ if (have_error())
+ return false;
+
+ CUfunction cuFilterDetectOutliers;
+ check_result_cuda_ret(cuModuleGetFunction(
+ &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers"));
+ check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&image_ptr,
+ &variance_ptr,
+ &depth_ptr,
+ &output_ptr,
+ &task->rect,
+ &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ return !have_error();
+ }
+};
+
+bool device_optix_init()
+{
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
+ return true; // Already initialized function table
+
+ // Need to initialize CUDA as well
+ if (!device_cuda_init())
+ return false;
+
+# ifdef WITH_CUDA_DYNLOAD
+ // Load NVRTC function pointers for adaptive kernel compilation
+ if (DebugFlags().cuda.adaptive_compile && cuewInit(CUEW_INIT_NVRTC) != CUEW_SUCCESS) {
+ VLOG(1)
+ << "CUEW initialization failed for NVRTC. Adaptive kernel compilation won't be available.";
+ }
+# endif
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1)
+ << "OptiX initialization failed because the installed driver does not support ABI version "
+ << OPTIX_ABI_VERSION;
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ // Loaded OptiX successfully!
+ return true;
+}
+
+void device_optix_info(vector<DeviceInfo> &devices)
+{
+ // Simply add all supported CUDA devices as OptiX devices again
+ vector<DeviceInfo> cuda_devices;
+ device_cuda_info(cuda_devices);
+
+ for (auto it = cuda_devices.begin(); it != cuda_devices.end();) {
+ DeviceInfo &info = *it;
+ assert(info.type == DEVICE_CUDA);
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+
+ // Figure out RTX support
+ CUdevice cuda_device = 0;
+ CUcontext cuda_context = NULL;
+ unsigned int rtcore_version = 0;
+ if (cuDeviceGet(&cuda_device, info.num) == CUDA_SUCCESS &&
+ cuDevicePrimaryCtxRetain(&cuda_context, cuda_device) == CUDA_SUCCESS) {
+ OptixDeviceContext optix_context = NULL;
+ if (optixDeviceContextCreate(cuda_context, nullptr, &optix_context) == OPTIX_SUCCESS) {
+ optixDeviceContextGetProperty(optix_context,
+ OPTIX_DEVICE_PROPERTY_RTCORE_VERSION,
+ &rtcore_version,
+ sizeof(rtcore_version));
+ optixDeviceContextDestroy(optix_context);
+ }
+ cuDevicePrimaryCtxRelease(cuda_device);
+ }
+
+ // Only add devices with RTX support
+ if (rtcore_version == 0)
+ it = cuda_devices.erase(it);
+ else
+ ++it;
+ }
+
+ devices.insert(devices.end(), cuda_devices.begin(), cuda_devices.end());
+}
+
+Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+{
+ return new OptiXDevice(info, stats, profiler, background);
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 41e57bb3e43..ea8aa197b6f 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -64,6 +64,10 @@ set(SRC_OPENCL_KERNELS
kernels/opencl/filter.cl
)
+set(SRC_OPTIX_KERNELS
+ kernels/optix/kernel_optix.cu
+)
+
set(SRC_BVH_HEADERS
bvh/bvh.h
bvh/bvh_nodes.h
@@ -95,6 +99,7 @@ set(SRC_HEADERS
kernel_color.h
kernel_compat_cpu.h
kernel_compat_cuda.h
+ kernel_compat_optix.h
kernel_compat_opencl.h
kernel_differential.h
kernel_emission.h
@@ -140,6 +145,9 @@ set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_cuda_image.h
)
+set(SRC_KERNELS_OPTIX_HEADERS
+)
+
set(SRC_KERNELS_OPENCL_HEADERS
kernels/opencl/kernel_split_function.h
kernels/opencl/kernel_opencl_image.h
@@ -168,7 +176,7 @@ set(SRC_CLOSURE_HEADERS
closure/volume.h
closure/bsdf_principled_diffuse.h
closure/bsdf_principled_sheen.h
- closure/bsdf_hair_principled.h
+ closure/bsdf_hair_principled.h
)
set(SRC_SVM_HEADERS
@@ -476,6 +484,53 @@ if(WITH_CYCLES_CUDA_BINARIES)
cycles_set_solution_folder(cycles_kernel_cuda)
endif()
+# OptiX PTX modules
+
+if(WITH_CYCLES_DEVICE_OPTIX)
+ foreach(input ${SRC_OPTIX_KERNELS})
+ get_filename_component(input_we ${input} NAME_WE)
+
+ set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx")
+ set(cuda_flags
+ -I "${OPTIX_INCLUDE_DIR}"
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
+ -arch=sm_30
+ --use_fast_math
+ -o ${output})
+
+ if(WITH_CYCLES_DEBUG)
+ set(cuda_flags ${cuda_flags}
+ -D __KERNEL_DEBUG__)
+ endif()
+
+ add_custom_command(
+ OUTPUT
+ ${output}
+ DEPENDS
+ ${input}
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_BVH_HEADERS}
+ ${SRC_SVM_HEADERS}
+ ${SRC_GEOM_HEADERS}
+ ${SRC_CLOSURE_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ COMMAND
+ ${CUDA_NVCC_EXECUTABLE} --ptx ${cuda_flags} ${input}
+ WORKING_DIRECTORY
+ "${CMAKE_CURRENT_SOURCE_DIR}")
+
+ list(APPEND optix_ptx ${output})
+
+ delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
+ endforeach()
+
+ add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
+ cycles_set_solution_folder(cycles_kernel_optix)
+endif()
+
# OSL module
if(WITH_CYCLES_OSL)
@@ -535,10 +590,12 @@ endif()
cycles_add_library(cycles_kernel "${LIB}"
${SRC_CPU_KERNELS}
${SRC_CUDA_KERNELS}
+ ${SRC_OPTIX_KERNELS}
${SRC_OPENCL_KERNELS}
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPTIX_HEADERS}
${SRC_KERNELS_OPENCL_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
@@ -548,9 +605,24 @@ cycles_add_library(cycles_kernel "${LIB}"
${SRC_SPLIT_HEADERS}
)
+source_group("bvh" FILES ${SRC_BVH_HEADERS})
+source_group("closure" FILES ${SRC_CLOSURE_HEADERS})
+source_group("filter" FILES ${SRC_FILTER_HEADERS})
+source_group("geom" FILES ${SRC_GEOM_HEADERS})
+source_group("kernel" FILES ${SRC_HEADERS})
+source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS})
+source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS})
+source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS})
+source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS})
+source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS})
+source_group("svm" FILES ${SRC_SVM_HEADERS})
+
if(WITH_CYCLES_CUDA)
add_dependencies(cycles_kernel cycles_kernel_cuda)
endif()
+if(WITH_CYCLES_DEVICE_OPTIX)
+ add_dependencies(cycles_kernel cycles_kernel_optix)
+endif()
# OpenCL kernel
@@ -564,9 +636,11 @@ endif()
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 9be078b6fca..cffe2bfa70a 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1139,9 +1139,9 @@ int Mesh::motion_step(float time) const
return -1;
}
-bool Mesh::need_build_bvh(BVHLayout) const
+bool Mesh::need_build_bvh(BVHLayout layout) const
{
- return !transform_applied || has_surface_bssrdf;
+ return !transform_applied || has_surface_bssrdf || layout == BVH_LAYOUT_OPTIX;
}
bool Mesh::is_instanced() const
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index e52fda26845..3ce65802cff 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -86,6 +86,16 @@ void DebugFlags::CUDA::reset()
split_kernel = false;
}
+DebugFlags::OptiX::OptiX()
+{
+ reset();
+}
+
+void DebugFlags::OptiX::reset()
+{
+ cuda_streams = 1;
+}
+
DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
{
reset();
@@ -130,6 +140,7 @@ void DebugFlags::reset()
viewport_static_bvh = false;
cpu.reset();
cuda.reset();
+ optix.reset();
opencl.reset();
}
@@ -145,7 +156,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
<< " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
os << "CUDA flags:\n"
- << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
+ << " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
+
+ os << "OptiX flags:\n"
+ << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n";
const char *opencl_device_type;
switch (debug_flags.opencl.device_type) {
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 5b0004ea768..cf6b442b878 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -99,6 +99,17 @@ class DebugFlags {
bool split_kernel;
};
+ /* Descriptor of OptiX feature-set to be used. */
+ struct OptiX {
+ OptiX();
+
+ /* Reset flags to their defaults. */
+ void reset();
+
+ /* Number of CUDA streams to launch kernels concurrently from. */
+ int cuda_streams;
+ };
+
/* Descriptor of OpenCL feature-set to be used. */
struct OpenCL {
OpenCL();
@@ -165,6 +176,9 @@ class DebugFlags {
/* Requested CUDA flags. */
CUDA cuda;
+ /* Requested OptiX flags. */
+ OptiX optix;
+
/* Requested OpenCL flags. */
OpenCL opencl;